author		Linus Torvalds <torvalds@linux-foundation.org>	2012-07-24 15:01:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-24 15:01:20 -0400
commit		5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (patch)
tree		d1fc25d9650d3ac24591bba6f5e2e7a1afc54796 /arch
parent		3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (diff)
parent		1a577b72475d161b6677c05abe57301362023bb2 (diff)
Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
"Highlights include
- full big real mode emulation on pre-Westmere Intel hosts (can be
disabled with emulate_invalid_guest_state=0)
- relatively small ppc and s390 updates
- PCID/INVPCID support in guests
- EOI avoidance; 3.6 guests should perform better on 3.6 hosts on
interrupt-intensive workloads
- Lockless write faults during live migration
- EPT accessed/dirty bits support for new Intel processors"
Fix up conflicts in:
- Documentation/virtual/kvm/api.txt:
Stupid subchapter numbering, added next to each other.
- arch/powerpc/kvm/booke_interrupts.S:
PPC asm changes clashing with the KVM fixes
- arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c:
Duplicated commits through the kvm tree and the s390 tree, with
subsequent edits in the KVM tree.
* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
KVM: fix race with level interrupts
x86, hyper: fix build with !CONFIG_KVM_GUEST
Revert "apic: fix kvm build on UP without IOAPIC"
KVM guest: switch to apic_set_eoi_write, apic_write
apic: add apic_set_eoi_write for PV use
KVM: VMX: Implement PCID/INVPCID for guests with EPT
KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
KVM: PPC: Critical interrupt emulation support
KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
KVM: PPC: bookehv64: Add support for std/ld emulation.
booke: Added crit/mc exception handler for e500v2
booke/bookehv: Add host crit-watchdog exception support
KVM: MMU: document mmu-lock and fast page fault
KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
KVM: MMU: trace fast page fault
KVM: MMU: fast path of handling guest page fault
KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
KVM: MMU: fold tlb flush judgement into mmu_spte_update
...
Diffstat (limited to 'arch')
55 files changed, 1594 insertions(+), 450 deletions(-)
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index b9f82c84f093..ec6c6b301238 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -26,6 +26,7 @@
 
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_DEVICE_ASSIGNMENT
 
 /* Architectural interrupt line count. */
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 9806e55f91be..df5351e3eed7 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
+	depends on BROKEN
 	depends on HAVE_KVM && MODULES && EXPERIMENTAL
 	# for device assignment:
 	depends on PCI
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index 976835d8f22e..bf2c06c33871 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -153,6 +153,8 @@
 #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5"
 #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4"
 
+extern bool epapr_paravirt_enabled;
+extern u32 epapr_hypercall_start[];
 
 /*
  * We use "uintptr_t" to define a register because it's guaranteed to be a
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 0554ab062bdc..e45c4947a772 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -34,6 +34,8 @@ extern void __replay_interrupt(unsigned int vector);
 
 extern void timer_interrupt(struct pt_regs *);
 extern void performance_monitor_exception(struct pt_regs *regs);
+extern void WatchdogException(struct pt_regs *regs);
+extern void unknown_exception(struct pt_regs *regs);
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index b0c08b142770..0dd1d86d3e31 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 #define SPAPR_TCE_SHIFT		12
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_NPTE	(HPT_NPTEG << 3)		/* 8 PTEs per PTEG */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+#define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
+extern int kvm_hpt_order;		/* order of preallocated HPTs */
 #endif
 
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d848cdc49715..50ea12fd7bf5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -237,6 +237,10 @@ struct kvm_arch {
 	unsigned long vrma_slb_v;
 	int rma_setup_done;
 	int using_mmu_notifiers;
+	u32 hpt_order;
+	atomic_t vcpus_running;
+	unsigned long hpt_npte;
+	unsigned long hpt_mask;
 	spinlock_t slot_phys_lock;
 	unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
 	int slot_npages[KVM_MEM_SLOTS_NUM];
@@ -414,7 +418,9 @@ struct kvm_vcpu_arch {
 	ulong mcsrr1;
 	ulong mcsr;
 	u32 dec;
+#ifdef CONFIG_BOOKE
 	u32 decar;
+#endif
 	u32 tbl;
 	u32 tbu;
 	u32 tcr;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f68c22fa2fce..0124937a23b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
-extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
+extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
 extern void kvmppc_free_hpt(struct kvm *kvm);
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 83afacd3ba7b..bb282dd81612 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
 endif
 
+obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 
 # Disable GCOV in odd or sensitive code
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
new file mode 100644
index 000000000000..697b390ebfd8
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+.global epapr_hypercall_start
+epapr_hypercall_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+	blr
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
new file mode 100644
index 000000000000..028aeae370b6
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,52 @@
+/*
+ * ePAPR para-virtualization support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/of.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+
+bool epapr_paravirt_enabled;
+
+static int __init epapr_paravirt_init(void)
+{
+	struct device_node *hyper_node;
+	const u32 *insts;
+	int len, i;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return -ENODEV;
+
+	insts = of_get_property(hyper_node, "hcall-instructions", &len);
+	if (!insts)
+		return -ENODEV;
+
+	if (len % 4 || len > (4 * 4))
+		return -ENODEV;
+
+	for (i = 0; i < (len / 4); i++)
+		patch_instruction(epapr_hypercall_start + i, insts[i]);
+
+	epapr_paravirt_enabled = true;
+
+	return 0;
+}
+
+early_initcall(epapr_paravirt_init);
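
The patchable entry point above deliberately starts out as "li r3, -1; blr", so callers get a clean failure until the "/hypervisor" node supplies real instructions. As a sketch of the calling convention only (ePAPR puts the hypercall number in r11 and the status in r3; this helper is illustrative and not part of the patch):

	static inline long epapr_hcall0(unsigned long nr)
	{
		register unsigned long r11 asm("r11") = nr;
		register long r3 asm("r3");

		/* unpatched: executes "li r3, -1; blr" and returns -1 */
		asm volatile("bl epapr_hypercall_start"
			     : "=r"(r3), "+r"(r11)
			     :
			     : "r0", "r4", "r5", "r6", "r7", "r8", "r9",
			       "r10", "r12", "lr", "ctr", "cr0", "memory");
		return r3;
	}

kvm_hypercall() in arch/powerpc/kernel/kvm.c (next diff) is the real in-tree caller of this entry point.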
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 02c167db6ba0..867db1de8949 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -31,6 +31,7 @@
 #include <asm/cacheflush.h>
 #include <asm/disassemble.h>
 #include <asm/ppc-opcode.h>
+#include <asm/epapr_hcalls.h>
 
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -726,7 +727,7 @@ unsigned long kvm_hypercall(unsigned long *in,
 	unsigned long register r11 asm("r11") = nr;
 	unsigned long register r12 asm("r12");
 
-	asm volatile("bl	kvm_hypercall_start"
+	asm volatile("bl	epapr_hypercall_start"
		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
		       "=r"(r12)
@@ -747,29 +748,6 @@ unsigned long kvm_hypercall(unsigned long *in,
 }
 EXPORT_SYMBOL_GPL(kvm_hypercall);
 
-static int kvm_para_setup(void)
-{
-	extern u32 kvm_hypercall_start;
-	struct device_node *hyper_node;
-	u32 *insts;
-	int len, i;
-
-	hyper_node = of_find_node_by_path("/hypervisor");
-	if (!hyper_node)
-		return -1;
-
-	insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
-	if (len % 4)
-		return -1;
-	if (len > (4 * 4))
-		return -1;
-
-	for (i = 0; i < (len / 4); i++)
-		kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
-
-	return 0;
-}
-
 static __init void kvm_free_tmp(void)
 {
 	unsigned long start, end;
@@ -791,7 +769,7 @@ static int __init kvm_guest_init(void)
 	if (!kvm_para_available())
 		goto free_tmp;
 
-	if (kvm_para_setup())
+	if (!epapr_paravirt_enabled)
 		goto free_tmp;
 
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index e291cf3cf954..e100ff324a85 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -24,16 +24,6 @@
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
-/* Hypercall entry point. Will be patched with device tree instructions. */
-
-.global kvm_hypercall_start
-kvm_hypercall_start:
-	li	r3, -1
-	nop
-	nop
-	nop
-	blr
-
 #define KVM_MAGIC_PAGE		(-4096)
 
 #ifdef CONFIG_64BIT
@@ -132,7 +122,7 @@ kvm_emulate_mtmsrd_len:
 	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
 
 
-#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
 #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
 
 .global kvm_emulate_mtmsr
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 80a577517584..d03eb6f7b058 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,56 +37,121 @@
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63
 
-long kvmppc_alloc_hpt(struct kvm *kvm)
+/* Power architecture requires HPT is at least 256kB */
+#define PPC_MIN_HPT_ORDER	18
+
+long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
 	unsigned long hpt;
-	long lpid;
 	struct revmap_entry *rev;
 	struct kvmppc_linear_info *li;
+	long order = kvm_hpt_order;
 
-	/* Allocate guest's hashed page table */
-	li = kvm_alloc_hpt();
-	if (li) {
-		/* using preallocated memory */
-		hpt = (ulong)li->base_virt;
-		kvm->arch.hpt_li = li;
-	} else {
-		/* using dynamic memory */
+	if (htab_orderp) {
+		order = *htab_orderp;
+		if (order < PPC_MIN_HPT_ORDER)
+			order = PPC_MIN_HPT_ORDER;
+	}
+
+	/*
+	 * If the user wants a different size from default,
+	 * try first to allocate it from the kernel page allocator.
+	 */
+	hpt = 0;
+	if (order != kvm_hpt_order) {
 		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
-				       __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
+				       __GFP_NOWARN, order - PAGE_SHIFT);
+		if (!hpt)
+			--order;
 	}
 
+	/* Next try to allocate from the preallocated pool */
 	if (!hpt) {
-		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
-		return -ENOMEM;
+		li = kvm_alloc_hpt();
+		if (li) {
+			hpt = (ulong)li->base_virt;
+			kvm->arch.hpt_li = li;
+			order = kvm_hpt_order;
+		}
 	}
+
+	/* Lastly try successively smaller sizes from the page allocator */
+	while (!hpt && order > PPC_MIN_HPT_ORDER) {
+		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
+				       __GFP_NOWARN, order - PAGE_SHIFT);
+		if (!hpt)
+			--order;
+	}
+
+	if (!hpt)
+		return -ENOMEM;
+
 	kvm->arch.hpt_virt = hpt;
+	kvm->arch.hpt_order = order;
+	/* HPTEs are 2**4 bytes long */
+	kvm->arch.hpt_npte = 1ul << (order - 4);
+	/* 128 (2**7) bytes in each HPTEG */
+	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
 
 	/* Allocate reverse map array */
-	rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
+	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
 	if (!rev) {
 		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
 		goto out_freehpt;
 	}
 	kvm->arch.revmap = rev;
+	kvm->arch.sdr1 = __pa(hpt) | (order - 18);
 
-	lpid = kvmppc_alloc_lpid();
-	if (lpid < 0)
-		goto out_freeboth;
+	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
+		hpt, order, kvm->arch.lpid);
 
-	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
-	kvm->arch.lpid = lpid;
-
-	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+	if (htab_orderp)
+		*htab_orderp = order;
 	return 0;
 
- out_freeboth:
-	vfree(rev);
 out_freehpt:
-	free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+	if (kvm->arch.hpt_li)
+		kvm_release_hpt(kvm->arch.hpt_li);
+	else
+		free_pages(hpt, order - PAGE_SHIFT);
 	return -ENOMEM;
 }
 
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+{
+	long err = -EBUSY;
+	long order;
+
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.rma_setup_done) {
+		kvm->arch.rma_setup_done = 0;
+		/* order rma_setup_done vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.rma_setup_done = 1;
+			goto out;
+		}
+	}
+	if (kvm->arch.hpt_virt) {
+		order = kvm->arch.hpt_order;
+		/* Set the entire HPT to 0, i.e. invalid HPTEs */
+		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+		/*
+		 * Set the whole last_vcpu array to an invalid vcpu number.
+		 * This ensures that each vcpu will flush its TLB on next entry.
+		 */
+		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+		*htab_orderp = order;
+		err = 0;
+	} else {
+		err = kvmppc_alloc_hpt(kvm, htab_orderp);
+		order = *htab_orderp;
+	}
+ out:
+	mutex_unlock(&kvm->lock);
+	return err;
+}
+
 void kvmppc_free_hpt(struct kvm *kvm)
 {
 	kvmppc_free_lpid(kvm->arch.lpid);
@@ -94,7 +159,8 @@ void kvmppc_free_hpt(struct kvm *kvm)
 	if (kvm->arch.hpt_li)
 		kvm_release_hpt(kvm->arch.hpt_li);
 	else
-		free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+		free_pages(kvm->arch.hpt_virt,
+			   kvm->arch.hpt_order - PAGE_SHIFT);
 }
 
 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -119,6 +185,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	unsigned long psize;
 	unsigned long hp0, hp1;
 	long ret;
+	struct kvm *kvm = vcpu->kvm;
 
 	psize = 1ul << porder;
 	npages = memslot->npages >> (porder - PAGE_SHIFT);
@@ -127,8 +194,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	if (npages > 1ul << (40 - porder))
 		npages = 1ul << (40 - porder);
 	/* Can't use more than 1 HPTE per HPTEG */
-	if (npages > HPT_NPTEG)
-		npages = HPT_NPTEG;
+	if (npages > kvm->arch.hpt_mask + 1)
+		npages = kvm->arch.hpt_mask + 1;
 
 	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -138,7 +205,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	for (i = 0; i < npages; ++i) {
 		addr = i << porder;
 		/* can't use hpt_hash since va > 64 bits */
-		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
 		/*
 		 * We assume that the hash table is empty and no
 		 * vcpus are using it at this stage. Since we create
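
The geometry fields set up in kvmppc_alloc_hpt() follow directly from the order: HPTEs are 16 bytes (2**4) and HPTEGs are 128 bytes (2**7, i.e. 8 HPTEs), so for the default order 24 (16 MB) the table holds 2**20 HPTEs in 2**17 HPTEGs. A standalone check of the arithmetic, with the default worked through:

	#include <assert.h>

	int main(void)
	{
		unsigned long order = 24;                       /* KVM_DEFAULT_HPT_ORDER, 16 MB */
		unsigned long npte  = 1ul << (order - 4);       /* number of 16-byte HPTEs */
		unsigned long mask  = (1ul << (order - 7)) - 1; /* hash mask over HPTEGs */

		assert(npte == 1048576);
		assert(mask + 1 == 131072);
		assert(npte == (mask + 1) * 8);                 /* 8 HPTEs per HPTEG */
		return 0;
	}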
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3abe1b86e583..83e929e66f9d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -56,7 +56,7 @@
 /* #define EXIT_DEBUG_INT */
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
-static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
@@ -1104,11 +1104,15 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		return -EINTR;
 	}
 
-	/* On the first time here, set up VRMA or RMA */
+	atomic_inc(&vcpu->kvm->arch.vcpus_running);
+	/* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+	smp_mb();
+
+	/* On the first time here, set up HTAB and VRMA or RMA */
 	if (!vcpu->kvm->arch.rma_setup_done) {
-		r = kvmppc_hv_setup_rma(vcpu);
+		r = kvmppc_hv_setup_htab_rma(vcpu);
 		if (r)
-			return r;
+			goto out;
 	}
 
 	flush_fp_to_thread(current);
@@ -1126,6 +1130,9 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			kvmppc_core_prepare_to_enter(vcpu);
 		}
 	} while (r == RESUME_GUEST);
+
+ out:
+	atomic_dec(&vcpu->kvm->arch.vcpus_running);
 	return r;
 }
 
@@ -1341,7 +1348,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 {
 }
 
-static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
 	struct kvm *kvm = vcpu->kvm;
@@ -1360,6 +1367,15 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
 	if (kvm->arch.rma_setup_done)
 		goto out;	/* another vcpu beat us to it */
 
+	/* Allocate hashed page table (if not done already) and reset it */
+	if (!kvm->arch.hpt_virt) {
+		err = kvmppc_alloc_hpt(kvm, NULL);
+		if (err) {
+			pr_err("KVM: Couldn't alloc HPT\n");
+			goto out;
+		}
+	}
+
 	/* Look up the memslot for guest physical address 0 */
 	memslot = gfn_to_memslot(kvm, 0);
 
@@ -1471,13 +1487,14 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
-	long r;
-	unsigned long lpcr;
+	unsigned long lpcr, lpid;
 
-	/* Allocate hashed page table */
-	r = kvmppc_alloc_hpt(kvm);
-	if (r)
-		return r;
+	/* Allocate the guest's logical partition ID */
+
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		return -ENOMEM;
+	kvm->arch.lpid = lpid;
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
@@ -1487,7 +1504,6 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
 		/* PPC970; HID4 is effectively the LPCR */
-		unsigned long lpid = kvm->arch.lpid;
 		kvm->arch.host_lpid = 0;
 		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
 		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
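
The vcpus_running counter added above pairs with rma_setup_done as a Dekker-style handshake: each side publishes its own store, issues smp_mb(), then reads the other side's variable, so at least one of two racing paths is guaranteed to notice the other. Stripped to the ordering logic (a sketch using kernel primitives, not standalone code):

	/* resize path, kvmppc_alloc_reset_hpt() */
	kvm->arch.rma_setup_done = 0;
	smp_mb();				/* store flag, then load counter */
	if (atomic_read(&kvm->arch.vcpus_running)) {
		kvm->arch.rma_setup_done = 1;	/* a vcpu raced in: back off, -EBUSY */
		return -EBUSY;
	}
	/* no vcpu can enter now: safe to zero the HPT */

	/* vcpu entry path, kvmppc_vcpu_run() */
	atomic_inc(&vcpu->kvm->arch.vcpus_running);
	smp_mb();				/* store counter, then load flag */
	if (!vcpu->kvm->arch.rma_setup_done)
		/* redo setup under kvm->lock before entering the guest */;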
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index e1b60f56f2a1..fb4eac290fef 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,9 @@ static void __init kvm_linear_init_one(ulong size, int count, int type);
 static struct kvmppc_linear_info *kvm_alloc_linear(int type);
 static void kvm_release_linear(struct kvmppc_linear_info *ri);
 
+int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER;
+EXPORT_SYMBOL_GPL(kvm_hpt_order);
+
 /*************** RMA *************/
 
 /*
@@ -209,7 +212,7 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri)
 void __init kvm_linear_init(void)
 {
 	/* HPT */
-	kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT);
+	kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT);
 
 	/* RMA */
 	/* Only do this on PPC970 in HV mode */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index cec4daddbf31..5c70d19494f9 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -237,7 +237,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 
 	/* Find and lock the HPTEG slot to use */
  do_insert:
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
@@ -352,7 +352,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long v, r, rb;
 	struct revmap_entry *rev;
 
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
@@ -419,7 +419,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 			i = 4;
 			break;
 		}
-		if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) {
+		if (req != 1 || flags == 3 ||
+		    pte_index >= kvm->arch.hpt_npte) {
 			/* parameter error */
 			args[j] = ((0xa0 | flags) << 56) + pte_index;
 			ret = H_PARAMETER;
@@ -521,7 +522,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct revmap_entry *rev;
 	unsigned long v, r, rb, mask, bits;
 
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
@@ -583,7 +584,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 	int i, n = 1;
 	struct revmap_entry *rev = NULL;
 
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
 		pte_index &= ~3;
@@ -678,7 +679,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		somask = (1UL << 28) - 1;
 		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
 	}
-	hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
 	avpn = slb_v & ~(somask >> 16);	/* also includes B */
 	avpn |= (eaddr & somask) >> 16;
 
@@ -723,7 +724,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		if (val & HPTE_V_SECONDARY)
 			break;
 		val |= HPTE_V_SECONDARY;
-		hash = hash ^ HPT_HASH_MASK;
+		hash = hash ^ kvm->arch.hpt_mask;
 	}
 	return -1;
 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 72f13f4a06e0..d25a097c852b 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -612,6 +612,12 @@ static void kvmppc_fill_pt_regs(struct pt_regs *regs)
 	regs->link = lr;
 }
 
+/*
+ * For interrupts needed to be handled by host interrupt handlers,
+ * corresponding host handler are called from here in similar way
+ * (but not exact) as they are called from low level handler
+ * (such as from arch/powerpc/kernel/head_fsl_booke.S).
+ */
 static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 				     unsigned int exit_nr)
 {
@@ -639,6 +645,17 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 		kvmppc_fill_pt_regs(&regs);
 		performance_monitor_exception(&regs);
 		break;
+	case BOOKE_INTERRUPT_WATCHDOG:
+		kvmppc_fill_pt_regs(&regs);
+#ifdef CONFIG_BOOKE_WDT
+		WatchdogException(&regs);
+#else
+		unknown_exception(&regs);
+#endif
+		break;
+	case BOOKE_INTERRUPT_CRITICAL:
+		unknown_exception(&regs);
+		break;
 	}
 }
 
@@ -683,6 +700,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 
+	case BOOKE_INTERRUPT_WATCHDOG:
+		r = RESUME_GUEST;
+		break;
+
 	case BOOKE_INTERRUPT_DOORBELL:
 		kvmppc_account_exit(vcpu, DBELL_EXITS);
 		r = RESUME_GUEST;
@@ -1267,6 +1288,11 @@ void kvmppc_decrementer_func(unsigned long data)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
+	if (vcpu->arch.tcr & TCR_ARE) {
+		vcpu->arch.dec = vcpu->arch.decar;
+		kvmppc_emulate_dec(vcpu);
+	}
+
 	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
 }
 
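
The TCR[ARE] hunk makes the emulated decrementer periodic: on expiry the guest's DECAR value is loaded back into DEC instead of letting the counter stop. A toy model of the behaviour being emulated (hypothetical helper, not kernel code):

	#include <stdbool.h>
	#include <stdint.h>

	struct dec_model {
		uint32_t dec, decar;
		bool auto_reload;		/* TCR[ARE] */
	};

	/* one timebase tick; returns true when the decrementer fires */
	static bool dec_tick(struct dec_model *t)
	{
		if (t->dec == 0 || --t->dec != 0)
			return false;
		if (t->auto_reload)
			t->dec = t->decar;	/* reload and keep counting */
		return true;			/* raise TSR[DIS] */
	}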
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 6c76397f2af4..12834bb608ab 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -24,6 +24,7 @@
 #include "booke.h"
 
 #define OP_19_XOP_RFI     50
+#define OP_19_XOP_RFCI    51
 
 #define OP_31_XOP_MFMSR   83
 #define OP_31_XOP_WRTEE   131
@@ -36,6 +37,12 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
 	kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
 }
 
+static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pc = vcpu->arch.csrr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.csrr1);
+}
+
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance)
 {
@@ -52,6 +59,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			*advance = 0;
 			break;
 
+		case OP_19_XOP_RFCI:
+			kvmppc_emul_rfci(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS);
+			*advance = 0;
+			break;
+
 		default:
 			emulated = EMULATE_FAIL;
 			break;
@@ -113,6 +126,12 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 	case SPRN_ESR:
 		vcpu->arch.shared->esr = spr_val;
 		break;
+	case SPRN_CSRR0:
+		vcpu->arch.csrr0 = spr_val;
+		break;
+	case SPRN_CSRR1:
+		vcpu->arch.csrr1 = spr_val;
+		break;
 	case SPRN_DBCR0:
 		vcpu->arch.dbcr0 = spr_val;
 		break;
@@ -129,6 +148,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
+	case SPRN_DECAR:
+		vcpu->arch.decar = spr_val;
+		break;
 	/*
 	 * Note: SPRG4-7 are user-readable.
 	 * These values are loaded into the real SPRGs when resuming the
@@ -229,6 +251,12 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 	case SPRN_ESR:
 		*spr_val = vcpu->arch.shared->esr;
 		break;
+	case SPRN_CSRR0:
+		*spr_val = vcpu->arch.csrr0;
+		break;
+	case SPRN_CSRR1:
+		*spr_val = vcpu->arch.csrr1;
+		break;
 	case SPRN_DBCR0:
 		*spr_val = vcpu->arch.dbcr0;
 		break;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 8fd4b2a0911b..bb46b32f9813 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -52,16 +52,21 @@
                         (1<<BOOKE_INTERRUPT_PROGRAM) | \
                         (1<<BOOKE_INTERRUPT_DTLB_MISS))
 
-.macro KVM_HANDLER ivor_nr
+.macro KVM_HANDLER ivor_nr scratch srr0
 _GLOBAL(kvmppc_handler_\ivor_nr)
 	/* Get pointer to vcpu and record exit number. */
-	mtspr	SPRN_SPRG_WSCRATCH0, r4
+	mtspr	\scratch , r4
 	mfspr	r4, SPRN_SPRG_RVCPU
+	stw	r3, VCPU_GPR(R3)(r4)
 	stw	r5, VCPU_GPR(R5)(r4)
 	stw	r6, VCPU_GPR(R6)(r4)
+	mfspr	r3, \scratch
 	mfctr	r5
-	lis	r6, kvmppc_resume_host@h
+	stw	r3, VCPU_GPR(R4)(r4)
 	stw	r5, VCPU_CTR(r4)
+	mfspr	r3, \srr0
+	lis	r6, kvmppc_resume_host@h
+	stw	r3, VCPU_PC(r4)
 	li	r5, \ivor_nr
 	ori	r6, r6, kvmppc_resume_host@l
 	mtctr	r6
@@ -69,37 +74,35 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
 .endm
 
 _GLOBAL(kvmppc_handlers_start)
-KVM_HANDLER BOOKE_INTERRUPT_CRITICAL
-KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK
-KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE
-KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE
-KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL
-KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT
-KVM_HANDLER BOOKE_INTERRUPT_PROGRAM
-KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL
-KVM_HANDLER BOOKE_INTERRUPT_SYSCALL
-KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL
-KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER
-KVM_HANDLER BOOKE_INTERRUPT_FIT
-KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG
-KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS
-KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS
-KVM_HANDLER BOOKE_INTERRUPT_DEBUG
-KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL
-KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA
-KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND
+KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK  SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 
 _GLOBAL(kvmppc_handler_len)
 	.long kvmppc_handler_1 - kvmppc_handler_0
 
-
 /* Registers:
  *  SPRG_SCRATCH0: guest r4
  *  r4: vcpu pointer
  *  r5: KVM exit number
  */
 _GLOBAL(kvmppc_resume_host)
-	stw	r3, VCPU_GPR(R3)(r4)
 	mfcr	r3
 	stw	r3, VCPU_CR(r4)
 	stw	r7, VCPU_GPR(R7)(r4)
@@ -180,10 +183,6 @@ _GLOBAL(kvmppc_resume_host)
 	stw	r3, VCPU_LR(r4)
 	mfxer	r3
 	stw	r3, VCPU_XER(r4)
-	mfspr	r3, SPRN_SPRG_RSCRATCH0
-	stw	r3, VCPU_GPR(R4)(r4)
-	mfspr	r3, SPRN_SRR0
-	stw	r3, VCPU_PC(r4)
 
 	/* Restore host stack pointer and PID before IVPR, since the host
 	 * exception handlers use them. */
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 1685dc43bcf2..d28c2d43ac1b 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -262,7 +262,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
 kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
 	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
 kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
-	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR)
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
 kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
 kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
 kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 8b99e076dc81..e04b0ef55ce0 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 		*spr_val = vcpu->arch.shared->mas7_3 >> 32;
 		break;
 #endif
+	case SPRN_DECAR:
+		*spr_val = vcpu->arch.decar;
+		break;
 	case SPRN_TLB0CFG:
 		*spr_val = vcpu->arch.tlbcfg[0];
 		break;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index fe6c1de6b701..1f89d26e65fb 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Varun Sethi, <varun.sethi@freescale.com>
  *
@@ -57,7 +57,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
 			   struct kvm_book3e_206_tlb_entry *gtlbe)
 {
 	unsigned int tid, ts;
-	u32 val, eaddr, lpid;
+	gva_t eaddr;
+	u32 val, lpid;
 	unsigned long flags;
 
 	ts = get_tlb_ts(gtlbe);
@@ -183,6 +184,9 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
 				 SPRN_EPCR_DUVD;
+#ifdef CONFIG_64BIT
+	vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM;
+#endif
 	vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP;
 	vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT);
 	vcpu->arch.epsc = vcpu->arch.eplc;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index f90e86dea7a2..ee04abaefe23 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -59,11 +59,13 @@
 #define OP_31_XOP_STHBRX    918
 
 #define OP_LWZ  32
+#define OP_LD   58
 #define OP_LWZU 33
 #define OP_LBZ  34
 #define OP_LBZU 35
 #define OP_STW  36
 #define OP_STWU 37
+#define OP_STD  62
 #define OP_STB  38
 #define OP_STBU 39
 #define OP_LHZ  40
@@ -392,6 +394,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 		break;
 
+	/* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */
+	case OP_LD:
+		rt = get_rt(inst);
+		emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+		break;
+
 	case OP_LWZU:
 		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
@@ -412,6 +420,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 					       4, 1);
 		break;
 
+	/* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */
+	case OP_STD:
+		rs = get_rs(inst);
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+					       8, 1);
+		break;
+
 	case OP_STWU:
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
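
OP_LD (58) and OP_STD (62) are DS-form primary opcodes, and the low two bits of a DS-form instruction select the variant (ld/ldu/lwa under 58; std/stdu under 62), which is exactly what the TBD comments above are deferring. Hypothetical decode helpers showing the fields involved:

	#include <stdint.h>

	static inline unsigned int primary_op(uint32_t inst) { return inst >> 26; }
	/* DS-form XO field: for 58, 0 = ld, 1 = ldu, 2 = lwa; for 62, 0 = std, 1 = stdu */
	static inline unsigned int ds_variant(uint32_t inst) { return inst & 0x3; }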
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1493c8de947b..87f4dc886076 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -246,6 +246,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_PPC_ALLOC_HTAB:
 		r = 1;
 		break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -802,6 +803,23 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = -EFAULT;
 		break;
 	}
+
+	case KVM_PPC_ALLOCATE_HTAB: {
+		struct kvm *kvm = filp->private_data;
+		u32 htab_order;
+
+		r = -EFAULT;
+		if (get_user(htab_order, (u32 __user *)argp))
+			break;
+		r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+		if (r)
+			break;
+		r = -EFAULT;
+		if (put_user(htab_order, (u32 __user *)argp))
+			break;
+		r = 0;
+		break;
+	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index a35ca44ade66..e7a896acd982 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -25,6 +25,7 @@ source "arch/powerpc/platforms/wsp/Kconfig"
 config KVM_GUEST
 	bool "KVM Guest support"
 	default n
+	select EPAPR_PARAVIRT
 	---help---
 	  This option enables various optimizations for running under the KVM
 	  hypervisor. Overhead for the kernel when not running inside KVM should
@@ -32,6 +33,14 @@ config KVM_GUEST
 
 	  In case of doubt, say Y
 
+config EPAPR_PARAVIRT
+	bool "ePAPR para-virtualization support"
+	default n
+	help
+	  Enables ePAPR para-virtualization support for guests.
+
+	  In case of doubt, say Y
+
 config PPC_NATIVE
 	bool
 	depends on 6xx || PPC64
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 8685d1fb8b75..e62a555557ee 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -53,5 +53,7 @@ int sclp_chp_configure(struct chp_id chpid);
 int sclp_chp_deconfigure(struct chp_id chpid);
 int sclp_chp_read_info(struct sclp_chp_info *info);
 void sclp_get_ipl_info(struct sclp_ipl_info *info);
+bool sclp_has_linemode(void);
+bool sclp_has_vt220(void);
 
 #endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index 7306270b5b84..5a87d16d3e7c 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -24,6 +24,7 @@
 
 #define SIGP_STATUS_CHECK_STOP		0x00000010UL
 #define SIGP_STATUS_STOPPED		0x00000040UL
+#define SIGP_STATUS_EXT_CALL_PENDING	0x00000080UL
 #define SIGP_STATUS_INVALID_PARAMETER	0x00000100UL
 #define SIGP_STATUS_INCORRECT_STATE	0x00000200UL
 #define SIGP_STATUS_NOT_RUNNING		0x00000400UL
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 34d75b50526c..743c0f32fe3b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c | |||
@@ -61,6 +61,7 @@ | |||
61 | #include <asm/kvm_virtio.h> | 61 | #include <asm/kvm_virtio.h> |
62 | #include <asm/diag.h> | 62 | #include <asm/diag.h> |
63 | #include <asm/os_info.h> | 63 | #include <asm/os_info.h> |
64 | #include <asm/sclp.h> | ||
64 | #include "entry.h" | 65 | #include "entry.h" |
65 | 66 | ||
66 | long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | | 67 | long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | |
@@ -136,9 +137,14 @@ __setup("condev=", condev_setup); | |||
136 | 137 | ||
137 | static void __init set_preferred_console(void) | 138 | static void __init set_preferred_console(void) |
138 | { | 139 | { |
139 | if (MACHINE_IS_KVM) | 140 | if (MACHINE_IS_KVM) { |
140 | add_preferred_console("hvc", 0, NULL); | 141 | if (sclp_has_vt220()) |
141 | else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) | 142 | add_preferred_console("ttyS", 1, NULL); |
143 | else if (sclp_has_linemode()) | ||
144 | add_preferred_console("ttyS", 0, NULL); | ||
145 | else | ||
146 | add_preferred_console("hvc", 0, NULL); | ||
147 | } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) | ||
142 | add_preferred_console("ttyS", 0, NULL); | 148 | add_preferred_console("ttyS", 0, NULL); |
143 | else if (CONSOLE_IS_3270) | 149 | else if (CONSOLE_IS_3270) |
144 | add_preferred_console("tty3270", 0, NULL); | 150 | add_preferred_console("tty3270", 0, NULL); |
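With sclp_has_linemode() and sclp_has_vt220() available, a KVM guest now prefers the SCLP VT220 console (registered as ttyS1), then the SCLP line-mode console (ttyS0), and only then falls back to the virtio hvc console. A stand-alone sketch of the same ladder, with the SCLP probes stubbed out as assumptions:

#include <stdbool.h>
#include <stdio.h>

/* Stubbed probes; in the kernel these query the SCLP facility
 * (see the sclp.h hunk above). */
static bool sclp_has_vt220(void)    { return true;  }
static bool sclp_has_linemode(void) { return false; }

int main(void)
{
	if (sclp_has_vt220())
		puts("preferred console: ttyS1 (SCLP VT220)");
	else if (sclp_has_linemode())
		puts("preferred console: ttyS0 (SCLP line mode)");
	else
		puts("preferred console: hvc0 (virtio console)");
	return 0;
}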
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c552d1f4103f..d470ccbfabae 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c | |||
@@ -347,6 +347,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) | |||
347 | vcpu->arch.guest_fpregs.fpc = 0; | 347 | vcpu->arch.guest_fpregs.fpc = 0; |
348 | asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); | 348 | asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); |
349 | vcpu->arch.sie_block->gbea = 1; | 349 | vcpu->arch.sie_block->gbea = 1; |
350 | atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); | ||
350 | } | 351 | } |
351 | 352 | ||
352 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | 353 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 1ab2ce1611c5..56f80e1f98f7 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c | |||
@@ -26,19 +26,23 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, | |||
26 | int rc; | 26 | int rc; |
27 | 27 | ||
28 | if (cpu_addr >= KVM_MAX_VCPUS) | 28 | if (cpu_addr >= KVM_MAX_VCPUS) |
29 | return 3; /* not operational */ | 29 | return SIGP_CC_NOT_OPERATIONAL; |
30 | 30 | ||
31 | spin_lock(&fi->lock); | 31 | spin_lock(&fi->lock); |
32 | if (fi->local_int[cpu_addr] == NULL) | 32 | if (fi->local_int[cpu_addr] == NULL) |
33 | rc = 3; /* not operational */ | 33 | rc = SIGP_CC_NOT_OPERATIONAL; |
34 | else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) | 34 | else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) |
35 | & CPUSTAT_STOPPED)) { | 35 | & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) |
36 | *reg &= 0xffffffff00000000UL; | 36 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
37 | rc = 1; /* status stored */ | 37 | else { |
38 | } else { | ||
39 | *reg &= 0xffffffff00000000UL; | 38 | *reg &= 0xffffffff00000000UL; |
40 | *reg |= SIGP_STATUS_STOPPED; | 39 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) |
41 | rc = 1; /* status stored */ | 40 | & CPUSTAT_ECALL_PEND) |
41 | *reg |= SIGP_STATUS_EXT_CALL_PENDING; | ||
42 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) | ||
43 | & CPUSTAT_STOPPED) | ||
44 | *reg |= SIGP_STATUS_STOPPED; | ||
45 | rc = SIGP_CC_STATUS_STORED; | ||
42 | } | 46 | } |
43 | spin_unlock(&fi->lock); | 47 | spin_unlock(&fi->lock); |
44 | 48 | ||
@@ -54,7 +58,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
54 | int rc; | 58 | int rc; |
55 | 59 | ||
56 | if (cpu_addr >= KVM_MAX_VCPUS) | 60 | if (cpu_addr >= KVM_MAX_VCPUS) |
57 | return 3; /* not operational */ | 61 | return SIGP_CC_NOT_OPERATIONAL; |
58 | 62 | ||
59 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 63 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
60 | if (!inti) | 64 | if (!inti) |
@@ -66,7 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
66 | spin_lock(&fi->lock); | 70 | spin_lock(&fi->lock); |
67 | li = fi->local_int[cpu_addr]; | 71 | li = fi->local_int[cpu_addr]; |
68 | if (li == NULL) { | 72 | if (li == NULL) { |
69 | rc = 3; /* not operational */ | 73 | rc = SIGP_CC_NOT_OPERATIONAL; |
70 | kfree(inti); | 74 | kfree(inti); |
71 | goto unlock; | 75 | goto unlock; |
72 | } | 76 | } |
@@ -77,7 +81,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
77 | if (waitqueue_active(&li->wq)) | 81 | if (waitqueue_active(&li->wq)) |
78 | wake_up_interruptible(&li->wq); | 82 | wake_up_interruptible(&li->wq); |
79 | spin_unlock_bh(&li->lock); | 83 | spin_unlock_bh(&li->lock); |
80 | rc = 0; /* order accepted */ | 84 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
81 | VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); | 85 | VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); |
82 | unlock: | 86 | unlock: |
83 | spin_unlock(&fi->lock); | 87 | spin_unlock(&fi->lock); |
@@ -92,7 +96,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
92 | int rc; | 96 | int rc; |
93 | 97 | ||
94 | if (cpu_addr >= KVM_MAX_VCPUS) | 98 | if (cpu_addr >= KVM_MAX_VCPUS) |
95 | return 3; /* not operational */ | 99 | return SIGP_CC_NOT_OPERATIONAL; |
96 | 100 | ||
97 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 101 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
98 | if (!inti) | 102 | if (!inti) |
@@ -104,7 +108,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
104 | spin_lock(&fi->lock); | 108 | spin_lock(&fi->lock); |
105 | li = fi->local_int[cpu_addr]; | 109 | li = fi->local_int[cpu_addr]; |
106 | if (li == NULL) { | 110 | if (li == NULL) { |
107 | rc = 3; /* not operational */ | 111 | rc = SIGP_CC_NOT_OPERATIONAL; |
108 | kfree(inti); | 112 | kfree(inti); |
109 | goto unlock; | 113 | goto unlock; |
110 | } | 114 | } |
@@ -115,7 +119,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
115 | if (waitqueue_active(&li->wq)) | 119 | if (waitqueue_active(&li->wq)) |
116 | wake_up_interruptible(&li->wq); | 120 | wake_up_interruptible(&li->wq); |
117 | spin_unlock_bh(&li->lock); | 121 | spin_unlock_bh(&li->lock); |
118 | rc = 0; /* order accepted */ | 122 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
119 | VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); | 123 | VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); |
120 | unlock: | 124 | unlock: |
121 | spin_unlock(&fi->lock); | 125 | spin_unlock(&fi->lock); |
@@ -143,7 +147,7 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) | |||
143 | out: | 147 | out: |
144 | spin_unlock_bh(&li->lock); | 148 | spin_unlock_bh(&li->lock); |
145 | 149 | ||
146 | return 0; /* order accepted */ | 150 | return SIGP_CC_ORDER_CODE_ACCEPTED; |
147 | } | 151 | } |
148 | 152 | ||
149 | static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) | 153 | static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) |
@@ -153,12 +157,12 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) | |||
153 | int rc; | 157 | int rc; |
154 | 158 | ||
155 | if (cpu_addr >= KVM_MAX_VCPUS) | 159 | if (cpu_addr >= KVM_MAX_VCPUS) |
156 | return 3; /* not operational */ | 160 | return SIGP_CC_NOT_OPERATIONAL; |
157 | 161 | ||
158 | spin_lock(&fi->lock); | 162 | spin_lock(&fi->lock); |
159 | li = fi->local_int[cpu_addr]; | 163 | li = fi->local_int[cpu_addr]; |
160 | if (li == NULL) { | 164 | if (li == NULL) { |
161 | rc = 3; /* not operational */ | 165 | rc = SIGP_CC_NOT_OPERATIONAL; |
162 | goto unlock; | 166 | goto unlock; |
163 | } | 167 | } |
164 | 168 | ||
@@ -182,11 +186,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) | |||
182 | 186 | ||
183 | switch (parameter & 0xff) { | 187 | switch (parameter & 0xff) { |
184 | case 0: | 188 | case 0: |
185 | rc = 3; /* not operational */ | 189 | rc = SIGP_CC_NOT_OPERATIONAL; |
186 | break; | 190 | break; |
187 | case 1: | 191 | case 1: |
188 | case 2: | 192 | case 2: |
189 | rc = 0; /* order accepted */ | 193 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
190 | break; | 194 | break; |
191 | default: | 195 | default: |
192 | rc = -EOPNOTSUPP; | 196 | rc = -EOPNOTSUPP; |
@@ -207,21 +211,23 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
207 | address = address & 0x7fffe000u; | 211 | address = address & 0x7fffe000u; |
208 | if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || | 212 | if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || |
209 | copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { | 213 | copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { |
214 | *reg &= 0xffffffff00000000UL; | ||
210 | *reg |= SIGP_STATUS_INVALID_PARAMETER; | 215 | *reg |= SIGP_STATUS_INVALID_PARAMETER; |
211 | return 1; /* invalid parameter */ | 216 | return SIGP_CC_STATUS_STORED; |
212 | } | 217 | } |
213 | 218 | ||
214 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 219 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
215 | if (!inti) | 220 | if (!inti) |
216 | return 2; /* busy */ | 221 | return SIGP_CC_BUSY; |
217 | 222 | ||
218 | spin_lock(&fi->lock); | 223 | spin_lock(&fi->lock); |
219 | if (cpu_addr < KVM_MAX_VCPUS) | 224 | if (cpu_addr < KVM_MAX_VCPUS) |
220 | li = fi->local_int[cpu_addr]; | 225 | li = fi->local_int[cpu_addr]; |
221 | 226 | ||
222 | if (li == NULL) { | 227 | if (li == NULL) { |
223 | rc = 1; /* incorrect state */ | 228 | *reg &= 0xffffffff00000000UL; |
224 | *reg &= SIGP_STATUS_INCORRECT_STATE; | 229 | *reg |= SIGP_STATUS_INCORRECT_STATE; |
230 | rc = SIGP_CC_STATUS_STORED; | ||
225 | kfree(inti); | 231 | kfree(inti); |
226 | goto out_fi; | 232 | goto out_fi; |
227 | } | 233 | } |
@@ -229,8 +235,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
229 | spin_lock_bh(&li->lock); | 235 | spin_lock_bh(&li->lock); |
230 | /* cpu must be in stopped state */ | 236 | /* cpu must be in stopped state */ |
231 | if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { | 237 | if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { |
232 | rc = 1; /* incorrect state */ | 238 | *reg &= 0xffffffff00000000UL; |
233 | *reg &= SIGP_STATUS_INCORRECT_STATE; | 239 | *reg |= SIGP_STATUS_INCORRECT_STATE; |
240 | rc = SIGP_CC_STATUS_STORED; | ||
234 | kfree(inti); | 241 | kfree(inti); |
235 | goto out_li; | 242 | goto out_li; |
236 | } | 243 | } |
@@ -242,7 +249,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
242 | atomic_set(&li->active, 1); | 249 | atomic_set(&li->active, 1); |
243 | if (waitqueue_active(&li->wq)) | 250 | if (waitqueue_active(&li->wq)) |
244 | wake_up_interruptible(&li->wq); | 251 | wake_up_interruptible(&li->wq); |
245 | rc = 0; /* order accepted */ | 252 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
246 | 253 | ||
247 | VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); | 254 | VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); |
248 | out_li: | 255 | out_li: |
@@ -259,21 +266,21 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, | |||
259 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; | 266 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; |
260 | 267 | ||
261 | if (cpu_addr >= KVM_MAX_VCPUS) | 268 | if (cpu_addr >= KVM_MAX_VCPUS) |
262 | return 3; /* not operational */ | 269 | return SIGP_CC_NOT_OPERATIONAL; |
263 | 270 | ||
264 | spin_lock(&fi->lock); | 271 | spin_lock(&fi->lock); |
265 | if (fi->local_int[cpu_addr] == NULL) | 272 | if (fi->local_int[cpu_addr] == NULL) |
266 | rc = 3; /* not operational */ | 273 | rc = SIGP_CC_NOT_OPERATIONAL; |
267 | else { | 274 | else { |
268 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) | 275 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) |
269 | & CPUSTAT_RUNNING) { | 276 | & CPUSTAT_RUNNING) { |
270 | /* running */ | 277 | /* running */ |
271 | rc = 1; | 278 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
272 | } else { | 279 | } else { |
273 | /* not running */ | 280 | /* not running */ |
274 | *reg &= 0xffffffff00000000UL; | 281 | *reg &= 0xffffffff00000000UL; |
275 | *reg |= SIGP_STATUS_NOT_RUNNING; | 282 | *reg |= SIGP_STATUS_NOT_RUNNING; |
276 | rc = 0; | 283 | rc = SIGP_CC_STATUS_STORED; |
277 | } | 284 | } |
278 | } | 285 | } |
279 | spin_unlock(&fi->lock); | 286 | spin_unlock(&fi->lock); |
@@ -286,23 +293,23 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, | |||
286 | 293 | ||
287 | static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) | 294 | static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) |
288 | { | 295 | { |
289 | int rc = 0; | ||
290 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; | 296 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; |
291 | struct kvm_s390_local_interrupt *li; | 297 | struct kvm_s390_local_interrupt *li; |
298 | int rc = SIGP_CC_ORDER_CODE_ACCEPTED; | ||
292 | 299 | ||
293 | if (cpu_addr >= KVM_MAX_VCPUS) | 300 | if (cpu_addr >= KVM_MAX_VCPUS) |
294 | return 3; /* not operational */ | 301 | return SIGP_CC_NOT_OPERATIONAL; |
295 | 302 | ||
296 | spin_lock(&fi->lock); | 303 | spin_lock(&fi->lock); |
297 | li = fi->local_int[cpu_addr]; | 304 | li = fi->local_int[cpu_addr]; |
298 | if (li == NULL) { | 305 | if (li == NULL) { |
299 | rc = 3; /* not operational */ | 306 | rc = SIGP_CC_NOT_OPERATIONAL; |
300 | goto out; | 307 | goto out; |
301 | } | 308 | } |
302 | 309 | ||
303 | spin_lock_bh(&li->lock); | 310 | spin_lock_bh(&li->lock); |
304 | if (li->action_bits & ACTION_STOP_ON_STOP) | 311 | if (li->action_bits & ACTION_STOP_ON_STOP) |
305 | rc = 2; /* busy */ | 312 | rc = SIGP_CC_BUSY; |
306 | else | 313 | else |
307 | VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", | 314 | VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", |
308 | cpu_addr); | 315 | cpu_addr); |
@@ -377,7 +384,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) | |||
377 | case SIGP_RESTART: | 384 | case SIGP_RESTART: |
378 | vcpu->stat.instruction_sigp_restart++; | 385 | vcpu->stat.instruction_sigp_restart++; |
379 | rc = __sigp_restart(vcpu, cpu_addr); | 386 | rc = __sigp_restart(vcpu, cpu_addr); |
380 | if (rc == 2) /* busy */ | 387 | if (rc == SIGP_CC_BUSY) |
381 | break; | 388 | break; |
382 | /* user space must know about restart */ | 389 | /* user space must know about restart */ |
383 | default: | 390 | default: |
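The sigp.c changes replace bare 0..3 return values with named SIGP condition codes. Reconstructed from the comments the patch removes, the mapping is:

/* SIGP condition codes, as implied by the deleted comments
 * ("order accepted", "status stored", "busy", "not operational"). */
#define SIGP_CC_ORDER_CODE_ACCEPTED	0
#define SIGP_CC_STATUS_STORED		1
#define SIGP_CC_BUSY			2
#define SIGP_CC_NOT_OPERATIONAL		3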
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 88093c1d44fd..3ea51a84a0e4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
@@ -465,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void) | |||
465 | return apic->safe_wait_icr_idle(); | 465 | return apic->safe_wait_icr_idle(); |
466 | } | 466 | } |
467 | 467 | ||
468 | extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); | ||
469 | |||
468 | #else /* CONFIG_X86_LOCAL_APIC */ | 470 | #else /* CONFIG_X86_LOCAL_APIC */ |
469 | 471 | ||
470 | static inline u32 apic_read(u32 reg) { return 0; } | 472 | static inline u32 apic_read(u32 reg) { return 0; } |
@@ -474,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; } | |||
474 | static inline void apic_icr_write(u32 low, u32 high) { } | 476 | static inline void apic_icr_write(u32 low, u32 high) { } |
475 | static inline void apic_wait_icr_idle(void) { } | 477 | static inline void apic_wait_icr_idle(void) { } |
476 | static inline u32 safe_apic_wait_icr_idle(void) { return 0; } | 478 | static inline u32 safe_apic_wait_icr_idle(void) { return 0; } |
479 | static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} | ||
477 | 480 | ||
478 | #endif /* CONFIG_X86_LOCAL_APIC */ | 481 | #endif /* CONFIG_X86_LOCAL_APIC */ |
479 | 482 | ||
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b277220..72f5009deb5a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h | |||
@@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) | |||
264 | * This operation is non-atomic and can be reordered. | 264 | * This operation is non-atomic and can be reordered. |
265 | * If two instances of this operation race, one can appear to succeed | 265 | * If two instances of this operation race, one can appear to succeed
266 | * but actually fail. You must protect multiple accesses with a lock. | 266 | * but actually fail. You must protect multiple accesses with a lock. |
267 | * | ||
268 | * Note: the operation is performed atomically with respect to | ||
269 | * the local CPU, but not other CPUs. Portable code should not | ||
270 | * rely on this behaviour. | ||
271 | * KVM relies on this behaviour on x86 for modifying memory that is also | ||
272 | * accessed from a hypervisor on the same CPU if running in a VM: don't change | ||
273 | * this without also updating arch/x86/kernel/kvm.c | ||
267 | */ | 274 | */ |
268 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | 275 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) |
269 | { | 276 | { |
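The point of the new comment is that the read-modify-write happens in a single instruction on x86, so nothing running on the same CPU, including a hypervisor emulating on behalf of this vCPU, can observe a half-done update. A userspace sketch of the semantics only; the single-instruction property comes from the btr instruction the kernel version compiles to, not from plain C:

#include <stdio.h>

/* Semantics of __test_and_clear_bit(); NOT single-instruction here. */
static int test_and_clear(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	int was_set = (*addr & mask) != 0;

	*addr &= ~mask;
	return was_set;
}

int main(void)
{
	unsigned long flags = 1UL << 3;

	printf("%d\n", test_and_clear(3, &flags)); /* 1: bit was set */
	printf("%d\n", test_and_clear(3, &flags)); /* 0: already clear */
	return 0;
}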
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153c675d..b518c7509933 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h | |||
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper; | |||
49 | extern const struct hypervisor_x86 x86_hyper_vmware; | 49 | extern const struct hypervisor_x86 x86_hyper_vmware; |
50 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; | 50 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; |
51 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; | 51 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; |
52 | extern const struct hypervisor_x86 x86_hyper_kvm; | ||
52 | 53 | ||
53 | static inline bool hypervisor_x2apic_available(void) | 54 | static inline bool hypervisor_x2apic_available(void) |
54 | { | 55 | { |
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d272..246617efd67f 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h | |||
@@ -12,6 +12,7 @@ | |||
12 | /* Select x86 specific features in <linux/kvm.h> */ | 12 | /* Select x86 specific features in <linux/kvm.h> */ |
13 | #define __KVM_HAVE_PIT | 13 | #define __KVM_HAVE_PIT |
14 | #define __KVM_HAVE_IOAPIC | 14 | #define __KVM_HAVE_IOAPIC |
15 | #define __KVM_HAVE_IRQ_LINE | ||
15 | #define __KVM_HAVE_DEVICE_ASSIGNMENT | 16 | #define __KVM_HAVE_DEVICE_ASSIGNMENT |
16 | #define __KVM_HAVE_MSI | 17 | #define __KVM_HAVE_MSI |
17 | #define __KVM_HAVE_USER_NMI | 18 | #define __KVM_HAVE_USER_NMI |
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1ac46c22dd50..c764f43b71c5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
@@ -192,8 +192,8 @@ struct x86_emulate_ops { | |||
192 | struct x86_instruction_info *info, | 192 | struct x86_instruction_info *info, |
193 | enum x86_intercept_stage stage); | 193 | enum x86_intercept_stage stage); |
194 | 194 | ||
195 | bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, | 195 | void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, |
196 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); | 196 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); |
197 | }; | 197 | }; |
198 | 198 | ||
199 | typedef u32 __attribute__((vector_size(16))) sse128_t; | 199 | typedef u32 __attribute__((vector_size(16))) sse128_t; |
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt { | |||
280 | u8 modrm_seg; | 280 | u8 modrm_seg; |
281 | bool rip_relative; | 281 | bool rip_relative; |
282 | unsigned long _eip; | 282 | unsigned long _eip; |
283 | struct operand memop; | ||
283 | /* Fields above regs are cleared together. */ | 284 | /* Fields above regs are cleared together. */ |
284 | unsigned long regs[NR_VCPU_REGS]; | 285 | unsigned long regs[NR_VCPU_REGS]; |
285 | struct operand memop; | ||
286 | struct operand *memopp; | 286 | struct operand *memopp; |
287 | struct fetch_cache fetch; | 287 | struct fetch_cache fetch; |
288 | struct read_cache io_read; | 288 | struct read_cache io_read; |
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2da88c0cda14..09155d64cf7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -48,12 +48,13 @@ | |||
48 | 48 | ||
49 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) | 49 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) |
50 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) | 50 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) |
51 | #define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL | ||
51 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ | 52 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ |
52 | 0xFFFFFF0000000000ULL) | 53 | 0xFFFFFF0000000000ULL) |
53 | #define CR4_RESERVED_BITS \ | 54 | #define CR4_RESERVED_BITS \ |
54 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | 55 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ |
55 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | 56 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ |
56 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | 57 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ |
57 | | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | 58 | | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ |
58 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 59 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
59 | 60 | ||
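With CR4.PCIDE enabled the low 12 bits of CR3 carry a process-context ID instead of being reserved, so only the high bits are checked (CR3_PCID_ENABLED_RESERVED_BITS). A sketch of the distinction, using this mask and X86_CR3_PCID_MASK from the processor-flags.h hunk further down:

#include <stdint.h>
#include <stdbool.h>

#define CR3_PCID_ENABLED_RESERVED_BITS	0xFFFFFF0000000000ULL
#define X86_CR3_PCID_MASK		0x00000fffULL

/* Sketch: validity and PCID extraction when CR4.PCIDE=1. */
static bool cr3_valid_with_pcide(uint64_t cr3)
{
	return (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) == 0;
}

static uint64_t cr3_pcid(uint64_t cr3)
{
	return cr3 & X86_CR3_PCID_MASK;
}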
@@ -175,6 +176,13 @@ enum { | |||
175 | 176 | ||
176 | /* apic attention bits */ | 177 | /* apic attention bits */ |
177 | #define KVM_APIC_CHECK_VAPIC 0 | 178 | #define KVM_APIC_CHECK_VAPIC 0 |
179 | /* | ||
180 | * The following bit is set with PV-EOI, unset on EOI. | ||
181 | * We detect PV-EOI changes by guest by comparing | ||
182 | * this bit with PV-EOI in guest memory. | ||
183 | * See the implementation in apic_update_pv_eoi. | ||
184 | */ | ||
185 | #define KVM_APIC_PV_EOI_PENDING 1 | ||
178 | 186 | ||
179 | /* | 187 | /* |
180 | * We don't want allocation failures within the mmu code, so we preallocate | 188 | * We don't want allocation failures within the mmu code, so we preallocate |
@@ -484,6 +492,11 @@ struct kvm_vcpu_arch { | |||
484 | u64 length; | 492 | u64 length; |
485 | u64 status; | 493 | u64 status; |
486 | } osvw; | 494 | } osvw; |
495 | |||
496 | struct { | ||
497 | u64 msr_val; | ||
498 | struct gfn_to_hva_cache data; | ||
499 | } pv_eoi; | ||
487 | }; | 500 | }; |
488 | 501 | ||
489 | struct kvm_lpage_info { | 502 | struct kvm_lpage_info { |
@@ -661,6 +674,7 @@ struct kvm_x86_ops { | |||
661 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); | 674 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); |
662 | int (*get_lpage_level)(void); | 675 | int (*get_lpage_level)(void); |
663 | bool (*rdtscp_supported)(void); | 676 | bool (*rdtscp_supported)(void); |
677 | bool (*invpcid_supported)(void); | ||
664 | void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); | 678 | void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); |
665 | 679 | ||
666 | void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | 680 | void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); |
@@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
802 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); | 816 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); |
803 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | 817 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); |
804 | 818 | ||
805 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 819 | static inline int __kvm_irq_line_state(unsigned long *irq_state, |
820 | int irq_source_id, int level) | ||
821 | { | ||
822 | /* Logical OR for level-triggered interrupts */ | ||
823 | if (level) | ||
824 | __set_bit(irq_source_id, irq_state); | ||
825 | else | ||
826 | __clear_bit(irq_source_id, irq_state); | ||
827 | |||
828 | return !!(*irq_state); | ||
829 | } | ||
830 | |||
831 | int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); | ||
832 | void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); | ||
806 | 833 | ||
807 | void kvm_inject_nmi(struct kvm_vcpu *vcpu); | 834 | void kvm_inject_nmi(struct kvm_vcpu *vcpu); |
808 | 835 | ||
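__kvm_irq_line_state() keeps one bit per interrupt source and reports the OR of all of them, which matches the behaviour of a shared level-triggered line: it stays asserted until every source has dropped it. A stand-alone demonstration of the same logic:

#include <stdio.h>

/* Portable rendering of __kvm_irq_line_state() for illustration. */
static int irq_line_state(unsigned long *irq_state, int irq_source_id,
			  int level)
{
	if (level)
		*irq_state |= 1UL << irq_source_id;
	else
		*irq_state &= ~(1UL << irq_source_id);

	return *irq_state != 0;
}

int main(void)
{
	unsigned long state = 0;

	printf("%d\n", irq_line_state(&state, 0, 1)); /* 1: source 0 raises */
	printf("%d\n", irq_line_state(&state, 1, 1)); /* 1: source 1 raises */
	printf("%d\n", irq_line_state(&state, 0, 0)); /* 1: source 1 still up */
	printf("%d\n", irq_line_state(&state, 1, 0)); /* 0: line finally drops */
	return 0;
}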
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00e..2f7712e08b1e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #define KVM_FEATURE_CLOCKSOURCE2 3 | 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 |
23 | #define KVM_FEATURE_ASYNC_PF 4 | 23 | #define KVM_FEATURE_ASYNC_PF 4 |
24 | #define KVM_FEATURE_STEAL_TIME 5 | 24 | #define KVM_FEATURE_STEAL_TIME 5 |
25 | #define KVM_FEATURE_PV_EOI 6 | ||
25 | 26 | ||
26 | /* The last 8 bits are used to indicate how to interpret the flags field | 27 | /* The last 8 bits are used to indicate how to interpret the flags field |
27 | * in pvclock structure. If no bits are set, all flags are ignored. | 28 | * in pvclock structure. If no bits are set, all flags are ignored. |
@@ -37,6 +38,7 @@ | |||
37 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 | 38 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 |
38 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 | 39 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 |
39 | #define MSR_KVM_STEAL_TIME 0x4b564d03 | 40 | #define MSR_KVM_STEAL_TIME 0x4b564d03 |
41 | #define MSR_KVM_PV_EOI_EN 0x4b564d04 | ||
40 | 42 | ||
41 | struct kvm_steal_time { | 43 | struct kvm_steal_time { |
42 | __u64 steal; | 44 | __u64 steal; |
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { | |||
89 | __u32 enabled; | 91 | __u32 enabled; |
90 | }; | 92 | }; |
91 | 93 | ||
94 | #define KVM_PV_EOI_BIT 0 | ||
95 | #define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) | ||
96 | #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK | ||
97 | #define KVM_PV_EOI_DISABLED 0x0 | ||
98 | |||
92 | #ifdef __KERNEL__ | 99 | #ifdef __KERNEL__ |
93 | #include <asm/processor.h> | 100 | #include <asm/processor.h> |
94 | 101 | ||
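MSR_KVM_PV_EOI_EN takes the guest-physical address of the PV-EOI flag word with an enable bit ORed into the low bits; the 4-byte alignment of the word keeps those bits free. A sketch of composing the value, assuming KVM_MSR_ENABLED is bit 0 as with the other KVM paravirt MSRs:

#include <stdint.h>

#define MSR_KVM_PV_EOI_EN	0x4b564d04
#define KVM_MSR_ENABLED		1ULL	/* assumed: bit 0, as elsewhere */

/* Value the guest writes: __pa(&kvm_apic_eoi) | KVM_MSR_ENABLED. */
static uint64_t pv_eoi_msr_value(uint64_t flag_word_pa)
{
	return flag_word_pa | KVM_MSR_ENABLED;
}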
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index f8ab3eaad128..aea1d1d848c7 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h | |||
@@ -44,6 +44,7 @@ | |||
44 | */ | 44 | */ |
45 | #define X86_CR3_PWT 0x00000008 /* Page Write Through */ | 45 | #define X86_CR3_PWT 0x00000008 /* Page Write Through */ |
46 | #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ | 46 | #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ |
47 | #define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ | ||
47 | 48 | ||
48 | /* | 49 | /* |
49 | * Intel CPU features in CR4 | 50 | * Intel CPU features in CR4 |
@@ -61,6 +62,7 @@ | |||
61 | #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ | 62 | #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ |
62 | #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ | 63 | #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ |
63 | #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ | 64 | #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ |
65 | #define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ | ||
64 | #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ | 66 | #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ |
65 | #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ | 67 | #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ |
66 | 68 | ||
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce9..74fcb963595b 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -60,6 +60,7 @@ | |||
60 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | 60 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 |
61 | #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 | 61 | #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 |
62 | #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 | 62 | #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 |
63 | #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 | ||
63 | 64 | ||
64 | 65 | ||
65 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 66 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
@@ -281,6 +282,7 @@ enum vmcs_field { | |||
281 | #define EXIT_REASON_EPT_MISCONFIG 49 | 282 | #define EXIT_REASON_EPT_MISCONFIG 49 |
282 | #define EXIT_REASON_WBINVD 54 | 283 | #define EXIT_REASON_WBINVD 54 |
283 | #define EXIT_REASON_XSETBV 55 | 284 | #define EXIT_REASON_XSETBV 55 |
285 | #define EXIT_REASON_INVPCID 58 | ||
284 | 286 | ||
285 | /* | 287 | /* |
286 | * Interruption-information format | 288 | * Interruption-information format |
@@ -404,6 +406,7 @@ enum vmcs_field { | |||
404 | #define VMX_EPTP_WB_BIT (1ull << 14) | 406 | #define VMX_EPTP_WB_BIT (1ull << 14) |
405 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) | 407 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) |
406 | #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) | 408 | #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) |
409 | #define VMX_EPT_AD_BIT (1ull << 21) | ||
407 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) | 410 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) |
408 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) | 411 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) |
409 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) | 412 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) |
@@ -415,11 +418,14 @@ enum vmcs_field { | |||
415 | #define VMX_EPT_MAX_GAW 0x4 | 418 | #define VMX_EPT_MAX_GAW 0x4 |
416 | #define VMX_EPT_MT_EPTE_SHIFT 3 | 419 | #define VMX_EPT_MT_EPTE_SHIFT 3 |
417 | #define VMX_EPT_GAW_EPTP_SHIFT 3 | 420 | #define VMX_EPT_GAW_EPTP_SHIFT 3 |
421 | #define VMX_EPT_AD_ENABLE_BIT (1ull << 6) | ||
418 | #define VMX_EPT_DEFAULT_MT 0x6ull | 422 | #define VMX_EPT_DEFAULT_MT 0x6ull |
419 | #define VMX_EPT_READABLE_MASK 0x1ull | 423 | #define VMX_EPT_READABLE_MASK 0x1ull |
420 | #define VMX_EPT_WRITABLE_MASK 0x2ull | 424 | #define VMX_EPT_WRITABLE_MASK 0x2ull |
421 | #define VMX_EPT_EXECUTABLE_MASK 0x4ull | 425 | #define VMX_EPT_EXECUTABLE_MASK 0x4ull |
422 | #define VMX_EPT_IPAT_BIT (1ull << 6) | 426 | #define VMX_EPT_IPAT_BIT (1ull << 6) |
427 | #define VMX_EPT_ACCESS_BIT (1ull << 8) | ||
428 | #define VMX_EPT_DIRTY_BIT (1ull << 9) | ||
423 | 429 | ||
424 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul | 430 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul |
425 | 431 | ||
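EPT accessed/dirty tracking is enabled per EPT pointer via bit 6; once it is on, the CPU sets bit 8 (accessed) and bit 9 (dirty) in the EPT entries it walks. A sketch of testing those bits in an entry:

#include <stdint.h>
#include <stdbool.h>

#define VMX_EPT_AD_ENABLE_BIT	(1ULL << 6)	/* in the EPTP */
#define VMX_EPT_ACCESS_BIT	(1ULL << 8)	/* per EPT entry */
#define VMX_EPT_DIRTY_BIT	(1ULL << 9)

static bool ept_entry_accessed(uint64_t epte) { return epte & VMX_EPT_ACCESS_BIT; }
static bool ept_entry_dirty(uint64_t epte)    { return epte & VMX_EPT_DIRTY_BIT; }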
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c421512ca5eb..98e24131ff3a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -2143,6 +2143,23 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, | |||
2143 | } | 2143 | } |
2144 | 2144 | ||
2145 | /* | 2145 | /* |
2146 | * Override the generic EOI implementation with an optimized version. | ||
2147 | * Only called during early boot when only one CPU is active and with | ||
2148 | * interrupts disabled, so we know this does not race with actual APIC driver | ||
2149 | * use. | ||
2150 | */ | ||
2151 | void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) | ||
2152 | { | ||
2153 | struct apic **drv; | ||
2154 | |||
2155 | for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { | ||
2156 | /* Should happen once for each apic */ | ||
2157 | WARN_ON((*drv)->eoi_write == eoi_write); | ||
2158 | (*drv)->eoi_write = eoi_write; | ||
2159 | } | ||
2160 | } | ||
2161 | |||
2162 | /* | ||
2146 | * Power management | 2163 | * Power management |
2147 | */ | 2164 | */ |
2148 | #ifdef CONFIG_PM | 2165 | #ifdef CONFIG_PM |
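apic_set_eoi_write() patches the eoi_write callback of every registered APIC driver, and the WARN_ON guards against installing the same override twice. Usage is exactly one early call from a paravirtualized guest, as the kvm.c hunk further down does:

/* From kvm_guest_init() below: install the PV EOI fast path. */
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
	apic_set_eoi_write(kvm_guest_apic_eoi_write);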
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64fb0743..a8f8fa9769d6 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c | |||
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = | |||
37 | #endif | 37 | #endif |
38 | &x86_hyper_vmware, | 38 | &x86_hyper_vmware, |
39 | &x86_hyper_ms_hyperv, | 39 | &x86_hyper_ms_hyperv, |
40 | #ifdef CONFIG_KVM_GUEST | ||
41 | &x86_hyper_kvm, | ||
42 | #endif | ||
40 | }; | 43 | }; |
41 | 44 | ||
42 | const struct hypervisor_x86 *x86_hyper; | 45 | const struct hypervisor_x86 *x86_hyper; |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe8..c1d61ee4b4f1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -39,6 +39,9 @@ | |||
39 | #include <asm/desc.h> | 39 | #include <asm/desc.h> |
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | #include <asm/idle.h> | 41 | #include <asm/idle.h> |
42 | #include <asm/apic.h> | ||
43 | #include <asm/apicdef.h> | ||
44 | #include <asm/hypervisor.h> | ||
42 | 45 | ||
43 | static int kvmapf = 1; | 46 | static int kvmapf = 1; |
44 | 47 | ||
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void) | |||
283 | cpu, __pa(st)); | 286 | cpu, __pa(st)); |
284 | } | 287 | } |
285 | 288 | ||
289 | static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; | ||
290 | |||
291 | static void kvm_guest_apic_eoi_write(u32 reg, u32 val) | ||
292 | { | ||
293 | /** | ||
294 | * This relies on __test_and_clear_bit to modify the memory | ||
295 | * in a way that is atomic with respect to the local CPU. | ||
296 | * The hypervisor only accesses this memory from the local CPU so | ||
297 | * there's no need for lock or memory barriers. | ||
298 | * An optimization barrier is implied in apic write. | ||
299 | */ | ||
300 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) | ||
301 | return; | ||
302 | apic_write(APIC_EOI, APIC_EOI_ACK); | ||
303 | } | ||
304 | |||
286 | void __cpuinit kvm_guest_cpu_init(void) | 305 | void __cpuinit kvm_guest_cpu_init(void) |
287 | { | 306 | { |
288 | if (!kvm_para_available()) | 307 | if (!kvm_para_available()) |
@@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void) | |||
300 | smp_processor_id()); | 319 | smp_processor_id()); |
301 | } | 320 | } |
302 | 321 | ||
322 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { | ||
323 | unsigned long pa; | ||
324 | /* Size alignment is implied but just to make it explicit. */ | ||
325 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); | ||
326 | __get_cpu_var(kvm_apic_eoi) = 0; | ||
327 | pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; | ||
328 | wrmsrl(MSR_KVM_PV_EOI_EN, pa); | ||
329 | } | ||
330 | |||
303 | if (has_steal_clock) | 331 | if (has_steal_clock) |
304 | kvm_register_steal_time(); | 332 | kvm_register_steal_time(); |
305 | } | 333 | } |
306 | 334 | ||
307 | static void kvm_pv_disable_apf(void *unused) | 335 | static void kvm_pv_disable_apf(void) |
308 | { | 336 | { |
309 | if (!__get_cpu_var(apf_reason).enabled) | 337 | if (!__get_cpu_var(apf_reason).enabled) |
310 | return; | 338 | return; |
@@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused) | |||
316 | smp_processor_id()); | 344 | smp_processor_id()); |
317 | } | 345 | } |
318 | 346 | ||
347 | static void kvm_pv_guest_cpu_reboot(void *unused) | ||
348 | { | ||
349 | /* | ||
350 | * We disable PV EOI before we load a new kernel by kexec, | ||
351 | * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. | ||
353 | * The new kernel can re-enable it when it boots. | ||
353 | */ | ||
354 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | ||
355 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | ||
356 | kvm_pv_disable_apf(); | ||
357 | } | ||
358 | |||
319 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | 359 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
320 | unsigned long code, void *unused) | 360 | unsigned long code, void *unused) |
321 | { | 361 | { |
322 | if (code == SYS_RESTART) | 362 | if (code == SYS_RESTART) |
323 | on_each_cpu(kvm_pv_disable_apf, NULL, 1); | 363 | on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); |
324 | return NOTIFY_DONE; | 364 | return NOTIFY_DONE; |
325 | } | 365 | } |
326 | 366 | ||
@@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) | |||
371 | static void kvm_guest_cpu_offline(void *dummy) | 411 | static void kvm_guest_cpu_offline(void *dummy) |
372 | { | 412 | { |
373 | kvm_disable_steal_time(); | 413 | kvm_disable_steal_time(); |
374 | kvm_pv_disable_apf(NULL); | 414 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
415 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | ||
416 | kvm_pv_disable_apf(); | ||
375 | apf_task_wake_all(); | 417 | apf_task_wake_all(); |
376 | } | 418 | } |
377 | 419 | ||
@@ -424,6 +466,9 @@ void __init kvm_guest_init(void) | |||
424 | pv_time_ops.steal_clock = kvm_steal_clock; | 466 | pv_time_ops.steal_clock = kvm_steal_clock; |
425 | } | 467 | } |
426 | 468 | ||
469 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | ||
470 | apic_set_eoi_write(kvm_guest_apic_eoi_write); | ||
471 | |||
427 | #ifdef CONFIG_SMP | 472 | #ifdef CONFIG_SMP |
428 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 473 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
429 | register_cpu_notifier(&kvm_cpu_notifier); | 474 | register_cpu_notifier(&kvm_cpu_notifier); |
@@ -432,6 +477,19 @@ void __init kvm_guest_init(void) | |||
432 | #endif | 477 | #endif |
433 | } | 478 | } |
434 | 479 | ||
480 | static bool __init kvm_detect(void) | ||
481 | { | ||
482 | if (!kvm_para_available()) | ||
483 | return false; | ||
484 | return true; | ||
485 | } | ||
486 | |||
487 | const struct hypervisor_x86 x86_hyper_kvm __refconst = { | ||
488 | .name = "KVM", | ||
489 | .detect = kvm_detect, | ||
490 | }; | ||
491 | EXPORT_SYMBOL_GPL(x86_hyper_kvm); | ||
492 | |||
435 | static __init int activate_jump_labels(void) | 493 | static __init int activate_jump_labels(void) |
436 | { | 494 | { |
437 | if (has_steal_clock) { | 495 | if (has_steal_clock) { |
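Taken together, the PV-EOI pieces form a small handshake: the host sets KVM_PV_EOI_BIT in the shared word when the pending interrupt needs no explicit EOI, and the guest's EOI path clears that bit instead of touching the APIC, avoiding an exit. A userspace model of the handshake under those assumptions:

#include <stdio.h>

/* Model only: the real flag word is per-CPU and shared with the
 * host through MSR_KVM_PV_EOI_EN. */
static unsigned long kvm_apic_eoi;

static void host_inject(int eoi_avoidable)
{
	if (eoi_avoidable)
		kvm_apic_eoi |= 1UL;	/* KVM_PV_EOI_BIT */
}

static void guest_eoi(void)
{
	if (kvm_apic_eoi & 1UL) {
		kvm_apic_eoi &= ~1UL;	/* fast path, no APIC access */
		puts("EOI completed in memory");
		return;
	}
	puts("apic_write(APIC_EOI, APIC_EOI_ACK)");	/* slow path */
}

int main(void)
{
	host_inject(1);
	guest_eoi();	/* fast path */
	host_inject(0);
	guest_eoi();	/* falls back to the APIC write */
	return 0;
}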
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839fb..0595f1397b7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c | |||
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
201 | unsigned f_lm = 0; | 201 | unsigned f_lm = 0; |
202 | #endif | 202 | #endif |
203 | unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; | 203 | unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; |
204 | unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; | ||
204 | 205 | ||
205 | /* cpuid 1.edx */ | 206 | /* cpuid 1.edx */ |
206 | const u32 kvm_supported_word0_x86_features = | 207 | const u32 kvm_supported_word0_x86_features = |
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
228 | 0 /* DS-CPL, VMX, SMX, EST */ | | 229 | 0 /* DS-CPL, VMX, SMX, EST */ | |
229 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | 230 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | |
230 | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | | 231 | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | |
231 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 232 | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | |
232 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 233 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
233 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | | 234 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | |
234 | F(F16C) | F(RDRAND); | 235 | F(F16C) | F(RDRAND); |
@@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
248 | /* cpuid 7.0.ebx */ | 249 | /* cpuid 7.0.ebx */ |
249 | const u32 kvm_supported_word9_x86_features = | 250 | const u32 kvm_supported_word9_x86_features = |
250 | F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | | 251 | F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | |
251 | F(BMI2) | F(ERMS) | F(RTM); | 252 | F(BMI2) | F(ERMS) | f_invpcid | F(RTM); |
252 | 253 | ||
253 | /* all calls to cpuid_count() should be made on the same cpu */ | 254 | /* all calls to cpuid_count() should be made on the same cpu */ |
254 | get_cpu(); | 255 | get_cpu(); |
@@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
409 | (1 << KVM_FEATURE_NOP_IO_DELAY) | | 410 | (1 << KVM_FEATURE_NOP_IO_DELAY) | |
410 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | 411 | (1 << KVM_FEATURE_CLOCKSOURCE2) | |
411 | (1 << KVM_FEATURE_ASYNC_PF) | | 412 | (1 << KVM_FEATURE_ASYNC_PF) | |
413 | (1 << KVM_FEATURE_PV_EOI) | | ||
412 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | 414 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); |
413 | 415 | ||
414 | if (sched_info_on()) | 416 | if (sched_info_on()) |
@@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, | |||
639 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); | 641 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); |
640 | } | 642 | } |
641 | 643 | ||
642 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | 644 | void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) |
643 | { | 645 | { |
644 | u32 function, index; | 646 | u32 function = *eax, index = *ecx; |
645 | struct kvm_cpuid_entry2 *best; | 647 | struct kvm_cpuid_entry2 *best; |
646 | 648 | ||
647 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
648 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
649 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); | ||
650 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); | ||
651 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); | ||
652 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); | ||
653 | best = kvm_find_cpuid_entry(vcpu, function, index); | 649 | best = kvm_find_cpuid_entry(vcpu, function, index); |
654 | 650 | ||
655 | if (!best) | 651 | if (!best) |
656 | best = check_cpuid_limit(vcpu, function, index); | 652 | best = check_cpuid_limit(vcpu, function, index); |
657 | 653 | ||
658 | if (best) { | 654 | if (best) { |
659 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); | 655 | *eax = best->eax; |
660 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); | 656 | *ebx = best->ebx; |
661 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); | 657 | *ecx = best->ecx; |
662 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); | 658 | *edx = best->edx; |
663 | } | 659 | } else |
660 | *eax = *ebx = *ecx = *edx = 0; | ||
661 | } | ||
662 | |||
663 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
664 | { | ||
665 | u32 function, eax, ebx, ecx, edx; | ||
666 | |||
667 | function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
668 | ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
669 | kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); | ||
670 | kvm_register_write(vcpu, VCPU_REGS_RAX, eax); | ||
671 | kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); | ||
672 | kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); | ||
673 | kvm_register_write(vcpu, VCPU_REGS_RDX, edx); | ||
664 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 674 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
665 | trace_kvm_cpuid(function, | 675 | trace_kvm_cpuid(function, eax, ebx, ecx, edx); |
666 | kvm_register_read(vcpu, VCPU_REGS_RAX), | ||
667 | kvm_register_read(vcpu, VCPU_REGS_RBX), | ||
668 | kvm_register_read(vcpu, VCPU_REGS_RCX), | ||
669 | kvm_register_read(vcpu, VCPU_REGS_RDX)); | ||
670 | } | 676 | } |
671 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 677 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
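The refactor splits the CPUID table lookup (kvm_cpuid) from the register shuffling in kvm_emulate_cpuid(), and the lookup now always fills all four outputs, zeroing them when no entry matches. Call pattern, as a fragment:

	u32 eax = 1, ebx, ecx = 0, edx;

	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
	/* eax..edx now hold leaf 1, or all zeroes if the leaf is absent,
	 * so callers such as the emulator need no return-value check. */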
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 26d1fb437eb5..a10e46016851 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h | |||
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | |||
17 | int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | 17 | int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, |
18 | struct kvm_cpuid2 *cpuid, | 18 | struct kvm_cpuid2 *cpuid, |
19 | struct kvm_cpuid_entry2 __user *entries); | 19 | struct kvm_cpuid_entry2 __user *entries); |
20 | void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); | ||
20 | 21 | ||
21 | 22 | ||
22 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | 23 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) |
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) | |||
51 | return best && (best->ecx & bit(X86_FEATURE_OSVW)); | 52 | return best && (best->ecx & bit(X86_FEATURE_OSVW)); |
52 | } | 53 | } |
53 | 54 | ||
55 | static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) | ||
56 | { | ||
57 | struct kvm_cpuid_entry2 *best; | ||
58 | |||
59 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
60 | return best && (best->ecx & bit(X86_FEATURE_PCID)); | ||
61 | } | ||
62 | |||
54 | #endif | 63 | #endif |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f95d242ee9f7..97d9a9914ba8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, | |||
433 | return ctxt->ops->intercept(ctxt, &info, stage); | 433 | return ctxt->ops->intercept(ctxt, &info, stage); |
434 | } | 434 | } |
435 | 435 | ||
436 | static void assign_masked(ulong *dest, ulong src, ulong mask) | ||
437 | { | ||
438 | *dest = (*dest & ~mask) | (src & mask); | ||
439 | } | ||
440 | |||
436 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) | 441 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) |
437 | { | 442 | { |
438 | return (1UL << (ctxt->ad_bytes << 3)) - 1; | 443 | return (1UL << (ctxt->ad_bytes << 3)) - 1; |
439 | } | 444 | } |
440 | 445 | ||
446 | static ulong stack_mask(struct x86_emulate_ctxt *ctxt) | ||
447 | { | ||
448 | u16 sel; | ||
449 | struct desc_struct ss; | ||
450 | |||
451 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
452 | return ~0UL; | ||
453 | ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); | ||
454 | return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ | ||
455 | } | ||
456 | |||
457 | static int stack_size(struct x86_emulate_ctxt *ctxt) | ||
458 | { | ||
459 | return (__fls(stack_mask(ctxt)) + 1) >> 3; | ||
460 | } | ||
461 | |||
441 | /* Access/update address held in a register, based on addressing mode. */ | 462 | /* Access/update address held in a register, based on addressing mode. */ |
442 | static inline unsigned long | 463 | static inline unsigned long |
443 | address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) | 464 | address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) |
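stack_mask() reads the width of the stack pointer from the SS descriptor: in long mode everything is 64-bit, otherwise ~0U >> ((ss.d ^ 1) * 16) yields 0xffff for a 16-bit stack segment and 0xffffffff for a 32-bit one, and stack_size() turns the mask into a byte count. A worked check of both expressions:

#include <stdio.h>

int main(void)
{
	for (int d = 0; d <= 1; d++) {
		unsigned long mask = ~0U >> ((d ^ 1) * 16);
		int bytes = 0;

		/* portable stand-in for (__fls(mask) + 1) >> 3 */
		for (unsigned long m = mask; m; m >>= 8)
			bytes++;

		/* prints: ss.d=0 -> 0xffff, 2 bytes
		 *         ss.d=1 -> 0xffffffff, 4 bytes */
		printf("ss.d=%d -> %#lx, %d bytes\n", d, mask, bytes);
	}
	return 0;
}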
@@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, | |||
958 | op->orig_val = op->val; | 979 | op->orig_val = op->val; |
959 | } | 980 | } |
960 | 981 | ||
982 | static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) | ||
983 | { | ||
984 | if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) | ||
985 | ctxt->modrm_seg = VCPU_SREG_SS; | ||
986 | } | ||
987 | |||
961 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | 988 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, |
962 | struct operand *op) | 989 | struct operand *op) |
963 | { | 990 | { |
@@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
1061 | 1088 | ||
1062 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) | 1089 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) |
1063 | modrm_ea += insn_fetch(s32, ctxt); | 1090 | modrm_ea += insn_fetch(s32, ctxt); |
1064 | else | 1091 | else { |
1065 | modrm_ea += ctxt->regs[base_reg]; | 1092 | modrm_ea += ctxt->regs[base_reg]; |
1093 | adjust_modrm_seg(ctxt, base_reg); | ||
1094 | } | ||
1066 | if (index_reg != 4) | 1095 | if (index_reg != 4) |
1067 | modrm_ea += ctxt->regs[index_reg] << scale; | 1096 | modrm_ea += ctxt->regs[index_reg] << scale; |
1068 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { | 1097 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { |
1069 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 1098 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
1070 | ctxt->rip_relative = 1; | 1099 | ctxt->rip_relative = 1; |
1071 | } else | 1100 | } else { |
1072 | modrm_ea += ctxt->regs[ctxt->modrm_rm]; | 1101 | base_reg = ctxt->modrm_rm; |
1102 | modrm_ea += ctxt->regs[base_reg]; | ||
1103 | adjust_modrm_seg(ctxt, base_reg); | ||
1104 | } | ||
1073 | switch (ctxt->modrm_mod) { | 1105 | switch (ctxt->modrm_mod) { |
1074 | case 0: | 1106 | case 0: |
1075 | if (ctxt->modrm_rm == 5) | 1107 | if (ctxt->modrm_rm == 5) |
@@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | |||
1264 | 1296 | ||
1265 | /* allowed just for 8-byte segments */ | 1297 | /* allowed just for 8-byte segments */
1266 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1298 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1267 | u16 selector, struct desc_struct *desc) | 1299 | u16 selector, struct desc_struct *desc, |
1300 | ulong *desc_addr_p) | ||
1268 | { | 1301 | { |
1269 | struct desc_ptr dt; | 1302 | struct desc_ptr dt; |
1270 | u16 index = selector >> 3; | 1303 | u16 index = selector >> 3; |
@@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1275 | if (dt.size < index * 8 + 7) | 1308 | if (dt.size < index * 8 + 7) |
1276 | return emulate_gp(ctxt, selector & 0xfffc); | 1309 | return emulate_gp(ctxt, selector & 0xfffc); |
1277 | 1310 | ||
1278 | addr = dt.address + index * 8; | 1311 | *desc_addr_p = addr = dt.address + index * 8; |
1279 | return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, | 1312 | return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, |
1280 | &ctxt->exception); | 1313 | &ctxt->exception); |
1281 | } | 1314 | } |
@@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1302 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1335 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1303 | u16 selector, int seg) | 1336 | u16 selector, int seg) |
1304 | { | 1337 | { |
1305 | struct desc_struct seg_desc; | 1338 | struct desc_struct seg_desc, old_desc; |
1306 | u8 dpl, rpl, cpl; | 1339 | u8 dpl, rpl, cpl; |
1307 | unsigned err_vec = GP_VECTOR; | 1340 | unsigned err_vec = GP_VECTOR; |
1308 | u32 err_code = 0; | 1341 | u32 err_code = 0; |
1309 | bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ | 1342 | bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ |
1343 | ulong desc_addr; | ||
1310 | int ret; | 1344 | int ret; |
1311 | 1345 | ||
1312 | memset(&seg_desc, 0, sizeof seg_desc); | 1346 | memset(&seg_desc, 0, sizeof seg_desc); |
@@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1324 | goto load; | 1358 | goto load; |
1325 | } | 1359 | } |
1326 | 1360 | ||
1327 | /* NULL selector is not valid for TR, CS and SS */ | 1361 | rpl = selector & 3; |
1328 | if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) | 1362 | cpl = ctxt->ops->cpl(ctxt); |
1363 | |||
1364 | /* NULL selector is not valid for TR, CS and SS (except for long mode) */ | ||
1365 | if ((seg == VCPU_SREG_CS | ||
1366 | || (seg == VCPU_SREG_SS | ||
1367 | && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) | ||
1368 | || seg == VCPU_SREG_TR) | ||
1329 | && null_selector) | 1369 | && null_selector) |
1330 | goto exception; | 1370 | goto exception; |
1331 | 1371 | ||
@@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1336 | if (null_selector) /* for NULL selector skip all following checks */ | 1376 | if (null_selector) /* for NULL selector skip all following checks */ |
1337 | goto load; | 1377 | goto load; |
1338 | 1378 | ||
1339 | ret = read_segment_descriptor(ctxt, selector, &seg_desc); | 1379 | ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); |
1340 | if (ret != X86EMUL_CONTINUE) | 1380 | if (ret != X86EMUL_CONTINUE) |
1341 | return ret; | 1381 | return ret; |
1342 | 1382 | ||
@@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1352 | goto exception; | 1392 | goto exception; |
1353 | } | 1393 | } |
1354 | 1394 | ||
1355 | rpl = selector & 3; | ||
1356 | dpl = seg_desc.dpl; | 1395 | dpl = seg_desc.dpl; |
1357 | cpl = ctxt->ops->cpl(ctxt); | ||
1358 | 1396 | ||
1359 | switch (seg) { | 1397 | switch (seg) { |
1360 | case VCPU_SREG_SS: | 1398 | case VCPU_SREG_SS: |
@@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1384 | case VCPU_SREG_TR: | 1422 | case VCPU_SREG_TR: |
1385 | if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) | 1423 | if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) |
1386 | goto exception; | 1424 | goto exception; |
1425 | old_desc = seg_desc; | ||
1426 | seg_desc.type |= 2; /* busy */ | ||
1427 | ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, | ||
1428 | sizeof(seg_desc), &ctxt->exception); | ||
1429 | if (ret != X86EMUL_CONTINUE) | ||
1430 | return ret; | ||
1387 | break; | 1431 | break; |
1388 | case VCPU_SREG_LDTR: | 1432 | case VCPU_SREG_LDTR: |
1389 | if (seg_desc.s || seg_desc.type != 2) | 1433 | if (seg_desc.s || seg_desc.type != 2) |
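Loading TR must also mark the descriptor busy in the descriptor table, so the emulator now compare-exchanges the descriptor it just read: the cmpxchg both writes the busy form back and detects a concurrent update of the entry. The type transition itself is setting bit 1:

/* Available TSS -> busy TSS: type 1 -> 3 (16-bit), 9 -> 11 (32-bit). */
seg_desc.type |= 2;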
@@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt) | |||
1474 | return X86EMUL_CONTINUE; | 1518 | return X86EMUL_CONTINUE; |
1475 | } | 1519 | } |
1476 | 1520 | ||
1477 | static int em_push(struct x86_emulate_ctxt *ctxt) | 1521 | static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) |
1478 | { | 1522 | { |
1479 | struct segmented_address addr; | 1523 | struct segmented_address addr; |
1480 | 1524 | ||
1481 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); | 1525 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); |
1482 | addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); | 1526 | addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); |
1483 | addr.seg = VCPU_SREG_SS; | 1527 | addr.seg = VCPU_SREG_SS; |
1484 | 1528 | ||
1529 | return segmented_write(ctxt, addr, data, bytes); | ||
1530 | } | ||
1531 | |||
1532 | static int em_push(struct x86_emulate_ctxt *ctxt) | ||
1533 | { | ||
1485 | /* Disable writeback. */ | 1534 | /* Disable writeback. */ |
1486 | ctxt->dst.type = OP_NONE; | 1535 | ctxt->dst.type = OP_NONE; |
1487 | return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); | 1536 | return push(ctxt, &ctxt->src.val, ctxt->op_bytes); |
1488 | } | 1537 | } |
1489 | 1538 | ||
1490 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1539 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
@@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) | |||
1556 | return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); | 1605 | return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); |
1557 | } | 1606 | } |
1558 | 1607 | ||
1608 | static int em_enter(struct x86_emulate_ctxt *ctxt) | ||
1609 | { | ||
1610 | int rc; | ||
1611 | unsigned frame_size = ctxt->src.val; | ||
1612 | unsigned nesting_level = ctxt->src2.val & 31; | ||
1613 | |||
1614 | if (nesting_level) | ||
1615 | return X86EMUL_UNHANDLEABLE; | ||
1616 | |||
1617 | rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); | ||
1618 | if (rc != X86EMUL_CONTINUE) | ||
1619 | return rc; | ||
1620 | assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], | ||
1621 | stack_mask(ctxt)); | ||
1622 | assign_masked(&ctxt->regs[VCPU_REGS_RSP], | ||
1623 | ctxt->regs[VCPU_REGS_RSP] - frame_size, | ||
1624 | stack_mask(ctxt)); | ||
1625 | return X86EMUL_CONTINUE; | ||
1626 | } | ||
1627 | |||
1628 | static int em_leave(struct x86_emulate_ctxt *ctxt) | ||
1629 | { | ||
1630 | assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], | ||
1631 | stack_mask(ctxt)); | ||
1632 | return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); | ||
1633 | } | ||
1634 | |||
1559 | static int em_push_sreg(struct x86_emulate_ctxt *ctxt) | 1635 | static int em_push_sreg(struct x86_emulate_ctxt *ctxt) |
1560 | { | 1636 | { |
1561 | int seg = ctxt->src2.val; | 1637 | int seg = ctxt->src2.val; |
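em_enter() handles only nesting level 0 (deeper levels return X86EMUL_UNHANDLEABLE), where ENTER size,0 amounts to push rBP; mov rBP, rSP; sub rSP, size, and em_leave() is the inverse. A small model of the register effects on a fake stack (frame size counted in slots here, not bytes):

#include <stdio.h>

static unsigned long stack[64], rsp = 64, rbp = 64;

static void enter(unsigned long frame_size)
{
	stack[--rsp] = rbp;	/* push rbp */
	rbp = rsp;		/* mov rbp, rsp */
	rsp -= frame_size;	/* sub rsp, frame_size */
}

static void leave(void)
{
	rsp = rbp;		/* mov rsp, rbp */
	rbp = stack[rsp++];	/* pop rbp */
}

int main(void)
{
	enter(4);
	printf("after enter: rsp=%lu rbp=%lu\n", rsp, rbp);	/* 59, 63 */
	leave();
	printf("after leave: rsp=%lu rbp=%lu\n", rsp, rbp);	/* 64, 64 */
	return 0;
}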
@@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) | |||
1993 | u32 eax, ebx, ecx, edx; | 2069 | u32 eax, ebx, ecx, edx; |
1994 | 2070 | ||
1995 | eax = ecx = 0; | 2071 | eax = ecx = 0; |
1996 | return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) | 2072 | ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); |
1997 | && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx | 2073 | return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx |
1998 | && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx | 2074 | && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx |
1999 | && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; | 2075 | && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; |
2000 | } | 2076 | } |
@@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) | |||
2013 | 2089 | ||
2014 | eax = 0x00000000; | 2090 | eax = 0x00000000; |
2015 | ecx = 0x00000000; | 2091 | ecx = 0x00000000; |
2016 | if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { | 2092 | ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); |
2017 | /* | 2093 | /* |
2018 | * Intel ("GenuineIntel") | 2094 | * Intel ("GenuineIntel") |
2019 | * remark: Intel CPUs only support "syscall" in 64bit | 2095 | * remark: Intel CPUs only support "syscall" in 64bit |
2020 | * longmode. Also a 64bit guest with a | 2096 | * longmode. Also a 64bit guest with a |
2021 | * 32bit compat-app running will #UD! While this | 2097 | * 32bit compat-app running will #UD! While this |
2022 | * behaviour can be fixed (by emulating) to match the | 2098 | * behaviour can be fixed (by emulating) to match the |
2023 | * AMD response, AMD CPUs can't behave like Intel. | 2099 | * AMD response, AMD CPUs can't behave like Intel. |
2024 | */ | 2100 | */ |
2025 | if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && | 2101 | if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && |
2026 | ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && | 2102 | ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && |
2027 | edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) | 2103 | edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) |
2028 | return false; | 2104 | return false; |
2029 | 2105 | ||
2030 | /* AMD ("AuthenticAMD") */ | 2106 | /* AMD ("AuthenticAMD") */ |
2031 | if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && | 2107 | if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && |
2032 | ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && | 2108 | ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && |
2033 | edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) | 2109 | edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) |
2034 | return true; | 2110 | return true; |
2035 | 2111 | ||
2036 | /* AMD ("AMDisbetter!") */ | 2112 | /* AMD ("AMDisbetter!") */ |
2037 | if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && | 2113 | if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && |
2038 | ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && | 2114 | ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && |
2039 | edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) | 2115 | edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) |
2040 | return true; | 2116 | return true; |
2041 | } | ||
2042 | 2117 | ||
2043 | /* default: (not Intel, not AMD), apply Intel's stricter rules... */ | 2118 | /* default: (not Intel, not AMD), apply Intel's stricter rules... */ |
2044 | return false; | 2119 | return false; |
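The vendor-ID constants compared above are nothing more than the 12-byte CPUID leaf 0 vendor string packed little-endian into EBX, EDX, ECX. A quick stand-alone check (illustration only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *v = "GenuineIntel"; /* EBX="Genu" EDX="ineI" ECX="ntel" */
        uint32_t ebx, edx, ecx;

        memcpy(&ebx, v, 4);
        memcpy(&edx, v + 4, 4);
        memcpy(&ecx, v + 8, 4);
        /* prints ebx=0x756e6547 edx=0x49656e69 ecx=0x6c65746e, matching
         * the X86EMUL_CPUID_VENDOR_GenuineIntel_* constants */
        printf("ebx=%#x edx=%#x ecx=%#x\n", ebx, edx, ecx);
        return 0;
    }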
@@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2547 | ulong old_tss_base = | 2622 | ulong old_tss_base = |
2548 | ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); | 2623 | ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); |
2549 | u32 desc_limit; | 2624 | u32 desc_limit; |
2625 | ulong desc_addr; | ||
2550 | 2626 | ||
2551 | /* FIXME: old_tss_base == ~0 ? */ | 2627 | /* FIXME: old_tss_base == ~0 ? */ |
2552 | 2628 | ||
2553 | ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); | 2629 | ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); |
2554 | if (ret != X86EMUL_CONTINUE) | 2630 | if (ret != X86EMUL_CONTINUE) |
2555 | return ret; | 2631 | return ret; |
2556 | ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); | 2632 | ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); |
2557 | if (ret != X86EMUL_CONTINUE) | 2633 | if (ret != X86EMUL_CONTINUE) |
2558 | return ret; | 2634 | return ret; |
2559 | 2635 | ||
@@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) | |||
2948 | return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); | 3024 | return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); |
2949 | } | 3025 | } |
2950 | 3026 | ||
3027 | static int em_lldt(struct x86_emulate_ctxt *ctxt) | ||
3028 | { | ||
3029 | u16 sel = ctxt->src.val; | ||
3030 | |||
3031 | /* Disable writeback. */ | ||
3032 | ctxt->dst.type = OP_NONE; | ||
3033 | return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); | ||
3034 | } | ||
3035 | |||
3036 | static int em_ltr(struct x86_emulate_ctxt *ctxt) | ||
3037 | { | ||
3038 | u16 sel = ctxt->src.val; | ||
3039 | |||
3040 | /* Disable writeback. */ | ||
3041 | ctxt->dst.type = OP_NONE; | ||
3042 | return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); | ||
3043 | } | ||
3044 | |||
2951 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) | 3045 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) |
2952 | { | 3046 | { |
2953 | int rc; | 3047 | int rc; |
@@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) | |||
2989 | return X86EMUL_CONTINUE; | 3083 | return X86EMUL_CONTINUE; |
2990 | } | 3084 | } |
2991 | 3085 | ||
3086 | static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, | ||
3087 | void (*get)(struct x86_emulate_ctxt *ctxt, | ||
3088 | struct desc_ptr *ptr)) | ||
3089 | { | ||
3090 | struct desc_ptr desc_ptr; | ||
3091 | |||
3092 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
3093 | ctxt->op_bytes = 8; | ||
3094 | get(ctxt, &desc_ptr); | ||
3095 | if (ctxt->op_bytes == 2) { | ||
3096 | ctxt->op_bytes = 4; | ||
3097 | desc_ptr.address &= 0x00ffffff; | ||
3098 | } | ||
3099 | /* Disable writeback. */ | ||
3100 | ctxt->dst.type = OP_NONE; | ||
3101 | return segmented_write(ctxt, ctxt->dst.addr.mem, | ||
3102 | &desc_ptr, 2 + ctxt->op_bytes); | ||
3103 | } | ||
3104 | |||
3105 | static int em_sgdt(struct x86_emulate_ctxt *ctxt) | ||
3106 | { | ||
3107 | return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); | ||
3108 | } | ||
3109 | |||
3110 | static int em_sidt(struct x86_emulate_ctxt *ctxt) | ||
3111 | { | ||
3112 | return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); | ||
3113 | } | ||
3114 | |||
2992 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) | 3115 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) |
2993 | { | 3116 | { |
2994 | struct desc_ptr desc_ptr; | 3117 | struct desc_ptr desc_ptr; |
2995 | int rc; | 3118 | int rc; |
2996 | 3119 | ||
3120 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
3121 | ctxt->op_bytes = 8; | ||
2997 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, | 3122 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
2998 | &desc_ptr.size, &desc_ptr.address, | 3123 | &desc_ptr.size, &desc_ptr.address, |
2999 | ctxt->op_bytes); | 3124 | ctxt->op_bytes); |
@@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) | |||
3021 | struct desc_ptr desc_ptr; | 3146 | struct desc_ptr desc_ptr; |
3022 | int rc; | 3147 | int rc; |
3023 | 3148 | ||
3149 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
3150 | ctxt->op_bytes = 8; | ||
3024 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, | 3151 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
3025 | &desc_ptr.size, &desc_ptr.address, | 3152 | &desc_ptr.size, &desc_ptr.address, |
3026 | ctxt->op_bytes); | 3153 | ctxt->op_bytes); |
@@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt) | |||
3143 | return X86EMUL_CONTINUE; | 3270 | return X86EMUL_CONTINUE; |
3144 | } | 3271 | } |
3145 | 3272 | ||
3273 | static int em_cpuid(struct x86_emulate_ctxt *ctxt) | ||
3274 | { | ||
3275 | u32 eax, ebx, ecx, edx; | ||
3276 | |||
3277 | eax = ctxt->regs[VCPU_REGS_RAX]; | ||
3278 | ecx = ctxt->regs[VCPU_REGS_RCX]; | ||
3279 | ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); | ||
3280 | ctxt->regs[VCPU_REGS_RAX] = eax; | ||
3281 | ctxt->regs[VCPU_REGS_RBX] = ebx; | ||
3282 | ctxt->regs[VCPU_REGS_RCX] = ecx; | ||
3283 | ctxt->regs[VCPU_REGS_RDX] = edx; | ||
3284 | return X86EMUL_CONTINUE; | ||
3285 | } | ||
3286 | |||
3287 | static int em_lahf(struct x86_emulate_ctxt *ctxt) | ||
3288 | { | ||
3289 | ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; | ||
3290 | ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; | ||
3291 | return X86EMUL_CONTINUE; | ||
3292 | } | ||
3293 | |||
3294 | static int em_bswap(struct x86_emulate_ctxt *ctxt) | ||
3295 | { | ||
3296 | switch (ctxt->op_bytes) { | ||
3297 | #ifdef CONFIG_X86_64 | ||
3298 | case 8: | ||
3299 | asm("bswap %0" : "+r"(ctxt->dst.val)); | ||
3300 | break; | ||
3301 | #endif | ||
3302 | default: | ||
3303 | asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); | ||
3304 | break; | ||
3305 | } | ||
3306 | return X86EMUL_CONTINUE; | ||
3307 | } | ||
3308 | |||
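Note that for a 16-bit operand size em_bswap() deliberately falls into the 32-bit default case (BSWAP with a 16-bit operand is architecturally undefined). A stand-alone sanity check of the semantics relied on above (x86-only illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t v32 = 0x12345678;
        uint64_t v64 = 0x1122334455667788ULL;

        asm("bswap %0" : "+r"(v32));
        assert(v32 == 0x78563412);

        asm("bswap %0" : "+r"(v64));    /* 64-bit form, as in case 8 above */
        assert(v64 == 0x8877665544332211ULL);
        return 0;
    }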
3146 | static bool valid_cr(int nr) | 3309 | static bool valid_cr(int nr) |
3147 | { | 3310 | { |
3148 | switch (nr) { | 3311 | switch (nr) { |
@@ -3424,14 +3587,14 @@ static struct opcode group5[] = { | |||
3424 | static struct opcode group6[] = { | 3587 | static struct opcode group6[] = { |
3425 | DI(Prot, sldt), | 3588 | DI(Prot, sldt), |
3426 | DI(Prot, str), | 3589 | DI(Prot, str), |
3427 | DI(Prot | Priv, lldt), | 3590 | II(Prot | Priv | SrcMem16, em_lldt, lldt), |
3428 | DI(Prot | Priv, ltr), | 3591 | II(Prot | Priv | SrcMem16, em_ltr, ltr), |
3429 | N, N, N, N, | 3592 | N, N, N, N, |
3430 | }; | 3593 | }; |
3431 | 3594 | ||
3432 | static struct group_dual group7 = { { | 3595 | static struct group_dual group7 = { { |
3433 | DI(Mov | DstMem | Priv, sgdt), | 3596 | II(Mov | DstMem | Priv, em_sgdt, sgdt), |
3434 | DI(Mov | DstMem | Priv, sidt), | 3597 | II(Mov | DstMem | Priv, em_sidt, sidt), |
3435 | II(SrcMem | Priv, em_lgdt, lgdt), | 3598 | II(SrcMem | Priv, em_lgdt, lgdt), |
3436 | II(SrcMem | Priv, em_lidt, lidt), | 3599 | II(SrcMem | Priv, em_lidt, lidt), |
3437 | II(SrcNone | DstMem | Mov, em_smsw, smsw), N, | 3600 | II(SrcNone | DstMem | Mov, em_smsw, smsw), N, |
@@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = { | |||
3538 | D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), | 3701 | D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), |
3539 | I(SrcImmFAddr | No64, em_call_far), N, | 3702 | I(SrcImmFAddr | No64, em_call_far), N, |
3540 | II(ImplicitOps | Stack, em_pushf, pushf), | 3703 | II(ImplicitOps | Stack, em_pushf, pushf), |
3541 | II(ImplicitOps | Stack, em_popf, popf), N, N, | 3704 | II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), |
3542 | /* 0xA0 - 0xA7 */ | 3705 | /* 0xA0 - 0xA7 */ |
3543 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), | 3706 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), |
3544 | I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), | 3707 | I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), |
@@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = { | |||
3561 | I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), | 3724 | I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), |
3562 | G(ByteOp, group11), G(0, group11), | 3725 | G(ByteOp, group11), G(0, group11), |
3563 | /* 0xC8 - 0xCF */ | 3726 | /* 0xC8 - 0xCF */ |
3564 | N, N, N, I(ImplicitOps | Stack, em_ret_far), | 3727 | I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), |
3728 | N, I(ImplicitOps | Stack, em_ret_far), | ||
3565 | D(ImplicitOps), DI(SrcImmByte, intn), | 3729 | D(ImplicitOps), DI(SrcImmByte, intn), |
3566 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), | 3730 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), |
3567 | /* 0xD0 - 0xD7 */ | 3731 | /* 0xD0 - 0xD7 */ |
@@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = { | |||
3635 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), | 3799 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), |
3636 | /* 0xA0 - 0xA7 */ | 3800 | /* 0xA0 - 0xA7 */ |
3637 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), | 3801 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), |
3638 | DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), | 3802 | II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), |
3639 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | 3803 | D(DstMem | SrcReg | Src2ImmByte | ModRM), |
3640 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, | 3804 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, |
3641 | /* 0xA8 - 0xAF */ | 3805 | /* 0xA8 - 0xAF */ |
@@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = { | |||
3658 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), | 3822 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), |
3659 | I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), | 3823 | I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), |
3660 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 3824 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
3661 | /* 0xC0 - 0xCF */ | 3825 | /* 0xC0 - 0xC7 */ |
3662 | D2bv(DstMem | SrcReg | ModRM | Lock), | 3826 | D2bv(DstMem | SrcReg | ModRM | Lock), |
3663 | N, D(DstMem | SrcReg | ModRM | Mov), | 3827 | N, D(DstMem | SrcReg | ModRM | Mov), |
3664 | N, N, N, GD(0, &group9), | 3828 | N, N, N, GD(0, &group9), |
3665 | N, N, N, N, N, N, N, N, | 3829 | /* 0xC8 - 0xCF */ |
3830 | X8(I(DstReg, em_bswap)), | ||
3666 | /* 0xD0 - 0xDF */ | 3831 | /* 0xD0 - 0xDF */ |
3667 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | 3832 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, |
3668 | /* 0xE0 - 0xEF */ | 3833 | /* 0xE0 - 0xEF */ |
@@ -4426,12 +4591,12 @@ twobyte_insn: | |||
4426 | break; | 4591 | break; |
4427 | case 0xb6 ... 0xb7: /* movzx */ | 4592 | case 0xb6 ... 0xb7: /* movzx */ |
4428 | ctxt->dst.bytes = ctxt->op_bytes; | 4593 | ctxt->dst.bytes = ctxt->op_bytes; |
4429 | ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val | 4594 | ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val |
4430 | : (u16) ctxt->src.val; | 4595 | : (u16) ctxt->src.val; |
4431 | break; | 4596 | break; |
4432 | case 0xbe ... 0xbf: /* movsx */ | 4597 | case 0xbe ... 0xbf: /* movsx */ |
4433 | ctxt->dst.bytes = ctxt->op_bytes; | 4598 | ctxt->dst.bytes = ctxt->op_bytes; |
4434 | ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : | 4599 | ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : |
4435 | (s16) ctxt->src.val; | 4600 | (s16) ctxt->src.val; |
4436 | break; | 4601 | break; |
4437 | case 0xc0 ... 0xc1: /* xadd */ | 4602 | case 0xc0 ... 0xc1: /* xadd */ |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2be..1df8fb9e1d5d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s) | |||
188 | pic_unlock(s); | 188 | pic_unlock(s); |
189 | } | 189 | } |
190 | 190 | ||
191 | int kvm_pic_set_irq(void *opaque, int irq, int level) | 191 | int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) |
192 | { | 192 | { |
193 | struct kvm_pic *s = opaque; | ||
194 | int ret = -1; | 193 | int ret = -1; |
195 | 194 | ||
196 | pic_lock(s); | 195 | pic_lock(s); |
197 | if (irq >= 0 && irq < PIC_NUM_PINS) { | 196 | if (irq >= 0 && irq < PIC_NUM_PINS) { |
198 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | 197 | int irq_level = __kvm_irq_line_state(&s->irq_states[irq], |
198 | irq_source_id, level); | ||
199 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); | ||
199 | pic_update_irq(s); | 200 | pic_update_irq(s); |
200 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, | 201 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, |
201 | s->pics[irq >> 3].imr, ret == 0); | 202 | s->pics[irq >> 3].imr, ret == 0); |
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) | |||
205 | return ret; | 206 | return ret; |
206 | } | 207 | } |
207 | 208 | ||
209 | void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) | ||
210 | { | ||
211 | int i; | ||
212 | |||
213 | pic_lock(s); | ||
214 | for (i = 0; i < PIC_NUM_PINS; i++) | ||
215 | __clear_bit(irq_source_id, &s->irq_states[i]); | ||
216 | pic_unlock(s); | ||
217 | } | ||
218 | |||
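kvm_pic_set_irq() now folds the per-source levels through __kvm_irq_line_state() so that one device deasserting a shared line cannot clear an interrupt that another source still holds. The helper is added elsewhere in this series; reproduced here approximately, it gives each source id one bit and ORs them:

    static inline int __kvm_irq_line_state(unsigned long *irq_state,
                                           int irq_source_id, int level)
    {
        /* Logical OR for level triggered interrupts */
        if (level)
            __set_bit(irq_source_id, irq_state);
        else
            __clear_bit(irq_source_id, irq_state);

        return !!(*irq_state);
    }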
208 | /* | 219 | /* |
209 | * acknowledge interrupt 'irq' | 220 | * acknowledge interrupt 'irq' |
210 | */ | 221 | */ |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1ee..ce878788a39f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap) | |||
107 | clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | 107 | clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); |
108 | } | 108 | } |
109 | 109 | ||
110 | static inline int __apic_test_and_set_vector(int vec, void *bitmap) | ||
111 | { | ||
112 | return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
113 | } | ||
114 | |||
115 | static inline int __apic_test_and_clear_vector(int vec, void *bitmap) | ||
116 | { | ||
117 | return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
118 | } | ||
119 | |||
110 | static inline int apic_hw_enabled(struct kvm_lapic *apic) | 120 | static inline int apic_hw_enabled(struct kvm_lapic *apic) |
111 | { | 121 | { |
112 | return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; | 122 | return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; |
@@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap) | |||
210 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); | 220 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); |
211 | } | 221 | } |
212 | 222 | ||
223 | static u8 count_vectors(void *bitmap) | ||
224 | { | ||
225 | u32 *word = bitmap; | ||
226 | int word_offset; | ||
227 | u8 count = 0; | ||
228 | for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) | ||
229 | count += hweight32(word[word_offset << 2]); | ||
230 | return count; | ||
231 | } | ||
232 | |||
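The stride-of-four u32 indexing in count_vectors() follows from the APIC register layout: IRR/ISR/TMR are 256-bit bitmaps spread over eight 32-bit registers that sit 16 bytes apart. For reference, the helpers used above are defined near the top of lapic.c roughly as:

    #define VEC_POS(v)  ((v) & (32 - 1))    /* bit within one register */
    #define REG_POS(v)  (((v) >> 5) << 4)   /* byte offset of the register */

so word[word_offset << 2] addresses the u32 at byte offset word_offset << 4.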
213 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | 233 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) |
214 | { | 234 | { |
215 | apic->irr_pending = true; | 235 | apic->irr_pending = true; |
@@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | |||
242 | apic->irr_pending = true; | 262 | apic->irr_pending = true; |
243 | } | 263 | } |
244 | 264 | ||
265 | static inline void apic_set_isr(int vec, struct kvm_lapic *apic) | ||
266 | { | ||
267 | if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) | ||
268 | ++apic->isr_count; | ||
269 | BUG_ON(apic->isr_count > MAX_APIC_VECTOR); | ||
270 | /* | ||
271 | * An ISR (in-service register) bit is set when injecting an interrupt. | ||
272 | * The highest vector is injected, thus the most recently set bit | ||
273 | * matches the highest bit in ISR. | ||
274 | */ | ||
275 | apic->highest_isr_cache = vec; | ||
276 | } | ||
277 | |||
278 | static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) | ||
279 | { | ||
280 | if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) | ||
281 | --apic->isr_count; | ||
282 | BUG_ON(apic->isr_count < 0); | ||
283 | apic->highest_isr_cache = -1; | ||
284 | } | ||
285 | |||
245 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | 286 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) |
246 | { | 287 | { |
247 | struct kvm_lapic *apic = vcpu->arch.apic; | 288 | struct kvm_lapic *apic = vcpu->arch.apic; |
@@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) | |||
270 | irq->level, irq->trig_mode); | 311 | irq->level, irq->trig_mode); |
271 | } | 312 | } |
272 | 313 | ||
314 | static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) | ||
315 | { | ||
316 | |||
317 | return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, | ||
318 | sizeof(val)); | ||
319 | } | ||
320 | |||
321 | static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) | ||
322 | { | ||
323 | |||
324 | return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, | ||
325 | sizeof(*val)); | ||
326 | } | ||
327 | |||
328 | static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) | ||
329 | { | ||
330 | return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; | ||
331 | } | ||
332 | |||
333 | static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) | ||
334 | { | ||
335 | u8 val; | ||
336 | if (pv_eoi_get_user(vcpu, &val) < 0) | ||
337 | apic_debug("Can't read EOI MSR value: 0x%llx\n", | ||
338 | (unsigned long long)vcpu->arch.pv_eoi.msr_val); | ||
339 | return val & 0x1; | ||
340 | } | ||
341 | |||
342 | static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) | ||
343 | { | ||
344 | if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { | ||
345 | apic_debug("Can't set EOI MSR value: 0x%llx\n", | ||
346 | (unsigned long long)vcpu->arch.pv_eoi.msr_val); | ||
347 | return; | ||
348 | } | ||
349 | __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); | ||
350 | } | ||
351 | |||
352 | static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) | ||
353 | { | ||
354 | if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { | ||
355 | apic_debug("Can't clear EOI MSR value: 0x%llx\n", | ||
356 | (unsigned long long)vcpu->arch.pv_eoi.msr_val); | ||
357 | return; | ||
358 | } | ||
359 | __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); | ||
360 | } | ||
361 | |||
273 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) | 362 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) |
274 | { | 363 | { |
275 | int result; | 364 | int result; |
365 | if (!apic->isr_count) | ||
366 | return -1; | ||
367 | if (likely(apic->highest_isr_cache != -1)) | ||
368 | return apic->highest_isr_cache; | ||
276 | 369 | ||
277 | result = find_highest_vector(apic->regs + APIC_ISR); | 370 | result = find_highest_vector(apic->regs + APIC_ISR); |
278 | ASSERT(result == -1 || result >= 16); | 371 | ASSERT(result == -1 || result >= 16); |
@@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) | |||
482 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; | 575 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; |
483 | } | 576 | } |
484 | 577 | ||
485 | static void apic_set_eoi(struct kvm_lapic *apic) | 578 | static int apic_set_eoi(struct kvm_lapic *apic) |
486 | { | 579 | { |
487 | int vector = apic_find_highest_isr(apic); | 580 | int vector = apic_find_highest_isr(apic); |
581 | |||
582 | trace_kvm_eoi(apic, vector); | ||
583 | |||
488 | /* | 584 | /* |
489 | * Not every write to EOI has a corresponding ISR bit; | 585 | * Not every write to EOI has a corresponding ISR bit; |
490 | * one example is when the kernel checks the timer in setup_IO_APIC | 586 | * one example is when the kernel checks the timer in setup_IO_APIC |
491 | */ | 587 | */ |
492 | if (vector == -1) | 588 | if (vector == -1) |
493 | return; | 589 | return vector; |
494 | 590 | ||
495 | apic_clear_vector(vector, apic->regs + APIC_ISR); | 591 | apic_clear_isr(vector, apic); |
496 | apic_update_ppr(apic); | 592 | apic_update_ppr(apic); |
497 | 593 | ||
498 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && | 594 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && |
@@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
505 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | 601 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); |
506 | } | 602 | } |
507 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | 603 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); |
604 | return vector; | ||
508 | } | 605 | } |
509 | 606 | ||
510 | static void apic_send_ipi(struct kvm_lapic *apic) | 607 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
1081 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | 1178 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); |
1082 | } | 1179 | } |
1083 | apic->irr_pending = false; | 1180 | apic->irr_pending = false; |
1181 | apic->isr_count = 0; | ||
1182 | apic->highest_isr_cache = -1; | ||
1084 | update_divide_count(apic); | 1183 | update_divide_count(apic); |
1085 | atomic_set(&apic->lapic_timer.pending, 0); | 1184 | atomic_set(&apic->lapic_timer.pending, 0); |
1086 | if (kvm_vcpu_is_bsp(vcpu)) | 1185 | if (kvm_vcpu_is_bsp(vcpu)) |
1087 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | 1186 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; |
1187 | vcpu->arch.pv_eoi.msr_val = 0; | ||
1088 | apic_update_ppr(apic); | 1188 | apic_update_ppr(apic); |
1089 | 1189 | ||
1090 | vcpu->arch.apic_arb_prio = 0; | 1190 | vcpu->arch.apic_arb_prio = 0; |
@@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) | |||
1248 | if (vector == -1) | 1348 | if (vector == -1) |
1249 | return -1; | 1349 | return -1; |
1250 | 1350 | ||
1251 | apic_set_vector(vector, apic->regs + APIC_ISR); | 1351 | apic_set_isr(vector, apic); |
1252 | apic_update_ppr(apic); | 1352 | apic_update_ppr(apic); |
1253 | apic_clear_irr(vector, apic); | 1353 | apic_clear_irr(vector, apic); |
1254 | return vector; | 1354 | return vector; |
@@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
1267 | update_divide_count(apic); | 1367 | update_divide_count(apic); |
1268 | start_apic_timer(apic); | 1368 | start_apic_timer(apic); |
1269 | apic->irr_pending = true; | 1369 | apic->irr_pending = true; |
1370 | apic->isr_count = count_vectors(apic->regs + APIC_ISR); | ||
1371 | apic->highest_isr_cache = -1; | ||
1270 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 1372 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
1271 | } | 1373 | } |
1272 | 1374 | ||
@@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | |||
1283 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 1385 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
1284 | } | 1386 | } |
1285 | 1387 | ||
1388 | /* | ||
1389 | * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt | ||
1390 | * | ||
1391 | * Detect whether guest triggered PV EOI since the | ||
1392 | * last entry. If yes, set EOI on the guest's behalf. | ||
1393 | * Clear PV EOI in guest memory in any case. | ||
1394 | */ | ||
1395 | static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, | ||
1396 | struct kvm_lapic *apic) | ||
1397 | { | ||
1398 | bool pending; | ||
1399 | int vector; | ||
1400 | /* | ||
1401 | * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host | ||
1402 | * and KVM_PV_EOI_ENABLED in guest memory as follows: | ||
1403 | * | ||
1404 | * KVM_APIC_PV_EOI_PENDING is unset: | ||
1405 | * -> host disabled PV EOI. | ||
1406 | * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: | ||
1407 | * -> host enabled PV EOI, guest did not execute EOI yet. | ||
1408 | * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: | ||
1409 | * -> host enabled PV EOI, guest executed EOI. | ||
1410 | */ | ||
1411 | BUG_ON(!pv_eoi_enabled(vcpu)); | ||
1412 | pending = pv_eoi_get_pending(vcpu); | ||
1413 | /* | ||
1414 | * Clear pending bit in any case: it will be set again on vmentry. | ||
1415 | * While this might not be ideal from a performance point of view, | ||
1416 | * this makes sure pv eoi is only enabled when we know it's safe. | ||
1417 | */ | ||
1418 | pv_eoi_clr_pending(vcpu); | ||
1419 | if (pending) | ||
1420 | return; | ||
1421 | vector = apic_set_eoi(apic); | ||
1422 | trace_kvm_pv_eoi(apic, vector); | ||
1423 | } | ||
1424 | |||
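For context, the guest half of this handshake (added elsewhere in this merge) clears the flag on EOI and skips the APIC register write when that succeeds; roughly:

    static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
    {
        /* PV fast path: the host notices the cleared flag on the
         * next exit, so no APIC access is needed. */
        if (__test_and_clear_bit(KVM_PV_EOI_BIT,
                                 &__get_cpu_var(kvm_apic_eoi)))
            return;
        apic_write(APIC_EOI, APIC_EOI_ACK);
    }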
1286 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) | 1425 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) |
1287 | { | 1426 | { |
1288 | u32 data; | 1427 | u32 data; |
1289 | void *vapic; | 1428 | void *vapic; |
1290 | 1429 | ||
1430 | if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) | ||
1431 | apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); | ||
1432 | |||
1291 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) | 1433 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) |
1292 | return; | 1434 | return; |
1293 | 1435 | ||
@@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) | |||
1298 | apic_set_tpr(vcpu->arch.apic, data & 0xff); | 1440 | apic_set_tpr(vcpu->arch.apic, data & 0xff); |
1299 | } | 1441 | } |
1300 | 1442 | ||
1443 | /* | ||
1444 | * apic_sync_pv_eoi_to_guest - called before vmentry | ||
1445 | * | ||
1446 | * Detect whether it's safe to enable PV EOI and | ||
1447 | * if yes do so. | ||
1448 | */ | ||
1449 | static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, | ||
1450 | struct kvm_lapic *apic) | ||
1451 | { | ||
1452 | if (!pv_eoi_enabled(vcpu) || | ||
1453 | /* IRR set or many bits in ISR: could be nested. */ | ||
1454 | apic->irr_pending || | ||
1455 | /* Cache not set: could be safe but we don't bother. */ | ||
1456 | apic->highest_isr_cache == -1 || | ||
1457 | /* Need EOI to update ioapic. */ | ||
1458 | kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { | ||
1459 | /* | ||
1460 | * PV EOI was disabled by apic_sync_pv_eoi_from_guest | ||
1461 | * so we need not do anything here. | ||
1462 | */ | ||
1463 | return; | ||
1464 | } | ||
1465 | |||
1466 | pv_eoi_set_pending(apic->vcpu); | ||
1467 | } | ||
1468 | |||
1301 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) | 1469 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) |
1302 | { | 1470 | { |
1303 | u32 data, tpr; | 1471 | u32 data, tpr; |
1304 | int max_irr, max_isr; | 1472 | int max_irr, max_isr; |
1305 | struct kvm_lapic *apic; | 1473 | struct kvm_lapic *apic = vcpu->arch.apic; |
1306 | void *vapic; | 1474 | void *vapic; |
1307 | 1475 | ||
1476 | apic_sync_pv_eoi_to_guest(vcpu, apic); | ||
1477 | |||
1308 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) | 1478 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) |
1309 | return; | 1479 | return; |
1310 | 1480 | ||
1311 | apic = vcpu->arch.apic; | ||
1312 | tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; | 1481 | tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; |
1313 | max_irr = apic_find_highest_irr(apic); | 1482 | max_irr = apic_find_highest_irr(apic); |
1314 | if (max_irr < 0) | 1483 | if (max_irr < 0) |
@@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) | |||
1394 | 1563 | ||
1395 | return 0; | 1564 | return 0; |
1396 | } | 1565 | } |
1566 | |||
1567 | int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) | ||
1568 | { | ||
1569 | u64 addr = data & ~KVM_MSR_ENABLED; | ||
1570 | if (!IS_ALIGNED(addr, 4)) | ||
1571 | return 1; | ||
1572 | |||
1573 | vcpu->arch.pv_eoi.msr_val = data; | ||
1574 | if (!pv_eoi_enabled(vcpu)) | ||
1575 | return 0; | ||
1576 | return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, | ||
1577 | addr); | ||
1578 | } | ||
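The MSR payload decoded above is a 4-byte-aligned guest address with the enable flag in the low bits. A guest would program it along these lines (hypothetical setup code; the real hook lands in the guest-side patches of this merge):

    static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi);

    static void kvm_pv_eoi_init(void)
    {
        unsigned long pa = __pa(&__get_cpu_var(kvm_apic_eoi));

        /* pa is naturally 4-byte aligned, satisfying the
         * IS_ALIGNED(addr, 4) check on the host side */
        wrmsrl(MSR_KVM_PV_EOI_EN, pa | KVM_MSR_ENABLED);
    }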
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d09..4af5405ae1e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -13,6 +13,15 @@ struct kvm_lapic { | |||
13 | u32 divide_count; | 13 | u32 divide_count; |
14 | struct kvm_vcpu *vcpu; | 14 | struct kvm_vcpu *vcpu; |
15 | bool irr_pending; | 15 | bool irr_pending; |
16 | /* Number of bits set in ISR. */ | ||
17 | s16 isr_count; | ||
18 | /* The highest vector set in ISR; -1 means invalid, must scan ISR. */ | ||
19 | int highest_isr_cache; | ||
20 | /** | ||
21 | * APIC register page. The layout matches the register layout seen by | ||
22 | * the guest 1:1, because it is accessed by the vmx microcode. | ||
23 | * Note: Only one register, the TPR, is used by the microcode. | ||
24 | */ | ||
16 | void *regs; | 25 | void *regs; |
17 | gpa_t vapic_addr; | 26 | gpa_t vapic_addr; |
18 | struct page *vapic_page; | 27 | struct page *vapic_page; |
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) | |||
60 | { | 69 | { |
61 | return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; | 70 | return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; |
62 | } | 71 | } |
72 | |||
73 | int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); | ||
63 | #endif | 74 | #endif |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 57e168e27b5b..01ca00423938 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); | |||
90 | 90 | ||
91 | #define PTE_PREFETCH_NUM 8 | 91 | #define PTE_PREFETCH_NUM 8 |
92 | 92 | ||
93 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 93 | #define PT_FIRST_AVAIL_BITS_SHIFT 10 |
94 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 94 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
95 | 95 | ||
96 | #define PT64_LEVEL_BITS 9 | 96 | #define PT64_LEVEL_BITS 9 |
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); | |||
145 | #define CREATE_TRACE_POINTS | 145 | #define CREATE_TRACE_POINTS |
146 | #include "mmutrace.h" | 146 | #include "mmutrace.h" |
147 | 147 | ||
148 | #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | 148 | #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) |
149 | #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) | ||
149 | 150 | ||
150 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 151 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
151 | 152 | ||
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask; | |||
188 | static u64 __read_mostly shadow_mmio_mask; | 189 | static u64 __read_mostly shadow_mmio_mask; |
189 | 190 | ||
190 | static void mmu_spte_set(u64 *sptep, u64 spte); | 191 | static void mmu_spte_set(u64 *sptep, u64 spte); |
192 | static void mmu_free_roots(struct kvm_vcpu *vcpu); | ||
191 | 193 | ||
192 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | 194 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) |
193 | { | 195 | { |
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) | |||
444 | } | 446 | } |
445 | #endif | 447 | #endif |
446 | 448 | ||
449 | static bool spte_is_locklessly_modifiable(u64 spte) | ||
450 | { | ||
451 | return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); | ||
452 | } | ||
453 | |||
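The !(~spte & mask) expression is the all-bits-set idiom; an equivalent, perhaps more familiar spelling (illustration only):

    static bool all_bits_set(u64 x, u64 mask)
    {
        return (x & mask) == mask;  /* same as !(~x & mask) */
    }

A spte is locklessly modifiable only when both SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE are set.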
447 | static bool spte_has_volatile_bits(u64 spte) | 454 | static bool spte_has_volatile_bits(u64 spte) |
448 | { | 455 | { |
456 | /* | ||
457 | * Always atomically update the spte if it can be updated | ||
458 | * out of mmu-lock; this ensures the dirty bit is not lost | ||
459 | * and gives us a stable is_writable_pte() so that a needed | ||
460 | * tlb flush is not missed. | ||
461 | */ | ||
462 | if (spte_is_locklessly_modifiable(spte)) | ||
463 | return true; | ||
464 | |||
449 | if (!shadow_accessed_mask) | 465 | if (!shadow_accessed_mask) |
450 | return false; | 466 | return false; |
451 | 467 | ||
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) | |||
478 | 494 | ||
479 | /* Rules for using mmu_spte_update: | 495 | /* Rules for using mmu_spte_update: |
480 | * Update the state bits; this means the mapped pfn is not changed. | 496 | * Update the state bits; this means the mapped pfn is not changed. |
497 | * | ||
498 | * Whenever we overwrite a writable spte with a read-only one we | ||
499 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
500 | * will find a read-only spte, even though the writable spte | ||
501 | * might be cached in a CPU's TLB; the return value indicates this | ||
502 | * case. | ||
481 | */ | 503 | */ |
482 | static void mmu_spte_update(u64 *sptep, u64 new_spte) | 504 | static bool mmu_spte_update(u64 *sptep, u64 new_spte) |
483 | { | 505 | { |
484 | u64 mask, old_spte = *sptep; | 506 | u64 old_spte = *sptep; |
507 | bool ret = false; | ||
485 | 508 | ||
486 | WARN_ON(!is_rmap_spte(new_spte)); | 509 | WARN_ON(!is_rmap_spte(new_spte)); |
487 | 510 | ||
488 | if (!is_shadow_present_pte(old_spte)) | 511 | if (!is_shadow_present_pte(old_spte)) { |
489 | return mmu_spte_set(sptep, new_spte); | 512 | mmu_spte_set(sptep, new_spte); |
490 | 513 | return ret; | |
491 | new_spte |= old_spte & shadow_dirty_mask; | 514 | } |
492 | |||
493 | mask = shadow_accessed_mask; | ||
494 | if (is_writable_pte(old_spte)) | ||
495 | mask |= shadow_dirty_mask; | ||
496 | 515 | ||
497 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) | 516 | if (!spte_has_volatile_bits(old_spte)) |
498 | __update_clear_spte_fast(sptep, new_spte); | 517 | __update_clear_spte_fast(sptep, new_spte); |
499 | else | 518 | else |
500 | old_spte = __update_clear_spte_slow(sptep, new_spte); | 519 | old_spte = __update_clear_spte_slow(sptep, new_spte); |
501 | 520 | ||
521 | /* | ||
522 | * Updating the spte out of mmu-lock is safe, since | ||
523 | * we always update it atomically; see the comments in | ||
524 | * spte_has_volatile_bits(). | ||
525 | */ | ||
526 | if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) | ||
527 | ret = true; | ||
528 | |||
502 | if (!shadow_accessed_mask) | 529 | if (!shadow_accessed_mask) |
503 | return; | 530 | return ret; |
504 | 531 | ||
505 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) | 532 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) |
506 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | 533 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); |
507 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) | 534 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) |
508 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | 535 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); |
536 | |||
537 | return ret; | ||
509 | } | 538 | } |
510 | 539 | ||
511 | /* | 540 | /* |
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | |||
652 | mmu_page_header_cache); | 681 | mmu_page_header_cache); |
653 | } | 682 | } |
654 | 683 | ||
655 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | 684 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) |
656 | size_t size) | ||
657 | { | 685 | { |
658 | void *p; | 686 | void *p; |
659 | 687 | ||
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | |||
664 | 692 | ||
665 | static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) | 693 | static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) |
666 | { | 694 | { |
667 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, | 695 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); |
668 | sizeof(struct pte_list_desc)); | ||
669 | } | 696 | } |
670 | 697 | ||
671 | static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) | 698 | static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) |
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) | |||
1051 | rmap_remove(kvm, sptep); | 1078 | rmap_remove(kvm, sptep); |
1052 | } | 1079 | } |
1053 | 1080 | ||
1054 | static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) | 1081 | |
1082 | static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) | ||
1083 | { | ||
1084 | if (is_large_pte(*sptep)) { | ||
1085 | WARN_ON(page_header(__pa(sptep))->role.level == | ||
1086 | PT_PAGE_TABLE_LEVEL); | ||
1087 | drop_spte(kvm, sptep); | ||
1088 | --kvm->stat.lpages; | ||
1089 | return true; | ||
1090 | } | ||
1091 | |||
1092 | return false; | ||
1093 | } | ||
1094 | |||
1095 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | ||
1096 | { | ||
1097 | if (__drop_large_spte(vcpu->kvm, sptep)) | ||
1098 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1099 | } | ||
1100 | |||
1101 | /* | ||
1102 | * Write-protect the specified @sptep; @pt_protect indicates whether the | ||
1103 | * spte write-protection is caused by protecting the shadow page table. | ||
1104 | * @flush indicates whether the tlb needs to be flushed. | ||
1105 | * | ||
1106 | * Note: write protection differs between dirty logging and spte | ||
1107 | * protection: | ||
1108 | * - for dirty logging, the spte can be set writable at any time if | ||
1109 | * its dirty bitmap is properly set. | ||
1110 | * - for spte protection, the spte can be made writable only after | ||
1111 | * unsync-ing the shadow page. | ||
1112 | * | ||
1113 | * Return true if the spte is dropped. | ||
1114 | */ | ||
1115 | static bool | ||
1116 | spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) | ||
1117 | { | ||
1118 | u64 spte = *sptep; | ||
1119 | |||
1120 | if (!is_writable_pte(spte) && | ||
1121 | !(pt_protect && spte_is_locklessly_modifiable(spte))) | ||
1122 | return false; | ||
1123 | |||
1124 | rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); | ||
1125 | |||
1126 | if (__drop_large_spte(kvm, sptep)) { | ||
1127 | *flush |= true; | ||
1128 | return true; | ||
1129 | } | ||
1130 | |||
1131 | if (pt_protect) | ||
1132 | spte &= ~SPTE_MMU_WRITEABLE; | ||
1133 | spte = spte & ~PT_WRITABLE_MASK; | ||
1134 | |||
1135 | *flush |= mmu_spte_update(sptep, spte); | ||
1136 | return false; | ||
1137 | } | ||
1138 | |||
1139 | static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, | ||
1140 | int level, bool pt_protect) | ||
1055 | { | 1141 | { |
1056 | u64 *sptep; | 1142 | u64 *sptep; |
1057 | struct rmap_iterator iter; | 1143 | struct rmap_iterator iter; |
1058 | int write_protected = 0; | 1144 | bool flush = false; |
1059 | 1145 | ||
1060 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { | 1146 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { |
1061 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | 1147 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); |
1062 | rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); | 1148 | if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { |
1063 | |||
1064 | if (!is_writable_pte(*sptep)) { | ||
1065 | sptep = rmap_get_next(&iter); | ||
1066 | continue; | ||
1067 | } | ||
1068 | |||
1069 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
1070 | mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); | ||
1071 | sptep = rmap_get_next(&iter); | ||
1072 | } else { | ||
1073 | BUG_ON(!is_large_pte(*sptep)); | ||
1074 | drop_spte(kvm, sptep); | ||
1075 | --kvm->stat.lpages; | ||
1076 | sptep = rmap_get_first(*rmapp, &iter); | 1149 | sptep = rmap_get_first(*rmapp, &iter); |
1150 | continue; | ||
1077 | } | 1151 | } |
1078 | 1152 | ||
1079 | write_protected = 1; | 1153 | sptep = rmap_get_next(&iter); |
1080 | } | 1154 | } |
1081 | 1155 | ||
1082 | return write_protected; | 1156 | return flush; |
1083 | } | 1157 | } |
1084 | 1158 | ||
1085 | /** | 1159 | /** |
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | |||
1100 | 1174 | ||
1101 | while (mask) { | 1175 | while (mask) { |
1102 | rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; | 1176 | rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; |
1103 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); | 1177 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); |
1104 | 1178 | ||
1105 | /* clear the first set bit */ | 1179 | /* clear the first set bit */ |
1106 | mask &= mask - 1; | 1180 | mask &= mask - 1; |
1107 | } | 1181 | } |
1108 | } | 1182 | } |
1109 | 1183 | ||
1110 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) | 1184 | static bool rmap_write_protect(struct kvm *kvm, u64 gfn) |
1111 | { | 1185 | { |
1112 | struct kvm_memory_slot *slot; | 1186 | struct kvm_memory_slot *slot; |
1113 | unsigned long *rmapp; | 1187 | unsigned long *rmapp; |
1114 | int i; | 1188 | int i; |
1115 | int write_protected = 0; | 1189 | bool write_protected = false; |
1116 | 1190 | ||
1117 | slot = gfn_to_memslot(kvm, gfn); | 1191 | slot = gfn_to_memslot(kvm, gfn); |
1118 | 1192 | ||
1119 | for (i = PT_PAGE_TABLE_LEVEL; | 1193 | for (i = PT_PAGE_TABLE_LEVEL; |
1120 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 1194 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
1121 | rmapp = __gfn_to_rmap(gfn, i, slot); | 1195 | rmapp = __gfn_to_rmap(gfn, i, slot); |
1122 | write_protected |= __rmap_write_protect(kvm, rmapp, i); | 1196 | write_protected |= __rmap_write_protect(kvm, rmapp, i, true); |
1123 | } | 1197 | } |
1124 | 1198 | ||
1125 | return write_protected; | 1199 | return write_protected; |
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1238 | unsigned long data) | 1312 | unsigned long data) |
1239 | { | 1313 | { |
1240 | u64 *sptep; | 1314 | u64 *sptep; |
1241 | struct rmap_iterator iter; | 1315 | struct rmap_iterator uninitialized_var(iter); |
1242 | int young = 0; | 1316 | int young = 0; |
1243 | 1317 | ||
1244 | /* | 1318 | /* |
1245 | * Emulate the accessed bit for EPT, by checking if this page has | 1319 | * In the absence of EPT Accessed and Dirty bit support, |
1320 | * emulate the accessed bit for EPT, by checking if this page has | ||
1246 | * an EPT mapping, and clearing it if it does. On the next access, | 1321 | * an EPT mapping, and clearing it if it does. On the next access, |
1247 | * a new EPT mapping will be established. | 1322 | * a new EPT mapping will be established. |
1248 | * This has some overhead, but not as much as the cost of swapping | 1323 | * This has some overhead, but not as much as the cost of swapping |
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1253 | 1328 | ||
1254 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; | 1329 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
1255 | sptep = rmap_get_next(&iter)) { | 1330 | sptep = rmap_get_next(&iter)) { |
1256 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | 1331 | BUG_ON(!is_shadow_present_pte(*sptep)); |
1257 | 1332 | ||
1258 | if (*sptep & PT_ACCESSED_MASK) { | 1333 | if (*sptep & shadow_accessed_mask) { |
1259 | young = 1; | 1334 | young = 1; |
1260 | clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); | 1335 | clear_bit((ffs(shadow_accessed_mask) - 1), |
1336 | (unsigned long *)sptep); | ||
1261 | } | 1337 | } |
1262 | } | 1338 | } |
1263 | 1339 | ||
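The ffs() conversion above makes the aging code work with whichever accessed bit is in use. A worked example (bit positions stated as assumptions: legacy x86 PTEs use bit 5, EPT A/D uses bit 8):

    #include <assert.h>
    #include <strings.h>    /* userspace ffs(); the kernel has its own */

    int main(void)
    {
        assert(ffs(0x20) - 1 == 5);     /* legacy PTE Accessed bit */
        assert(ffs(0x100) - 1 == 8);    /* EPT Accessed bit */
        return 0;
    }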
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1281 | 1357 | ||
1282 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; | 1358 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
1283 | sptep = rmap_get_next(&iter)) { | 1359 | sptep = rmap_get_next(&iter)) { |
1284 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | 1360 | BUG_ON(!is_shadow_present_pte(*sptep)); |
1285 | 1361 | ||
1286 | if (*sptep & PT_ACCESSED_MASK) { | 1362 | if (*sptep & shadow_accessed_mask) { |
1287 | young = 1; | 1363 | young = 1; |
1288 | break; | 1364 | break; |
1289 | } | 1365 | } |
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
1401 | u64 *parent_pte, int direct) | 1477 | u64 *parent_pte, int direct) |
1402 | { | 1478 | { |
1403 | struct kvm_mmu_page *sp; | 1479 | struct kvm_mmu_page *sp; |
1404 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, | 1480 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); |
1405 | sizeof *sp); | 1481 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
1406 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
1407 | if (!direct) | 1482 | if (!direct) |
1408 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | 1483 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
1409 | PAGE_SIZE); | ||
1410 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 1484 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
1411 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 1485 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
1412 | bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); | 1486 | bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); |
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, | |||
1701 | 1775 | ||
1702 | kvm_mmu_pages_init(parent, &parents, &pages); | 1776 | kvm_mmu_pages_init(parent, &parents, &pages); |
1703 | while (mmu_unsync_walk(parent, &pages)) { | 1777 | while (mmu_unsync_walk(parent, &pages)) { |
1704 | int protected = 0; | 1778 | bool protected = false; |
1705 | 1779 | ||
1706 | for_each_sp(pages, sp, parents, i) | 1780 | for_each_sp(pages, sp, parents, i) |
1707 | protected |= rmap_write_protect(vcpu->kvm, sp->gfn); | 1781 | protected |= rmap_write_protect(vcpu->kvm, sp->gfn); |
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | |||
1866 | mmu_spte_set(sptep, spte); | 1940 | mmu_spte_set(sptep, spte); |
1867 | } | 1941 | } |
1868 | 1942 | ||
1869 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | ||
1870 | { | ||
1871 | if (is_large_pte(*sptep)) { | ||
1872 | drop_spte(vcpu->kvm, sptep); | ||
1873 | --vcpu->kvm->stat.lpages; | ||
1874 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1875 | } | ||
1876 | } | ||
1877 | |||
1878 | static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 1943 | static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
1879 | unsigned direct_access) | 1944 | unsigned direct_access) |
1880 | { | 1945 | { |
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2243 | gfn_t gfn, pfn_t pfn, bool speculative, | 2308 | gfn_t gfn, pfn_t pfn, bool speculative, |
2244 | bool can_unsync, bool host_writable) | 2309 | bool can_unsync, bool host_writable) |
2245 | { | 2310 | { |
2246 | u64 spte, entry = *sptep; | 2311 | u64 spte; |
2247 | int ret = 0; | 2312 | int ret = 0; |
2248 | 2313 | ||
2249 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | 2314 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) |
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2257 | spte |= shadow_x_mask; | 2322 | spte |= shadow_x_mask; |
2258 | else | 2323 | else |
2259 | spte |= shadow_nx_mask; | 2324 | spte |= shadow_nx_mask; |
2325 | |||
2260 | if (pte_access & ACC_USER_MASK) | 2326 | if (pte_access & ACC_USER_MASK) |
2261 | spte |= shadow_user_mask; | 2327 | spte |= shadow_user_mask; |
2328 | |||
2262 | if (level > PT_PAGE_TABLE_LEVEL) | 2329 | if (level > PT_PAGE_TABLE_LEVEL) |
2263 | spte |= PT_PAGE_SIZE_MASK; | 2330 | spte |= PT_PAGE_SIZE_MASK; |
2264 | if (tdp_enabled) | 2331 | if (tdp_enabled) |
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2283 | goto done; | 2350 | goto done; |
2284 | } | 2351 | } |
2285 | 2352 | ||
2286 | spte |= PT_WRITABLE_MASK; | 2353 | spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; |
2287 | 2354 | ||
2288 | if (!vcpu->arch.mmu.direct_map | 2355 | if (!vcpu->arch.mmu.direct_map |
2289 | && !(pte_access & ACC_WRITE_MASK)) { | 2356 | && !(pte_access & ACC_WRITE_MASK)) { |
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2312 | __func__, gfn); | 2379 | __func__, gfn); |
2313 | ret = 1; | 2380 | ret = 1; |
2314 | pte_access &= ~ACC_WRITE_MASK; | 2381 | pte_access &= ~ACC_WRITE_MASK; |
2315 | if (is_writable_pte(spte)) | 2382 | spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); |
2316 | spte &= ~PT_WRITABLE_MASK; | ||
2317 | } | 2383 | } |
2318 | } | 2384 | } |
2319 | 2385 | ||
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2321 | mark_page_dirty(vcpu->kvm, gfn); | 2387 | mark_page_dirty(vcpu->kvm, gfn); |
2322 | 2388 | ||
2323 | set_pte: | 2389 | set_pte: |
2324 | mmu_spte_update(sptep, spte); | 2390 | if (mmu_spte_update(sptep, spte)) |
2325 | /* | ||
2326 | * If we overwrite a writable spte with a read-only one we | ||
2327 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
2328 | * will find a read-only spte, even though the writable spte | ||
2329 | * might be cached on a CPU's TLB. | ||
2330 | */ | ||
2331 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | ||
2332 | kvm_flush_remote_tlbs(vcpu->kvm); | 2391 | kvm_flush_remote_tlbs(vcpu->kvm); |
2333 | done: | 2392 | done: |
2334 | return ret; | 2393 | return ret; |
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2403 | 2462 | ||
2404 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | 2463 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) |
2405 | { | 2464 | { |
2465 | mmu_free_roots(vcpu); | ||
2406 | } | 2466 | } |
2407 | 2467 | ||
2408 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | 2468 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, |
@@ -2625,18 +2685,116 @@ exit: | |||
2625 | return ret; | 2685 | return ret; |
2626 | } | 2686 | } |
2627 | 2687 | ||
2688 | static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) | ||
2689 | { | ||
2690 | /* | ||
2691 | * #PF can be fast only if the shadow page table is present and it | ||
2692 | * is caused by write-protect; that means we just need to change the | ||
2693 | * W bit of the spte, which can be done out of mmu-lock. | ||
2694 | */ | ||
2695 | if (!(error_code & PFERR_PRESENT_MASK) || | ||
2696 | !(error_code & PFERR_WRITE_MASK)) | ||
2697 | return false; | ||
2698 | |||
2699 | return true; | ||
2700 | } | ||
2701 | |||
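The two error-code bits tested follow the architectural x86 #PF error-code layout, which the kernel's PFERR_* masks mirror:

    #define PFERR_PRESENT_MASK  (1U << 0)   /* fault on a present translation */
    #define PFERR_WRITE_MASK    (1U << 1)   /* fault caused by a write */

Only a write fault on a present, write-protected translation can be fixed by flipping the W bit without taking mmu-lock.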
2702 | static bool | ||
2703 | fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) | ||
2704 | { | ||
2705 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
2706 | gfn_t gfn; | ||
2707 | |||
2708 | WARN_ON(!sp->role.direct); | ||
2709 | |||
2710 | /* | ||
2711 | * The gfn of a direct spte is stable since it is calculated | ||
2712 | * from sp->gfn. | ||
2713 | */ | ||
2714 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); | ||
2715 | |||
2716 | if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) | ||
2717 | mark_page_dirty(vcpu->kvm, gfn); | ||
2718 | |||
2719 | return true; | ||
2720 | } | ||
2721 | |||
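The cmpxchg64() is what makes the lockless fix safe: the W bit is restored only if the spte is unchanged since it was read, and losing the race is harmless because the vcpu simply faults again. The generic shape (illustrative helper, not from the patch):

    static bool lockless_set_bits(u64 *p, u64 old, u64 bits)
    {
        /* succeeds only if *p still equals the value we inspected */
        return cmpxchg64(p, old, old | bits) == old;
    }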
2722 | /* | ||
2723 | * Return value: | ||
2724 | * - true: let the vcpu access the same address again. | ||
2725 | * - false: let the real page fault path fix it. | ||
2726 | */ | ||
2727 | static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, | ||
2728 | u32 error_code) | ||
2729 | { | ||
2730 | struct kvm_shadow_walk_iterator iterator; | ||
2731 | bool ret = false; | ||
2732 | u64 spte = 0ull; | ||
2733 | |||
2734 | if (!page_fault_can_be_fast(vcpu, error_code)) | ||
2735 | return false; | ||
2736 | |||
2737 | walk_shadow_page_lockless_begin(vcpu); | ||
2738 | for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) | ||
2739 | if (!is_shadow_present_pte(spte) || iterator.level < level) | ||
2740 | break; | ||
2741 | |||
2742 | /* | ||
2743 | * If the mapping has been changed, let the vcpu fault on the | ||
2744 | * same address again. | ||
2745 | */ | ||
2746 | if (!is_rmap_spte(spte)) { | ||
2747 | ret = true; | ||
2748 | goto exit; | ||
2749 | } | ||
2750 | |||
2751 | if (!is_last_spte(spte, level)) | ||
2752 | goto exit; | ||
2753 | |||
2754 | /* | ||
2755 | * Check if it is a spurious fault caused by a lazily flushed TLB. | ||
2756 | * | ||
2757 | * No need to check the access rights of upper level table entries since | ||
2758 | * they are always ACC_ALL. | ||
2759 | */ | ||
2760 | if (is_writable_pte(spte)) { | ||
2761 | ret = true; | ||
2762 | goto exit; | ||
2763 | } | ||
2764 | |||
2765 | /* | ||
2766 | * Currently, to simplify the code, only the spte write-protected | ||
2767 | * by dirty-log can be fixed on the fast path. | ||
2768 | */ | ||
2769 | if (!spte_is_locklessly_modifiable(spte)) | ||
2770 | goto exit; | ||
2771 | |||
2772 | /* | ||
2773 | * Currently, fast page fault only works for direct mappings since | ||
2774 | * the gfn is not stable for indirect shadow pages. | ||
2775 | * See Documentation/virtual/kvm/locking.txt to get more detail. | ||
2776 | */ | ||
2777 | ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); | ||
2778 | exit: | ||
2779 | trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, | ||
2780 | spte, ret); | ||
2781 | walk_shadow_page_lockless_end(vcpu); | ||
2782 | |||
2783 | return ret; | ||
2784 | } | ||
2785 | |||
2628 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | 2786 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
2629 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | 2787 | gva_t gva, pfn_t *pfn, bool write, bool *writable); |
2630 | 2788 | ||
2631 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | 2789 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, |
2632 | bool prefault) | 2790 | gfn_t gfn, bool prefault) |
2633 | { | 2791 | { |
2634 | int r; | 2792 | int r; |
2635 | int level; | 2793 | int level; |
2636 | int force_pt_level; | 2794 | int force_pt_level; |
2637 | pfn_t pfn; | 2795 | pfn_t pfn; |
2638 | unsigned long mmu_seq; | 2796 | unsigned long mmu_seq; |
2639 | bool map_writable; | 2797 | bool map_writable, write = error_code & PFERR_WRITE_MASK; |
2640 | 2798 | ||
2641 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); | 2799 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2642 | if (likely(!force_pt_level)) { | 2800 | if (likely(!force_pt_level)) { |
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | |||
2653 | } else | 2811 | } else |
2654 | level = PT_PAGE_TABLE_LEVEL; | 2812 | level = PT_PAGE_TABLE_LEVEL; |
2655 | 2813 | ||
2814 | if (fast_page_fault(vcpu, v, level, error_code)) | ||
2815 | return 0; | ||
2816 | |||
2656 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2817 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2657 | smp_rmb(); | 2818 | smp_rmb(); |
2658 | 2819 | ||
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
3041 | gfn = gva >> PAGE_SHIFT; | 3202 | gfn = gva >> PAGE_SHIFT; |
3042 | 3203 | ||
3043 | return nonpaging_map(vcpu, gva & PAGE_MASK, | 3204 | return nonpaging_map(vcpu, gva & PAGE_MASK, |
3044 | error_code & PFERR_WRITE_MASK, gfn, prefault); | 3205 | error_code, gfn, prefault); |
3045 | } | 3206 | } |
3046 | 3207 | ||
3047 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | 3208 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) |
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
3121 | } else | 3282 | } else |
3122 | level = PT_PAGE_TABLE_LEVEL; | 3283 | level = PT_PAGE_TABLE_LEVEL; |
3123 | 3284 | ||
3285 | if (fast_page_fault(vcpu, gpa, level, error_code)) | ||
3286 | return 0; | ||
3287 | |||
3124 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 3288 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
3125 | smp_rmb(); | 3289 | smp_rmb(); |
3126 | 3290 | ||
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |||
3885 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 4049 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) |
3886 | { | 4050 | { |
3887 | struct kvm_mmu_page *sp; | 4051 | struct kvm_mmu_page *sp; |
4052 | bool flush = false; | ||
3888 | 4053 | ||
3889 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 4054 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { |
3890 | int i; | 4055 | int i; |
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3899 | !is_last_spte(pt[i], sp->role.level)) | 4064 | !is_last_spte(pt[i], sp->role.level)) |
3900 | continue; | 4065 | continue; |
3901 | 4066 | ||
3902 | if (is_large_pte(pt[i])) { | 4067 | spte_write_protect(kvm, &pt[i], &flush, false); |
3903 | drop_spte(kvm, &pt[i]); | ||
3904 | --kvm->stat.lpages; | ||
3905 | continue; | ||
3906 | } | ||
3907 | |||
3908 | /* avoid RMW */ | ||
3909 | if (is_writable_pte(pt[i])) | ||
3910 | mmu_spte_update(&pt[i], | ||
3911 | pt[i] & ~PT_WRITABLE_MASK); | ||
3912 | } | 4068 | } |
3913 | } | 4069 | } |
3914 | kvm_flush_remote_tlbs(kvm); | 4070 | kvm_flush_remote_tlbs(kvm); |
@@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, | |||
3945 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | 4101 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
3946 | { | 4102 | { |
3947 | struct kvm *kvm; | 4103 | struct kvm *kvm; |
3948 | struct kvm *kvm_freed = NULL; | ||
3949 | int nr_to_scan = sc->nr_to_scan; | 4104 | int nr_to_scan = sc->nr_to_scan; |
3950 | 4105 | ||
3951 | if (nr_to_scan == 0) | 4106 | if (nr_to_scan == 0) |
@@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
3957 | int idx; | 4112 | int idx; |
3958 | LIST_HEAD(invalid_list); | 4113 | LIST_HEAD(invalid_list); |
3959 | 4114 | ||
4115 | /* | ||
4116 | * n_used_mmu_pages is accessed without holding kvm->mmu_lock | ||
4117 | * here. We may skip a VM instance erroneously, but we do not | ||
4118 | * want to shrink a VM that only started to populate its MMU | ||
4119 | * anyway. | ||
4120 | */ | ||
4121 | if (kvm->arch.n_used_mmu_pages > 0) { | ||
4122 | if (!nr_to_scan--) | ||
4123 | break; | ||
4124 | continue; | ||
4125 | } | ||
4126 | |||
3960 | idx = srcu_read_lock(&kvm->srcu); | 4127 | idx = srcu_read_lock(&kvm->srcu); |
3961 | spin_lock(&kvm->mmu_lock); | 4128 | spin_lock(&kvm->mmu_lock); |
3962 | if (!kvm_freed && nr_to_scan > 0 && | ||
3963 | kvm->arch.n_used_mmu_pages > 0) { | ||
3964 | kvm_mmu_remove_some_alloc_mmu_pages(kvm, | ||
3965 | &invalid_list); | ||
3966 | kvm_freed = kvm; | ||
3967 | } | ||
3968 | nr_to_scan--; | ||
3969 | 4129 | ||
4130 | kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); | ||
3970 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4131 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
4132 | |||
3971 | spin_unlock(&kvm->mmu_lock); | 4133 | spin_unlock(&kvm->mmu_lock); |
3972 | srcu_read_unlock(&kvm->srcu, idx); | 4134 | srcu_read_unlock(&kvm->srcu, idx); |
4135 | |||
4136 | list_move_tail(&kvm->vm_list, &vm_list); | ||
4137 | break; | ||
3973 | } | 4138 | } |
3974 | if (kvm_freed) | ||
3975 | list_move_tail(&kvm_freed->vm_list, &vm_list); | ||
3976 | 4139 | ||
3977 | raw_spin_unlock(&kvm_lock); | 4140 | raw_spin_unlock(&kvm_lock); |
3978 | 4141 | ||
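The reworked mmu_shrink() above zaps pages from at most one VM per invocation and then rotates that VM to the tail of vm_list, so repeated shrinker calls spread the reclaim cost across guests instead of always hammering the list head. A standalone sketch of that move-to-tail round-robin, under assumed simplified types (struct vm and struct vm_list are illustrative, and the sketch follows the hunk's comment in skipping VMs with nothing worth reclaiming):

    #include <stddef.h>

    struct vm {
        struct vm *next;
        unsigned long used_pages;
    };

    struct vm_list {
        struct vm *head, *tail;
    };

    /* Unlink v (whose predecessor is prev) and append it at the tail. */
    static void move_to_tail(struct vm_list *l, struct vm *prev, struct vm *v)
    {
        if (l->tail == v)
            return;
        if (prev)
            prev->next = v->next;
        else
            l->head = v->next;
        v->next = NULL;
        l->tail->next = v;
        l->tail = v;
    }

    /* Shrink at most one VM per call, then rotate it for fairness. */
    static void shrink_one(struct vm_list *l)
    {
        struct vm *prev = NULL, *v;

        for (v = l->head; v; prev = v, v = v->next) {
            if (!v->used_pages)
                continue;           /* nothing worth reclaiming yet */
            /* ... zap some of v's shadow pages under its own lock ... */
            move_to_tail(l, prev, v);
            break;
        }
    }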
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322a..cd6e98333ba3 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -54,8 +54,8 @@ | |||
54 | */ | 54 | */ |
55 | TRACE_EVENT( | 55 | TRACE_EVENT( |
56 | kvm_mmu_pagetable_walk, | 56 | kvm_mmu_pagetable_walk, |
57 | TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), | 57 | TP_PROTO(u64 addr, u32 pferr), |
58 | TP_ARGS(addr, write_fault, user_fault, fetch_fault), | 58 | TP_ARGS(addr, pferr), |
59 | 59 | ||
60 | TP_STRUCT__entry( | 60 | TP_STRUCT__entry( |
61 | __field(__u64, addr) | 61 | __field(__u64, addr) |
@@ -64,8 +64,7 @@ TRACE_EVENT( | |||
64 | 64 | ||
65 | TP_fast_assign( | 65 | TP_fast_assign( |
66 | __entry->addr = addr; | 66 | __entry->addr = addr; |
67 | __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) | 67 | __entry->pferr = pferr; |
68 | | (!!fetch_fault << 4); | ||
69 | ), | 68 | ), |
70 | 69 | ||
71 | TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, | 70 | TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, |
@@ -243,6 +242,44 @@ TRACE_EVENT( | |||
243 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, | 242 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, |
244 | __entry->access) | 243 | __entry->access) |
245 | ); | 244 | ); |
245 | |||
246 | #define __spte_satisfied(__spte) \ | ||
247 | (__entry->retry && is_writable_pte(__entry->__spte)) | ||
248 | |||
249 | TRACE_EVENT( | ||
250 | fast_page_fault, | ||
251 | TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, | ||
252 | u64 *sptep, u64 old_spte, bool retry), | ||
253 | TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), | ||
254 | |||
255 | TP_STRUCT__entry( | ||
256 | __field(int, vcpu_id) | ||
257 | __field(gva_t, gva) | ||
258 | __field(u32, error_code) | ||
259 | __field(u64 *, sptep) | ||
260 | __field(u64, old_spte) | ||
261 | __field(u64, new_spte) | ||
262 | __field(bool, retry) | ||
263 | ), | ||
264 | |||
265 | TP_fast_assign( | ||
266 | __entry->vcpu_id = vcpu->vcpu_id; | ||
267 | __entry->gva = gva; | ||
268 | __entry->error_code = error_code; | ||
269 | __entry->sptep = sptep; | ||
270 | __entry->old_spte = old_spte; | ||
271 | __entry->new_spte = *sptep; | ||
272 | __entry->retry = retry; | ||
273 | ), | ||
274 | |||
275 | TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" | ||
276 | " new %llx spurious %d fixed %d", __entry->vcpu_id, | ||
277 | __entry->gva, __print_flags(__entry->error_code, "|", | ||
278 | kvm_mmu_trace_pferr_flags), __entry->sptep, | ||
279 | __entry->old_spte, __entry->new_spte, | ||
280 | __spte_satisfied(old_spte), __spte_satisfied(new_spte) | ||
281 | ) | ||
282 | ); | ||
246 | #endif /* _TRACE_KVMMMU_H */ | 283 | #endif /* _TRACE_KVMMMU_H */ |
247 | 284 | ||
248 | #undef TRACE_INCLUDE_PATH | 285 | #undef TRACE_INCLUDE_PATH |
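The mmutrace.h change above stops reassembling individual fault flags and passes the architectural page-fault error code through as a single pferr word. For reference, this is the x86 error-code bit layout that kvm_mmu_trace_pferr_flags decodes; the bit positions are architectural, but the macro names below are local stand-ins for a self-contained example:

    #include <stdint.h>
    #include <stdio.h>

    #define PFERR_PRESENT (1u << 0)   /* page-level protection violation */
    #define PFERR_WRITE   (1u << 1)   /* access was a write */
    #define PFERR_USER    (1u << 2)   /* access came from user mode */
    #define PFERR_RSVD    (1u << 3)   /* reserved bit set in a paging entry */
    #define PFERR_FETCH   (1u << 4)   /* access was an instruction fetch */

    int main(void)
    {
        uint32_t pferr = PFERR_WRITE | PFERR_USER;  /* user-mode write fault */

        printf("pferr=%#x write=%d fetch=%d\n", pferr,
               !!(pferr & PFERR_WRITE), !!(pferr & PFERR_FETCH));
        return 0;
    }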
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f970937ef1..bb7cf01cae76 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, | |||
154 | const int fetch_fault = access & PFERR_FETCH_MASK; | 154 | const int fetch_fault = access & PFERR_FETCH_MASK; |
155 | u16 errcode = 0; | 155 | u16 errcode = 0; |
156 | 156 | ||
157 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 157 | trace_kvm_mmu_pagetable_walk(addr, access); |
158 | fetch_fault); | ||
159 | retry_walk: | 158 | retry_walk: |
160 | eperm = false; | 159 | eperm = false; |
161 | walker->level = mmu->root_level; | 160 | walker->level = mmu->root_level; |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b268..baead950d6c8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
3185 | break; | 3185 | break; |
3186 | case MSR_IA32_DEBUGCTLMSR: | 3186 | case MSR_IA32_DEBUGCTLMSR: |
3187 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { | 3187 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { |
3188 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", | 3188 | vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", |
3189 | __func__, data); | 3189 | __func__, data); |
3190 | break; | 3190 | break; |
3191 | } | 3191 | } |
3192 | if (data & DEBUGCTL_RESERVED_BITS) | 3192 | if (data & DEBUGCTL_RESERVED_BITS) |
@@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
3205 | case MSR_VM_CR: | 3205 | case MSR_VM_CR: |
3206 | return svm_set_vm_cr(vcpu, data); | 3206 | return svm_set_vm_cr(vcpu, data); |
3207 | case MSR_VM_IGNNE: | 3207 | case MSR_VM_IGNNE: |
3208 | pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); | 3208 | vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); |
3209 | break; | 3209 | break; |
3210 | default: | 3210 | default: |
3211 | return kvm_set_msr_common(vcpu, ecx, data); | 3211 | return kvm_set_msr_common(vcpu, ecx, data); |
@@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void) | |||
4044 | return false; | 4044 | return false; |
4045 | } | 4045 | } |
4046 | 4046 | ||
4047 | static bool svm_invpcid_supported(void) | ||
4048 | { | ||
4049 | return false; | ||
4050 | } | ||
4051 | |||
4047 | static bool svm_has_wbinvd_exit(void) | 4052 | static bool svm_has_wbinvd_exit(void) |
4048 | { | 4053 | { |
4049 | return true; | 4054 | return true; |
@@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
4312 | .cpuid_update = svm_cpuid_update, | 4317 | .cpuid_update = svm_cpuid_update, |
4313 | 4318 | ||
4314 | .rdtscp_supported = svm_rdtscp_supported, | 4319 | .rdtscp_supported = svm_rdtscp_supported, |
4320 | .invpcid_supported = svm_invpcid_supported, | ||
4315 | 4321 | ||
4316 | .set_supported_cpuid = svm_set_supported_cpuid, | 4322 | .set_supported_cpuid = svm_set_supported_cpuid, |
4317 | 4323 | ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 62d02e3c3ed6..a71faf727ff3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, | |||
517 | __entry->coalesced ? " (coalesced)" : "") | 517 | __entry->coalesced ? " (coalesced)" : "") |
518 | ); | 518 | ); |
519 | 519 | ||
520 | TRACE_EVENT(kvm_eoi, | ||
521 | TP_PROTO(struct kvm_lapic *apic, int vector), | ||
522 | TP_ARGS(apic, vector), | ||
523 | |||
524 | TP_STRUCT__entry( | ||
525 | __field( __u32, apicid ) | ||
526 | __field( int, vector ) | ||
527 | ), | ||
528 | |||
529 | TP_fast_assign( | ||
530 | __entry->apicid = apic->vcpu->vcpu_id; | ||
531 | __entry->vector = vector; | ||
532 | ), | ||
533 | |||
534 | TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) | ||
535 | ); | ||
536 | |||
537 | TRACE_EVENT(kvm_pv_eoi, | ||
538 | TP_PROTO(struct kvm_lapic *apic, int vector), | ||
539 | TP_ARGS(apic, vector), | ||
540 | |||
541 | TP_STRUCT__entry( | ||
542 | __field( __u32, apicid ) | ||
543 | __field( int, vector ) | ||
544 | ), | ||
545 | |||
546 | TP_fast_assign( | ||
547 | __entry->apicid = apic->vcpu->vcpu_id; | ||
548 | __entry->vector = vector; | ||
549 | ), | ||
550 | |||
551 | TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) | ||
552 | ); | ||
553 | |||
520 | /* | 554 | /* |
521 | * Tracepoint for nested VMRUN | 555 | * Tracepoint for nested VMRUN |
522 | */ | 556 | */ |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb58866292..c39b60707e02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1; | |||
71 | module_param_named(unrestricted_guest, | 71 | module_param_named(unrestricted_guest, |
72 | enable_unrestricted_guest, bool, S_IRUGO); | 72 | enable_unrestricted_guest, bool, S_IRUGO); |
73 | 73 | ||
74 | static bool __read_mostly emulate_invalid_guest_state = 0; | 74 | static bool __read_mostly enable_ept_ad_bits = 1; |
75 | module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); | ||
76 | |||
77 | static bool __read_mostly emulate_invalid_guest_state = true; | ||
75 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | 78 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
76 | 79 | ||
77 | static bool __read_mostly vmm_exclusive = 1; | 80 | static bool __read_mostly vmm_exclusive = 1; |
@@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr); | |||
615 | static void kvm_cpu_vmxoff(void); | 618 | static void kvm_cpu_vmxoff(void); |
616 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | 619 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); |
617 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | 620 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); |
621 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
622 | struct kvm_segment *var, int seg); | ||
623 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
624 | struct kvm_segment *var, int seg); | ||
618 | 625 | ||
619 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 626 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
620 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 627 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
@@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) | |||
789 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; | 796 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; |
790 | } | 797 | } |
791 | 798 | ||
799 | static inline bool cpu_has_vmx_ept_ad_bits(void) | ||
800 | { | ||
801 | return vmx_capability.ept & VMX_EPT_AD_BIT; | ||
802 | } | ||
803 | |||
792 | static inline bool cpu_has_vmx_invept_individual_addr(void) | 804 | static inline bool cpu_has_vmx_invept_individual_addr(void) |
793 | { | 805 | { |
794 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; | 806 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; |
@@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void) | |||
849 | SECONDARY_EXEC_RDTSCP; | 861 | SECONDARY_EXEC_RDTSCP; |
850 | } | 862 | } |
851 | 863 | ||
864 | static inline bool cpu_has_vmx_invpcid(void) | ||
865 | { | ||
866 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
867 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
868 | } | ||
869 | |||
852 | static inline bool cpu_has_virtual_nmis(void) | 870 | static inline bool cpu_has_virtual_nmis(void) |
853 | { | 871 | { |
854 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; | 872 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; |
@@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void) | |||
1739 | return cpu_has_vmx_rdtscp(); | 1757 | return cpu_has_vmx_rdtscp(); |
1740 | } | 1758 | } |
1741 | 1759 | ||
1760 | static bool vmx_invpcid_supported(void) | ||
1761 | { | ||
1762 | return cpu_has_vmx_invpcid() && enable_ept; | ||
1763 | } | ||
1764 | |||
1742 | /* | 1765 | /* |
1743 | * Swap MSR entry in host/guest MSR entry array. | 1766 | * Swap MSR entry in host/guest MSR entry array. |
1744 | */ | 1767 | */ |
@@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
2458 | SECONDARY_EXEC_ENABLE_EPT | | 2481 | SECONDARY_EXEC_ENABLE_EPT | |
2459 | SECONDARY_EXEC_UNRESTRICTED_GUEST | | 2482 | SECONDARY_EXEC_UNRESTRICTED_GUEST | |
2460 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | | 2483 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | |
2461 | SECONDARY_EXEC_RDTSCP; | 2484 | SECONDARY_EXEC_RDTSCP | |
2485 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
2462 | if (adjust_vmx_controls(min2, opt2, | 2486 | if (adjust_vmx_controls(min2, opt2, |
2463 | MSR_IA32_VMX_PROCBASED_CTLS2, | 2487 | MSR_IA32_VMX_PROCBASED_CTLS2, |
2464 | &_cpu_based_2nd_exec_control) < 0) | 2488 | &_cpu_based_2nd_exec_control) < 0) |
@@ -2645,8 +2669,12 @@ static __init int hardware_setup(void) | |||
2645 | !cpu_has_vmx_ept_4levels()) { | 2669 | !cpu_has_vmx_ept_4levels()) { |
2646 | enable_ept = 0; | 2670 | enable_ept = 0; |
2647 | enable_unrestricted_guest = 0; | 2671 | enable_unrestricted_guest = 0; |
2672 | enable_ept_ad_bits = 0; | ||
2648 | } | 2673 | } |
2649 | 2674 | ||
2675 | if (!cpu_has_vmx_ept_ad_bits()) | ||
2676 | enable_ept_ad_bits = 0; | ||
2677 | |||
2650 | if (!cpu_has_vmx_unrestricted_guest()) | 2678 | if (!cpu_has_vmx_unrestricted_guest()) |
2651 | enable_unrestricted_guest = 0; | 2679 | enable_unrestricted_guest = 0; |
2652 | 2680 | ||
@@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
2770 | { | 2798 | { |
2771 | unsigned long flags; | 2799 | unsigned long flags; |
2772 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2800 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2801 | struct kvm_segment var; | ||
2773 | 2802 | ||
2774 | if (enable_unrestricted_guest) | 2803 | if (enable_unrestricted_guest) |
2775 | return; | 2804 | return; |
@@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
2813 | if (emulate_invalid_guest_state) | 2842 | if (emulate_invalid_guest_state) |
2814 | goto continue_rmode; | 2843 | goto continue_rmode; |
2815 | 2844 | ||
2816 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | 2845 | vmx_get_segment(vcpu, &var, VCPU_SREG_SS); |
2817 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | 2846 | vmx_set_segment(vcpu, &var, VCPU_SREG_SS); |
2818 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | 2847 | |
2848 | vmx_get_segment(vcpu, &var, VCPU_SREG_CS); | ||
2849 | vmx_set_segment(vcpu, &var, VCPU_SREG_CS); | ||
2850 | |||
2851 | vmx_get_segment(vcpu, &var, VCPU_SREG_ES); | ||
2852 | vmx_set_segment(vcpu, &var, VCPU_SREG_ES); | ||
2853 | |||
2854 | vmx_get_segment(vcpu, &var, VCPU_SREG_DS); | ||
2855 | vmx_set_segment(vcpu, &var, VCPU_SREG_DS); | ||
2819 | 2856 | ||
2820 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | 2857 | vmx_get_segment(vcpu, &var, VCPU_SREG_GS); |
2821 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | 2858 | vmx_set_segment(vcpu, &var, VCPU_SREG_GS); |
2822 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
2823 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
2824 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | ||
2825 | 2859 | ||
2826 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); | 2860 | vmx_get_segment(vcpu, &var, VCPU_SREG_FS); |
2827 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); | 2861 | vmx_set_segment(vcpu, &var, VCPU_SREG_FS); |
2828 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); | ||
2829 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); | ||
2830 | 2862 | ||
2831 | continue_rmode: | 2863 | continue_rmode: |
2832 | kvm_mmu_reset_context(vcpu); | 2864 | kvm_mmu_reset_context(vcpu); |
@@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa) | |||
3027 | /* TODO write the value reading from MSR */ | 3059 | /* TODO write the value reading from MSR */ |
3028 | eptp = VMX_EPT_DEFAULT_MT | | 3060 | eptp = VMX_EPT_DEFAULT_MT | |
3029 | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; | 3061 | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; |
3062 | if (enable_ept_ad_bits) | ||
3063 | eptp |= VMX_EPT_AD_ENABLE_BIT; | ||
3030 | eptp |= (root_hpa & PAGE_MASK); | 3064 | eptp |= (root_hpa & PAGE_MASK); |
3031 | 3065 | ||
3032 | return eptp; | 3066 | return eptp; |
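construct_eptp() above now sets the accessed/dirty enable bit when enable_ept_ad_bits is on. This is the EPTP field layout it fills in, per the Intel SDM; the constants below are local stand-ins for the kernel's VMX_EPT_* definitions:

    #include <stdint.h>

    #define EPT_MT_WB       6ull          /* bits 2:0 - write-back memory type */
    #define EPT_WALK_LEN_4  (3ull << 3)   /* bits 5:3 - page-walk length minus 1 */
    #define EPT_AD_ENABLE   (1ull << 6)   /* bit 6 - enable accessed/dirty flags */

    static uint64_t build_eptp(uint64_t root_hpa, int ad_bits_enabled)
    {
        uint64_t eptp = EPT_MT_WB | EPT_WALK_LEN_4;

        if (ad_bits_enabled)
            eptp |= EPT_AD_ENABLE;
        return eptp | (root_hpa & ~0xfffull);   /* PML4 physical address */
    }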
@@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) | |||
3153 | 3187 | ||
3154 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | 3188 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) |
3155 | { | 3189 | { |
3190 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
3191 | |||
3192 | /* | ||
3193 | * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations | ||
3194 | * fail; use the cache instead. | ||
3195 | */ | ||
3196 | if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { | ||
3197 | return vmx->cpl; | ||
3198 | } | ||
3199 | |||
3156 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { | 3200 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { |
3157 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | 3201 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); |
3158 | to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); | 3202 | vmx->cpl = __vmx_get_cpl(vcpu); |
3159 | } | 3203 | } |
3160 | return to_vmx(vcpu)->cpl; | 3204 | |
3205 | return vmx->cpl; | ||
3161 | } | 3206 | } |
3162 | 3207 | ||
3163 | 3208 | ||
@@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) | |||
3165 | { | 3210 | { |
3166 | u32 ar; | 3211 | u32 ar; |
3167 | 3212 | ||
3168 | if (var->unusable) | 3213 | if (var->unusable || !var->present) |
3169 | ar = 1 << 16; | 3214 | ar = 1 << 16; |
3170 | else { | 3215 | else { |
3171 | ar = var->type & 15; | 3216 | ar = var->type & 15; |
@@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) | |||
3177 | ar |= (var->db & 1) << 14; | 3222 | ar |= (var->db & 1) << 14; |
3178 | ar |= (var->g & 1) << 15; | 3223 | ar |= (var->g & 1) << 15; |
3179 | } | 3224 | } |
3180 | if (ar == 0) /* a 0 value means unusable */ | ||
3181 | ar = AR_UNUSABLE_MASK; | ||
3182 | 3225 | ||
3183 | return ar; | 3226 | return ar; |
3184 | } | 3227 | } |
@@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3229 | 3272 | ||
3230 | vmcs_write32(sf->ar_bytes, ar); | 3273 | vmcs_write32(sf->ar_bytes, ar); |
3231 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | 3274 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); |
3275 | |||
3276 | /* | ||
3277 | * Fix segments for a real-mode guest on hosts that don't have | ||
3278 | * "unrestricted guest" support, or where it is disabled. | ||
3279 | * This is done to allow migrating guests from hosts with | ||
3280 | * unrestricted guest support, like Westmere, to older hosts | ||
3281 | * that don't have it, like Nehalem. | ||
3282 | */ | ||
3283 | if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { | ||
3284 | switch (seg) { | ||
3285 | case VCPU_SREG_CS: | ||
3286 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | ||
3287 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
3288 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
3289 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
3290 | vmcs_write16(GUEST_CS_SELECTOR, | ||
3291 | vmcs_readl(GUEST_CS_BASE) >> 4); | ||
3292 | break; | ||
3293 | case VCPU_SREG_ES: | ||
3294 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); | ||
3295 | break; | ||
3296 | case VCPU_SREG_DS: | ||
3297 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); | ||
3298 | break; | ||
3299 | case VCPU_SREG_GS: | ||
3300 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); | ||
3301 | break; | ||
3302 | case VCPU_SREG_FS: | ||
3303 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); | ||
3304 | break; | ||
3305 | case VCPU_SREG_SS: | ||
3306 | vmcs_write16(GUEST_SS_SELECTOR, | ||
3307 | vmcs_readl(GUEST_SS_BASE) >> 4); | ||
3308 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | ||
3309 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | ||
3310 | break; | ||
3311 | } | ||
3312 | } | ||
3232 | } | 3313 | } |
3233 | 3314 | ||
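The vmx_set_segment() fixups above re-encode segments the way vm86-based real-mode emulation expects: selector = base >> 4, a 64 KiB limit, and access rights 0xf3 (present, DPL 3, writable, accessed data segment). A standalone sketch of that encoding; struct rm_seg is illustrative:

    #include <stdint.h>

    struct rm_seg {
        uint16_t selector;
        uint32_t base, limit, ar;
    };

    static struct rm_seg make_real_mode_seg(uint32_t base)
    {
        struct rm_seg s;

        s.base = base;
        s.selector = (uint16_t)(base >> 4);  /* real mode: base = selector * 16 */
        s.limit = 0xffff;                    /* 64 KiB segment */
        s.ar = 0xf3;                         /* present, DPL 3, writable data */
        return s;
    }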
3234 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 3315 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
@@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | |||
3731 | if (!enable_ept) { | 3812 | if (!enable_ept) { |
3732 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | 3813 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; |
3733 | enable_unrestricted_guest = 0; | 3814 | enable_unrestricted_guest = 0; |
3815 | /* Enabling INVPCID for non-EPT guests may cause a performance regression. */ | ||
3816 | exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
3734 | } | 3817 | } |
3735 | if (!enable_unrestricted_guest) | 3818 | if (!enable_unrestricted_guest) |
3736 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | 3819 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; |
@@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
4489 | break; | 4572 | break; |
4490 | } | 4573 | } |
4491 | vcpu->run->exit_reason = 0; | 4574 | vcpu->run->exit_reason = 0; |
4492 | pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", | 4575 | vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", |
4493 | (int)(exit_qualification >> 4) & 3, cr); | 4576 | (int)(exit_qualification >> 4) & 3, cr); |
4494 | return 0; | 4577 | return 0; |
4495 | } | 4578 | } |
@@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
4769 | { | 4852 | { |
4770 | unsigned long exit_qualification; | 4853 | unsigned long exit_qualification; |
4771 | gpa_t gpa; | 4854 | gpa_t gpa; |
4855 | u32 error_code; | ||
4772 | int gla_validity; | 4856 | int gla_validity; |
4773 | 4857 | ||
4774 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 4858 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
@@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
4793 | 4877 | ||
4794 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 4878 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
4795 | trace_kvm_page_fault(gpa, exit_qualification); | 4879 | trace_kvm_page_fault(gpa, exit_qualification); |
4796 | return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); | 4880 | |
4881 | /* Is it a write fault? */ | ||
4882 | error_code = exit_qualification & (1U << 1); | ||
4883 | /* Is the EPT page-table entry present? */ | ||
4884 | error_code |= (exit_qualification >> 3) & 0x1; | ||
4885 | |||
4886 | return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); | ||
4797 | } | 4887 | } |
4798 | 4888 | ||
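handle_ept_violation() above now derives a PFERR-style error code from the exit qualification so the fast page fault path can distinguish write faults and present translations. The relevant exit-qualification bits, per the Intel SDM; the constants are redefined locally for a standalone sketch:

    #include <stdint.h>

    #define EPT_VIOL_READ      (1ull << 0)   /* data read */
    #define EPT_VIOL_WRITE     (1ull << 1)   /* data write */
    #define EPT_VIOL_FETCH     (1ull << 2)   /* instruction fetch */
    #define EPT_VIOL_READABLE  (1ull << 3)   /* GPA was readable under EPT */

    static uint32_t ept_viol_to_pferr(uint64_t exit_qual)
    {
        uint32_t error_code;

        error_code = exit_qual & EPT_VIOL_WRITE;   /* maps to PFERR bit 1 */
        error_code |= (exit_qual >> 3) & 1;        /* "present" in PFERR bit 0 */
        return error_code;
    }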
4799 | static u64 ept_rsvd_mask(u64 spte, int level) | 4889 | static u64 ept_rsvd_mask(u64 spte, int level) |
@@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
4908 | int ret = 1; | 4998 | int ret = 1; |
4909 | u32 cpu_exec_ctrl; | 4999 | u32 cpu_exec_ctrl; |
4910 | bool intr_window_requested; | 5000 | bool intr_window_requested; |
5001 | unsigned count = 130; | ||
4911 | 5002 | ||
4912 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 5003 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
4913 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; | 5004 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; |
4914 | 5005 | ||
4915 | while (!guest_state_valid(vcpu)) { | 5006 | while (!guest_state_valid(vcpu) && count-- != 0) { |
4916 | if (intr_window_requested | 5007 | if (intr_window_requested && vmx_interrupt_allowed(vcpu)) |
4917 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) | ||
4918 | return handle_interrupt_window(&vmx->vcpu); | 5008 | return handle_interrupt_window(&vmx->vcpu); |
4919 | 5009 | ||
5010 | if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) | ||
5011 | return 1; | ||
5012 | |||
4920 | err = emulate_instruction(vcpu, 0); | 5013 | err = emulate_instruction(vcpu, 0); |
4921 | 5014 | ||
4922 | if (err == EMULATE_DO_MMIO) { | 5015 | if (err == EMULATE_DO_MMIO) { |
@@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
4924 | goto out; | 5017 | goto out; |
4925 | } | 5018 | } |
4926 | 5019 | ||
4927 | if (err != EMULATE_DONE) | 5020 | if (err != EMULATE_DONE) { |
5021 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
5022 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
5023 | vcpu->run->internal.ndata = 0; | ||
4928 | return 0; | 5024 | return 0; |
5025 | } | ||
4929 | 5026 | ||
4930 | if (signal_pending(current)) | 5027 | if (signal_pending(current)) |
4931 | goto out; | 5028 | goto out; |
@@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
4933 | schedule(); | 5030 | schedule(); |
4934 | } | 5031 | } |
4935 | 5032 | ||
4936 | vmx->emulation_required = 0; | 5033 | vmx->emulation_required = !guest_state_valid(vcpu); |
4937 | out: | 5034 | out: |
4938 | return ret; | 5035 | return ret; |
4939 | } | 5036 | } |
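handle_invalid_guest_state() above gains an iteration cap and extra exit conditions so the big-real-mode emulation loop cannot monopolize a host CPU or starve pending events. A sketch of the loop's shape using callback stand-ins; struct emu_ops is illustrative, not a kernel interface:

    #include <stdbool.h>

    enum emu_result { EMU_DONE, EMU_MMIO, EMU_FAIL };

    struct emu_ops {
        bool (*state_valid)(void *ctx);
        bool (*event_pending)(void *ctx);
        enum emu_result (*emulate_one)(void *ctx);
    };

    /* Returns false only when emulation failed and userspace must see an
     * internal error; every other exit lets the caller re-enter the guest. */
    static bool emulate_until_valid(const struct emu_ops *ops, void *ctx)
    {
        unsigned count = 130;               /* same cap as the hunk above */

        while (!ops->state_valid(ctx) && count-- != 0) {
            if (ops->event_pending(ctx))
                return true;                /* inject the event first */

            switch (ops->emulate_one(ctx)) {
            case EMU_MMIO:
                return true;                /* userspace completes MMIO */
            case EMU_FAIL:
                return false;
            case EMU_DONE:
                break;
            }
        }
        return true;
    }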
@@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | |||
6467 | } | 6564 | } |
6468 | } | 6565 | } |
6469 | } | 6566 | } |
6567 | |||
6568 | exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
6569 | /* Expose INVPCID only when PCID is exposed. */ | ||
6570 | best = kvm_find_cpuid_entry(vcpu, 0x7, 0); | ||
6571 | if (vmx_invpcid_supported() && | ||
6572 | best && (best->ecx & bit(X86_FEATURE_INVPCID)) && | ||
6573 | guest_cpuid_has_pcid(vcpu)) { | ||
6574 | exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; | ||
6575 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
6576 | exec_control); | ||
6577 | } else { | ||
6578 | exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
6579 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
6580 | exec_control); | ||
6581 | if (best) | ||
6582 | best->ecx &= ~bit(X86_FEATURE_INVPCID); | ||
6583 | } | ||
6470 | } | 6584 | } |
6471 | 6585 | ||
6472 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 6586 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
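vmx_cpuid_update() above enables the INVPCID secondary exec control only when the hardware supports it with EPT and the guest's CPUID advertises both PCID and INVPCID, clearing the guest's CPUID bit otherwise. The gating pattern in isolation; the helper name and control-bit value are placeholders for this sketch:

    #include <stdbool.h>
    #include <stdint.h>

    #define EXEC_CTL_INVPCID (1u << 12)     /* placeholder control bit */

    static uint32_t gate_invpcid(uint32_t exec_control, bool hw_invpcid,
                                 bool guest_pcid, bool guest_invpcid)
    {
        if (hw_invpcid && guest_pcid && guest_invpcid)
            exec_control |= EXEC_CTL_INVPCID;
        else
            exec_control &= ~EXEC_CTL_INVPCID;  /* and hide it from CPUID */
        return exec_control;
    }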
@@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
7201 | .cpuid_update = vmx_cpuid_update, | 7315 | .cpuid_update = vmx_cpuid_update, |
7202 | 7316 | ||
7203 | .rdtscp_supported = vmx_rdtscp_supported, | 7317 | .rdtscp_supported = vmx_rdtscp_supported, |
7318 | .invpcid_supported = vmx_invpcid_supported, | ||
7204 | 7319 | ||
7205 | .set_supported_cpuid = vmx_set_supported_cpuid, | 7320 | .set_supported_cpuid = vmx_set_supported_cpuid, |
7206 | 7321 | ||
@@ -7230,23 +7345,21 @@ static int __init vmx_init(void) | |||
7230 | if (!vmx_io_bitmap_a) | 7345 | if (!vmx_io_bitmap_a) |
7231 | return -ENOMEM; | 7346 | return -ENOMEM; |
7232 | 7347 | ||
7348 | r = -ENOMEM; | ||
7349 | |||
7233 | vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); | 7350 | vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); |
7234 | if (!vmx_io_bitmap_b) { | 7351 | if (!vmx_io_bitmap_b) |
7235 | r = -ENOMEM; | ||
7236 | goto out; | 7352 | goto out; |
7237 | } | ||
7238 | 7353 | ||
7239 | vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); | 7354 | vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); |
7240 | if (!vmx_msr_bitmap_legacy) { | 7355 | if (!vmx_msr_bitmap_legacy) |
7241 | r = -ENOMEM; | ||
7242 | goto out1; | 7356 | goto out1; |
7243 | } | 7357 | |
7244 | 7358 | ||
7245 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); | 7359 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); |
7246 | if (!vmx_msr_bitmap_longmode) { | 7360 | if (!vmx_msr_bitmap_longmode) |
7247 | r = -ENOMEM; | ||
7248 | goto out2; | 7361 | goto out2; |
7249 | } | 7362 | |
7250 | 7363 | ||
7251 | /* | 7364 | /* |
7252 | * Allow direct access to the PC debug port (it is often used for I/O | 7365 | * Allow direct access to the PC debug port (it is often used for I/O |
@@ -7275,8 +7388,10 @@ static int __init vmx_init(void) | |||
7275 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | 7388 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); |
7276 | 7389 | ||
7277 | if (enable_ept) { | 7390 | if (enable_ept) { |
7278 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 7391 | kvm_mmu_set_mask_ptes(0ull, |
7279 | VMX_EPT_EXECUTABLE_MASK); | 7392 | (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, |
7393 | (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, | ||
7394 | 0ull, VMX_EPT_EXECUTABLE_MASK); | ||
7280 | ept_set_mmio_spte_mask(); | 7395 | ept_set_mmio_spte_mask(); |
7281 | kvm_enable_tdp(); | 7396 | kvm_enable_tdp(); |
7282 | } else | 7397 | } else |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa7..59b59508ff07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
528 | return 1; | 528 | return 1; |
529 | } | 529 | } |
530 | 530 | ||
531 | if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) | ||
532 | return 1; | ||
533 | |||
531 | kvm_x86_ops->set_cr0(vcpu, cr0); | 534 | kvm_x86_ops->set_cr0(vcpu, cr0); |
532 | 535 | ||
533 | if ((cr0 ^ old_cr0) & X86_CR0_PG) { | 536 | if ((cr0 ^ old_cr0) & X86_CR0_PG) { |
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
604 | kvm_read_cr3(vcpu))) | 607 | kvm_read_cr3(vcpu))) |
605 | return 1; | 608 | return 1; |
606 | 609 | ||
610 | if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { | ||
611 | if (!guest_cpuid_has_pcid(vcpu)) | ||
612 | return 1; | ||
613 | |||
614 | /* PCID cannot be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */ | ||
615 | if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) | ||
616 | return 1; | ||
617 | } | ||
618 | |||
607 | if (kvm_x86_ops->set_cr4(vcpu, cr4)) | 619 | if (kvm_x86_ops->set_cr4(vcpu, cr4)) |
608 | return 1; | 620 | return 1; |
609 | 621 | ||
610 | if ((cr4 ^ old_cr4) & pdptr_bits) | 622 | if (((cr4 ^ old_cr4) & pdptr_bits) || |
623 | (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) | ||
611 | kvm_mmu_reset_context(vcpu); | 624 | kvm_mmu_reset_context(vcpu); |
612 | 625 | ||
613 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) | 626 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) |
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
626 | } | 639 | } |
627 | 640 | ||
628 | if (is_long_mode(vcpu)) { | 641 | if (is_long_mode(vcpu)) { |
629 | if (cr3 & CR3_L_MODE_RESERVED_BITS) | 642 | if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { |
630 | return 1; | 643 | if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) |
644 | return 1; | ||
645 | } else | ||
646 | if (cr3 & CR3_L_MODE_RESERVED_BITS) | ||
647 | return 1; | ||
631 | } else { | 648 | } else { |
632 | if (is_pae(vcpu)) { | 649 | if (is_pae(vcpu)) { |
633 | if (cr3 & CR3_PAE_RESERVED_BITS) | 650 | if (cr3 & CR3_PAE_RESERVED_BITS) |
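The kvm_set_cr4()/kvm_set_cr3() hunks above enforce the architectural PCID rules: CR4.PCIDE can be set only in long mode with cr3[11:0] clear, and once PCIDE is on, CR3 gets a different reserved-bit mask because its low 12 bits carry the PCID. A condensed sketch of the enable check, with the constant redefined locally:

    #include <stdbool.h>
    #include <stdint.h>

    #define CR3_PCID_MASK 0xfffull          /* cr3[11:0] holds the PCID */

    static bool can_enable_pcide(uint64_t cr3, bool long_mode,
                                 bool guest_cpuid_has_pcid)
    {
        if (!guest_cpuid_has_pcid)
            return false;                   /* feature not exposed */
        if (!long_mode)
            return false;                   /* EFER.LMA must be 1 */
        return (cr3 & CR3_PCID_MASK) == 0;  /* PCID field must be zero */
    }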
@@ -795,6 +812,7 @@ static u32 msrs_to_save[] = { | |||
795 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 812 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
796 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 813 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
797 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, | 814 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
815 | MSR_KVM_PV_EOI_EN, | ||
798 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 816 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
799 | MSR_STAR, | 817 | MSR_STAR, |
800 | #ifdef CONFIG_X86_64 | 818 | #ifdef CONFIG_X86_64 |
@@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1437 | break; | 1455 | break; |
1438 | } | 1456 | } |
1439 | default: | 1457 | default: |
1440 | pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " | 1458 | vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " |
1441 | "data 0x%llx\n", msr, data); | 1459 | "data 0x%llx\n", msr, data); |
1442 | return 1; | 1460 | return 1; |
1443 | } | 1461 | } |
1444 | return 0; | 1462 | return 0; |
@@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1470 | case HV_X64_MSR_TPR: | 1488 | case HV_X64_MSR_TPR: |
1471 | return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); | 1489 | return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); |
1472 | default: | 1490 | default: |
1473 | pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " | 1491 | vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " |
1474 | "data 0x%llx\n", msr, data); | 1492 | "data 0x%llx\n", msr, data); |
1475 | return 1; | 1493 | return 1; |
1476 | } | 1494 | } |
1477 | 1495 | ||
@@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1551 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ | 1569 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ |
1552 | data &= ~(u64)0x8; /* ignore TLB cache disable */ | 1570 | data &= ~(u64)0x8; /* ignore TLB cache disable */ |
1553 | if (data != 0) { | 1571 | if (data != 0) { |
1554 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", | 1572 | vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", |
1555 | data); | 1573 | data); |
1556 | return 1; | 1574 | return 1; |
1557 | } | 1575 | } |
1558 | break; | 1576 | break; |
1559 | case MSR_FAM10H_MMIO_CONF_BASE: | 1577 | case MSR_FAM10H_MMIO_CONF_BASE: |
1560 | if (data != 0) { | 1578 | if (data != 0) { |
1561 | pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " | 1579 | vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " |
1562 | "0x%llx\n", data); | 1580 | "0x%llx\n", data); |
1563 | return 1; | 1581 | return 1; |
1564 | } | 1582 | } |
1565 | break; | 1583 | break; |
@@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1574 | thus reserved and should throw a #GP */ | 1592 | thus reserved and should throw a #GP */ |
1575 | return 1; | 1593 | return 1; |
1576 | } | 1594 | } |
1577 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", | 1595 | vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", |
1578 | __func__, data); | 1596 | __func__, data); |
1579 | break; | 1597 | break; |
1580 | case MSR_IA32_UCODE_REV: | 1598 | case MSR_IA32_UCODE_REV: |
1581 | case MSR_IA32_UCODE_WRITE: | 1599 | case MSR_IA32_UCODE_WRITE: |
@@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1653 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); | 1671 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); |
1654 | 1672 | ||
1655 | break; | 1673 | break; |
1674 | case MSR_KVM_PV_EOI_EN: | ||
1675 | if (kvm_lapic_enable_pv_eoi(vcpu, data)) | ||
1676 | return 1; | ||
1677 | break; | ||
1656 | 1678 | ||
1657 | case MSR_IA32_MCG_CTL: | 1679 | case MSR_IA32_MCG_CTL: |
1658 | case MSR_IA32_MCG_STATUS: | 1680 | case MSR_IA32_MCG_STATUS: |
@@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1671 | case MSR_K7_EVNTSEL2: | 1693 | case MSR_K7_EVNTSEL2: |
1672 | case MSR_K7_EVNTSEL3: | 1694 | case MSR_K7_EVNTSEL3: |
1673 | if (data != 0) | 1695 | if (data != 0) |
1674 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | 1696 | vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " |
1675 | "0x%x data 0x%llx\n", msr, data); | 1697 | "0x%x data 0x%llx\n", msr, data); |
1676 | break; | 1698 | break; |
1677 | /* at least RHEL 4 unconditionally writes to the perfctr registers, | 1699 | /* at least RHEL 4 unconditionally writes to the perfctr registers, |
1678 | * so we ignore writes to make it happy. | 1700 | * so we ignore writes to make it happy. |
@@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1681 | case MSR_K7_PERFCTR1: | 1703 | case MSR_K7_PERFCTR1: |
1682 | case MSR_K7_PERFCTR2: | 1704 | case MSR_K7_PERFCTR2: |
1683 | case MSR_K7_PERFCTR3: | 1705 | case MSR_K7_PERFCTR3: |
1684 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | 1706 | vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " |
1685 | "0x%x data 0x%llx\n", msr, data); | 1707 | "0x%x data 0x%llx\n", msr, data); |
1686 | break; | 1708 | break; |
1687 | case MSR_P6_PERFCTR0: | 1709 | case MSR_P6_PERFCTR0: |
1688 | case MSR_P6_PERFCTR1: | 1710 | case MSR_P6_PERFCTR1: |
@@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1693 | return kvm_pmu_set_msr(vcpu, msr, data); | 1715 | return kvm_pmu_set_msr(vcpu, msr, data); |
1694 | 1716 | ||
1695 | if (pr || data != 0) | 1717 | if (pr || data != 0) |
1696 | pr_unimpl(vcpu, "disabled perfctr wrmsr: " | 1718 | vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " |
1697 | "0x%x data 0x%llx\n", msr, data); | 1719 | "0x%x data 0x%llx\n", msr, data); |
1698 | break; | 1720 | break; |
1699 | case MSR_K7_CLK_CTL: | 1721 | case MSR_K7_CLK_CTL: |
1700 | /* | 1722 | /* |
@@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1720 | /* Drop writes to this legacy MSR -- see rdmsr | 1742 | /* Drop writes to this legacy MSR -- see rdmsr |
1721 | * counterpart for further detail. | 1743 | * counterpart for further detail. |
1722 | */ | 1744 | */ |
1723 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); | 1745 | vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); |
1724 | break; | 1746 | break; |
1725 | case MSR_AMD64_OSVW_ID_LENGTH: | 1747 | case MSR_AMD64_OSVW_ID_LENGTH: |
1726 | if (!guest_cpuid_has_osvw(vcpu)) | 1748 | if (!guest_cpuid_has_osvw(vcpu)) |
@@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1738 | if (kvm_pmu_msr(vcpu, msr)) | 1760 | if (kvm_pmu_msr(vcpu, msr)) |
1739 | return kvm_pmu_set_msr(vcpu, msr, data); | 1761 | return kvm_pmu_set_msr(vcpu, msr, data); |
1740 | if (!ignore_msrs) { | 1762 | if (!ignore_msrs) { |
1741 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", | 1763 | vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", |
1742 | msr, data); | 1764 | msr, data); |
1743 | return 1; | 1765 | return 1; |
1744 | } else { | 1766 | } else { |
1745 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", | 1767 | vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", |
1746 | msr, data); | 1768 | msr, data); |
1747 | break; | 1769 | break; |
1748 | } | 1770 | } |
1749 | } | 1771 | } |
@@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1846 | data = kvm->arch.hv_hypercall; | 1868 | data = kvm->arch.hv_hypercall; |
1847 | break; | 1869 | break; |
1848 | default: | 1870 | default: |
1849 | pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); | 1871 | vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); |
1850 | return 1; | 1872 | return 1; |
1851 | } | 1873 | } |
1852 | 1874 | ||
@@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1877 | data = vcpu->arch.hv_vapic; | 1899 | data = vcpu->arch.hv_vapic; |
1878 | break; | 1900 | break; |
1879 | default: | 1901 | default: |
1880 | pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); | 1902 | vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); |
1881 | return 1; | 1903 | return 1; |
1882 | } | 1904 | } |
1883 | *pdata = data; | 1905 | *pdata = data; |
@@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
2030 | if (kvm_pmu_msr(vcpu, msr)) | 2052 | if (kvm_pmu_msr(vcpu, msr)) |
2031 | return kvm_pmu_get_msr(vcpu, msr, pdata); | 2053 | return kvm_pmu_get_msr(vcpu, msr, pdata); |
2032 | if (!ignore_msrs) { | 2054 | if (!ignore_msrs) { |
2033 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 2055 | vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
2034 | return 1; | 2056 | return 1; |
2035 | } else { | 2057 | } else { |
2036 | pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); | 2058 | vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); |
2037 | data = 0; | 2059 | data = 0; |
2038 | } | 2060 | } |
2039 | break; | 2061 | break; |
@@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) | |||
4116 | value = kvm_get_cr8(vcpu); | 4138 | value = kvm_get_cr8(vcpu); |
4117 | break; | 4139 | break; |
4118 | default: | 4140 | default: |
4119 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4141 | kvm_err("%s: unexpected cr %u\n", __func__, cr); |
4120 | return 0; | 4142 | return 0; |
4121 | } | 4143 | } |
4122 | 4144 | ||
@@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) | |||
4145 | res = kvm_set_cr8(vcpu, val); | 4167 | res = kvm_set_cr8(vcpu, val); |
4146 | break; | 4168 | break; |
4147 | default: | 4169 | default: |
4148 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4170 | kvm_err("%s: unexpected cr %u\n", __func__, cr); |
4149 | res = -1; | 4171 | res = -1; |
4150 | } | 4172 | } |
4151 | 4173 | ||
@@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, | |||
4297 | return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); | 4319 | return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); |
4298 | } | 4320 | } |
4299 | 4321 | ||
4300 | static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, | 4322 | static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, |
4301 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) | 4323 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) |
4302 | { | 4324 | { |
4303 | struct kvm_cpuid_entry2 *cpuid = NULL; | 4325 | kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); |
4304 | |||
4305 | if (eax && ecx) | ||
4306 | cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), | ||
4307 | *eax, *ecx); | ||
4308 | |||
4309 | if (cpuid) { | ||
4310 | *eax = cpuid->eax; | ||
4311 | *ecx = cpuid->ecx; | ||
4312 | if (ebx) | ||
4313 | *ebx = cpuid->ebx; | ||
4314 | if (edx) | ||
4315 | *edx = cpuid->edx; | ||
4316 | return true; | ||
4317 | } | ||
4318 | |||
4319 | return false; | ||
4320 | } | 4326 | } |
4321 | 4327 | ||
4322 | static struct x86_emulate_ops emulate_ops = { | 4328 | static struct x86_emulate_ops emulate_ops = { |
@@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5296 | 5302 | ||
5297 | r = kvm_mmu_reload(vcpu); | 5303 | r = kvm_mmu_reload(vcpu); |
5298 | if (unlikely(r)) { | 5304 | if (unlikely(r)) { |
5299 | kvm_x86_ops->cancel_injection(vcpu); | 5305 | goto cancel_injection; |
5300 | goto out; | ||
5301 | } | 5306 | } |
5302 | 5307 | ||
5303 | preempt_disable(); | 5308 | preempt_disable(); |
@@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5322 | smp_wmb(); | 5327 | smp_wmb(); |
5323 | local_irq_enable(); | 5328 | local_irq_enable(); |
5324 | preempt_enable(); | 5329 | preempt_enable(); |
5325 | kvm_x86_ops->cancel_injection(vcpu); | ||
5326 | r = 1; | 5330 | r = 1; |
5327 | goto out; | 5331 | goto cancel_injection; |
5328 | } | 5332 | } |
5329 | 5333 | ||
5330 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5334 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
@@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5388 | if (unlikely(vcpu->arch.tsc_always_catchup)) | 5392 | if (unlikely(vcpu->arch.tsc_always_catchup)) |
5389 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 5393 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
5390 | 5394 | ||
5391 | kvm_lapic_sync_from_vapic(vcpu); | 5395 | if (vcpu->arch.apic_attention) |
5396 | kvm_lapic_sync_from_vapic(vcpu); | ||
5392 | 5397 | ||
5393 | r = kvm_x86_ops->handle_exit(vcpu); | 5398 | r = kvm_x86_ops->handle_exit(vcpu); |
5399 | return r; | ||
5400 | |||
5401 | cancel_injection: | ||
5402 | kvm_x86_ops->cancel_injection(vcpu); | ||
5403 | if (unlikely(vcpu->arch.apic_attention)) | ||
5404 | kvm_lapic_sync_from_vapic(vcpu); | ||
5394 | out: | 5405 | out: |
5395 | return r; | 5406 | return r; |
5396 | } | 5407 | } |
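The vcpu_enter_guest() change above replaces two inline cancel_injection calls with a single cancel_injection: label that also resyncs the vapic when needed, the usual kernel pattern of funneling every failed-entry path through one undo site. The generic shape of that pattern, with stand-in functions rather than the kernel code:

    static int prepare_a(void) { return 0; }    /* stand-ins for real work */
    static int prepare_b(void) { return -1; }
    static void undo_injected_events(void) { }

    static int enter_guest_sketch(void)
    {
        int r;

        r = prepare_a();
        if (r)
            goto cancel_injection;
        r = prepare_b();
        if (r)
            goto cancel_injection;
        /* ... run the guest ... */
        return 0;

    cancel_injection:
        undo_injected_events();     /* one undo site instead of copies */
        return r;
    }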
@@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, | |||
6304 | 6315 | ||
6305 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6316 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
6306 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { | 6317 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { |
6307 | vfree(free->arch.lpage_info[i]); | 6318 | kvm_kvfree(free->arch.lpage_info[i]); |
6308 | free->arch.lpage_info[i] = NULL; | 6319 | free->arch.lpage_info[i] = NULL; |
6309 | } | 6320 | } |
6310 | } | 6321 | } |
@@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
6323 | slot->base_gfn, level) + 1; | 6334 | slot->base_gfn, level) + 1; |
6324 | 6335 | ||
6325 | slot->arch.lpage_info[i] = | 6336 | slot->arch.lpage_info[i] = |
6326 | vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); | 6337 | kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); |
6327 | if (!slot->arch.lpage_info[i]) | 6338 | if (!slot->arch.lpage_info[i]) |
6328 | goto out_free; | 6339 | goto out_free; |
6329 | 6340 | ||
@@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
6350 | 6361 | ||
6351 | out_free: | 6362 | out_free: |
6352 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6363 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
6353 | vfree(slot->arch.lpage_info[i]); | 6364 | kvm_kvfree(slot->arch.lpage_info[i]); |
6354 | slot->arch.lpage_info[i] = NULL; | 6365 | slot->arch.lpage_info[i] = NULL; |
6355 | } | 6366 | } |
6356 | return -ENOMEM; | 6367 | return -ENOMEM; |