author     Linus Torvalds <torvalds@linux-foundation.org>  2013-02-24 16:07:18 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-02-24 16:07:18 -0500
commit     89f883372fa60f604d136924baf3e89ff1870e9e (patch)
tree       cb69b0a14957945ba00d3d392bf9ccbbef56f3b8 /arch/x86
parent     9e2d59ad580d590134285f361a0e80f0e98c0207 (diff)
parent     6b73a96065e89dc9fa75ba4f78b1aa3a3bbd0470 (diff)
Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
"KVM updates for the 3.9 merge window, including x86 real mode
emulation fixes, stronger memory slot interface restrictions, mmu_lock
spinlock hold time reduction, improved handling of large page faults
on shadow, initial APICv HW acceleration support, s390 channel IO
based virtio, amongst others"
* tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
Revert "KVM: MMU: lazily drop large spte"
x86: pvclock kvm: align allocation size to page size
KVM: nVMX: Remove redundant get_vmcs12 from nested_vmx_exit_handled_msr
x86 emulator: fix parity calculation for AAD instruction
KVM: PPC: BookE: Handle alignment interrupts
booke: Added DBCR4 SPR number
KVM: PPC: booke: Allow multiple exception types
KVM: PPC: booke: use vcpu reference from thread_struct
KVM: Remove user_alloc from struct kvm_memory_slot
KVM: VMX: disable apicv by default
KVM: s390: Fix handling of iscs.
KVM: MMU: cleanup __direct_map
KVM: MMU: remove pt_access in mmu_set_spte
KVM: MMU: cleanup mapping-level
KVM: MMU: lazily drop large spte
KVM: VMX: cleanup vmx_set_cr0().
KVM: VMX: add missing exit names to VMX_EXIT_REASONS array
KVM: VMX: disable SMEP feature when guest is in non-paging mode
KVM: Remove duplicate text in api.txt
Revert "KVM: MMU: split kvm_mmu_free_page"
...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_host.h    26
-rw-r--r--  arch/x86/include/asm/kvm_para.h     2
-rw-r--r--  arch/x86/include/asm/vmx.h         18
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h     9
-rw-r--r--  arch/x86/kernel/kvmclock.c         11
-rw-r--r--  arch/x86/kvm/emulate.c            673
-rw-r--r--  arch/x86/kvm/i8254.c                1
-rw-r--r--  arch/x86/kvm/i8259.c                2
-rw-r--r--  arch/x86/kvm/irq.c                 74
-rw-r--r--  arch/x86/kvm/lapic.c              140
-rw-r--r--  arch/x86/kvm/lapic.h               34
-rw-r--r--  arch/x86/kvm/mmu.c                168
-rw-r--r--  arch/x86/kvm/mmutrace.h             6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h        106
-rw-r--r--  arch/x86/kvm/svm.c                 24
-rw-r--r--  arch/x86/kvm/vmx.c                714
-rw-r--r--  arch/x86/kvm/x86.c                168
17 files changed, 1411 insertions(+), 765 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65e9c3a..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,10 +33,10 @@
 
 #define KVM_MAX_VCPUS 254
 #define KVM_SOFT_MAX_VCPUS 160
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS 125
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 3
+#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
 #define KVM_MMIO_SIZE 16
 
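As a sanity check on the new numbers (stand-alone illustration, not kernel code): the old split was 32 user slots plus 4 private ones, 36 total; the new split is 125 plus 3, raising the slot-array total to 128.

#include <stdio.h>

#define KVM_USER_MEM_SLOTS 125
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)

int main(void)
{
        printf("total slots: %d\n", KVM_MEM_SLOTS_NUM); /* prints 128 */
        return 0;
}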
@@ -219,11 +219,6 @@ struct kvm_mmu_page {
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-       /*
-        * One bit set per slot which has memory
-        * in this shadow page.
-        */
-       DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
        bool unsync;
        int root_count;          /* Currently serving as active root */
        unsigned int unsync_children;
@@ -502,6 +497,13 @@ struct kvm_vcpu_arch {
                u64 msr_val;
                struct gfn_to_hva_cache data;
        } pv_eoi;
+
+       /*
+        * Indicates whether the access faulted on its guest page table,
+        * set when fixing a page fault and used to detect unhandleable
+        * instructions.
+        */
+       bool write_fault_to_shadow_pgtable;
 };
 
 struct kvm_lpage_info {
@@ -697,6 +699,11 @@ struct kvm_x86_ops {
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+       int (*vm_has_apicv)(struct kvm *kvm);
+       void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+       void (*hwapic_isr_update)(struct kvm *kvm, int isr);
+       void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+       void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*get_tdp_level)(void);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -991,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65231e173baf..695399f2d5eb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -27,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
  *
  * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
  * The hypercall number should be placed in rax and the return value will be
- * placed in rax.  No other registers will be clobbered unless explicited
+ * placed in rax.  No other registers will be clobbered unless explicitly
  * noted by the particular hypercall.
  */
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 235b49fa554b..b6fbf860e398 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -57,9 +57,12 @@
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_RDTSCP                   0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING           0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST       0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING       0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID           0x00001000
 
@@ -97,6 +100,7 @@ enum vmcs_field {
        GUEST_GS_SELECTOR               = 0x0000080a,
        GUEST_LDTR_SELECTOR             = 0x0000080c,
        GUEST_TR_SELECTOR               = 0x0000080e,
+       GUEST_INTR_STATUS               = 0x00000810,
        HOST_ES_SELECTOR                = 0x00000c00,
        HOST_CS_SELECTOR                = 0x00000c02,
        HOST_SS_SELECTOR                = 0x00000c04,
@@ -124,6 +128,14 @@ enum vmcs_field {
        APIC_ACCESS_ADDR_HIGH           = 0x00002015,
        EPT_POINTER                     = 0x0000201a,
        EPT_POINTER_HIGH                = 0x0000201b,
+       EOI_EXIT_BITMAP0                = 0x0000201c,
+       EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+       EOI_EXIT_BITMAP1                = 0x0000201e,
+       EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+       EOI_EXIT_BITMAP2                = 0x00002020,
+       EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+       EOI_EXIT_BITMAP3                = 0x00002022,
+       EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
        GUEST_PHYSICAL_ADDRESS          = 0x00002400,
        GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
        VMCS_LINK_POINTER               = 0x00002800,
@@ -346,9 +358,9 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define TSS_PRIVATE_MEMSLOT                    (KVM_MEMORY_SLOTS + 0)
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       (KVM_MEMORY_SLOTS + 1)
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     (KVM_MEMORY_SLOTS + 2)
+#define TSS_PRIVATE_MEMSLOT                    (KVM_USER_MEM_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       (KVM_USER_MEM_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     (KVM_USER_MEM_SLOTS + 2)
 
 #define VMX_NR_VPIDS                           (1 << 16)
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT         1
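The four EOI_EXIT_BITMAPn fields added above hold one bit per interrupt vector, 64 bits each, covering all 256 vectors. A minimal sketch of where a given vector lands (assumes only that layout; not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned int vector = 0xec; /* any vector 0..255 */
        printf("vector 0x%02x -> EOI_EXIT_BITMAP%u, bit %u\n",
               vector, vector / 64, vector % 64);
        return 0;
}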
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 979d03bce135..2871fccfee68 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -62,10 +62,12 @@
 #define EXIT_REASON_MCE_DURING_VMENTRY  41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
+#define EXIT_REASON_APIC_WRITE          56
 #define EXIT_REASON_INVPCID             58
 
 #define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
        { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
        { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
        { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
-       { EXIT_REASON_WBINVD,                "WBINVD" }
+       { EXIT_REASON_WBINVD,                "WBINVD" }, \
+       { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
+       { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
+       { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
+       { EXIT_REASON_INVD,                  "INVD" }, \
+       { EXIT_REASON_INVPCID,               "INVPCID" }
 
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 9f966dc0b9e4..0732f0089a3d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -218,6 +218,9 @@ static void kvm_shutdown(void)
 void __init kvmclock_init(void)
 {
        unsigned long mem;
+       int size;
+
+       size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 
        if (!kvm_para_available())
                return;
@@ -231,16 +234,14 @@ void __init kvmclock_init(void)
        printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
                msr_kvm_system_time, msr_kvm_wall_clock);
 
-       mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
-                            PAGE_SIZE);
+       mem = memblock_alloc(size, PAGE_SIZE);
        if (!mem)
                return;
        hv_clock = __va(mem);
 
        if (kvm_register_clock("boot clock")) {
                hv_clock = NULL;
-               memblock_free(mem,
-                       sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+               memblock_free(mem, size);
                return;
        }
        pv_time_ops.sched_clock = kvm_clock_read;
@@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
        struct pvclock_vcpu_time_info *vcpu_time;
        unsigned int size;
 
-       size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+       size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 
        preempt_disable();
        cpu = smp_processor_id();
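The point of the PAGE_ALIGN change is that the allocation, the free, and the vsyscall mapping now all agree on a page-granular size. A stand-alone sketch of the rounding, with PAGE_SIZE assumed to be 4096 (not kernel code):

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long raw = 2048; /* stand-in for sizeof(...) * NR_CPUS */
        printf("raw=%lu aligned=%lu\n", raw, PAGE_ALIGN(raw)); /* 2048 -> 4096 */
        return 0;
}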
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a27e76371108..a335cc6cde72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
 #include "kvm_cache_regs.h"
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
+#include <linux/stringify.h>
 
 #include "x86.h"
 #include "tss.h"
@@ -43,7 +44,7 @@
 #define OpCL               9ull  /* CL register (for shifts) */
 #define OpImmByte         10ull  /* 8-bit sign extended immediate */
 #define OpOne             11ull  /* Implied 1 */
-#define OpImm             12ull  /* Sign extended immediate */
+#define OpImm             12ull  /* Sign extended up to 32-bit immediate */
 #define OpMem16           13ull  /* Memory operand (16-bit). */
 #define OpMem32           14ull  /* Memory operand (32-bit). */
 #define OpImmU            15ull  /* Immediate operand, zero extended */
@@ -58,6 +59,7 @@
 #define OpFS              24ull  /* FS */
 #define OpGS              25ull  /* GS */
 #define OpMem8            26ull  /* 8-bit zero extended memory operand */
+#define OpImm64           27ull  /* Sign extended 16/32/64-bit immediate */
 
 #define OpBits             5  /* Width of operand field */
 #define OpMask             ((1ull << OpBits) - 1)
@@ -101,6 +103,7 @@
 #define SrcMemFAddr (OpMemFAddr << SrcShift)
 #define SrcAcc      (OpAcc << SrcShift)
 #define SrcImmU16   (OpImmU16 << SrcShift)
+#define SrcImm64    (OpImm64 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
 #define SrcMem8     (OpMem8 << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
@@ -113,6 +116,7 @@
 #define GroupDual   (2<<15)     /* Alternate decoding of mod == 3 */
 #define Prefix      (3<<15)     /* Instruction varies with 66/f2/f3 prefix */
 #define RMExt       (4<<15)     /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape      (5<<15)     /* Escape to coprocessor instruction */
 #define Sse         (1<<18)     /* SSE Vector instruction */
 /* Generic ModRM decode. */
 #define ModRM       (1<<19)
@@ -146,6 +150,8 @@
 #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
 #define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
+#define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
+#define NoWrite     ((u64)1 << 45)  /* No writeback */
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -156,6 +162,27 @@
 #define X8(x...) X4(x), X4(x)
 #define X16(x...) X8(x), X8(x)
 
+#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
+#define FASTOP_SIZE 8
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst:    [rdx]:rax  (in/out)
+ * src:    rbx        (in/out)
+ * src2:   rcx        (in)
+ * flags:  rflags     (in/out)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+
+struct fastop;
+
 struct opcode {
        u64 flags : 56;
        u64 intercept : 8;
@@ -164,6 +191,8 @@ struct opcode {
                const struct opcode *group;
                const struct group_dual *gdual;
                const struct gprefix *gprefix;
+               const struct escape *esc;
+               void (*fastop)(struct fastop *fake);
        } u;
        int (*check_perm)(struct x86_emulate_ctxt *ctxt);
 };
@@ -180,6 +209,11 @@ struct gprefix {
        struct opcode pfx_f3;
 };
 
+struct escape {
+       struct opcode op[8];
+       struct opcode high[64];
+};
+
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
 #define EFLG_VIP (1<<20)
@@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
        } \
        } while (0)
 
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+
+#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
+#define FOP_RET   "ret \n\t"
+
+#define FOP_START(op) \
+       extern void em_##op(struct fastop *fake); \
+       asm(".pushsection .text, \"ax\" \n\t" \
+           ".global em_" #op " \n\t" \
+           FOP_ALIGN \
+           "em_" #op ": \n\t"
+
+#define FOP_END \
+           ".popsection")
+
+#define FOPNOP() FOP_ALIGN FOP_RET
+
+#define FOP1E(op,  dst) \
+       FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+
+#define FASTOP1(op) \
+       FOP_START(op) \
+       FOP1E(op##b, al) \
+       FOP1E(op##w, ax) \
+       FOP1E(op##l, eax) \
+       ON64(FOP1E(op##q, rax)) \
+       FOP_END
+
+#define FOP2E(op,  dst, src) \
+       FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
+
+#define FASTOP2(op) \
+       FOP_START(op) \
+       FOP2E(op##b, al, bl) \
+       FOP2E(op##w, ax, bx) \
+       FOP2E(op##l, eax, ebx) \
+       ON64(FOP2E(op##q, rax, rbx)) \
+       FOP_END
+
+/* 2 operand, word only */
+#define FASTOP2W(op) \
+       FOP_START(op) \
+       FOPNOP() \
+       FOP2E(op##w, ax, bx) \
+       FOP2E(op##l, eax, ebx) \
+       ON64(FOP2E(op##q, rax, rbx)) \
+       FOP_END
+
+/* 2 operand, src is CL */
+#define FASTOP2CL(op) \
+       FOP_START(op) \
+       FOP2E(op##b, al, cl) \
+       FOP2E(op##w, ax, cl) \
+       FOP2E(op##l, eax, cl) \
+       ON64(FOP2E(op##q, rax, cl)) \
+       FOP_END
+
+#define FOP3E(op,  dst, src, src2) \
+       FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+
+/* 3-operand, word-only, src2=cl */
+#define FASTOP3WCL(op) \
+       FOP_START(op) \
+       FOPNOP() \
+       FOP3E(op##w, ax, bx, cl) \
+       FOP3E(op##l, eax, ebx, cl) \
+       ON64(FOP3E(op##q, rax, rbx, cl)) \
+       FOP_END
+
+/* Special case for SETcc - 1 instruction per cc */
+#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
+
+FOP_START(setcc)
+FOP_SETCC(seto)
+FOP_SETCC(setno)
+FOP_SETCC(setc)
+FOP_SETCC(setnc)
+FOP_SETCC(setz)
+FOP_SETCC(setnz)
+FOP_SETCC(setbe)
+FOP_SETCC(setnbe)
+FOP_SETCC(sets)
+FOP_SETCC(setns)
+FOP_SETCC(setp)
+FOP_SETCC(setnp)
+FOP_SETCC(setl)
+FOP_SETCC(setnl)
+FOP_SETCC(setle)
+FOP_SETCC(setnle)
+FOP_END;
+
 #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
        do { \
                unsigned long _tmp; \
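To see the layout trick concretely, here is a minimal user-space sketch of the same idea (assumes x86-64 and GCC; the demo_add name is invented; build with -mno-red-zone, since the computed call pushes a return address below the stack pointer): four fixed-size "op; ret" stubs, selected purely by offset arithmetic.

#include <stdio.h>

/* Four 8-byte-aligned stubs, one per operand size, just like FASTOP2(add). */
asm(".pushsection .text, \"ax\"\n"
    ".global demo_add\n"
    ".align 8\n"
    "demo_add:\n"
    "   addb %bl, %al\n   ret\n"   /* demo_add + 0:  1-byte operands */
    ".align 8\n"
    "   addw %bx, %ax\n   ret\n"   /* demo_add + 8:  2-byte operands */
    ".align 8\n"
    "   addl %ebx, %eax\n   ret\n" /* demo_add + 16: 4-byte operands */
    ".align 8\n"
    "   addq %rbx, %rax\n   ret\n" /* demo_add + 24: 8-byte operands */
    ".popsection");

extern void demo_add(void);

int main(void)
{
        unsigned long dst = 40, src = 2;
        /* ctz(operand size) * 8 picks the stub, as fastop() does below. */
        void *stub = (char *)demo_add + __builtin_ctzl(sizeof(dst)) * 8;

        asm("call *%[stub]"
            : "+a"(dst), "+b"(src)
            : [stub] "r"(stub)
            : "cc");
        printf("40 + 2 = %lu\n", dst); /* prints 42 */
        return 0;
}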
@@ -663,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
        ulong la;
        u32 lim;
        u16 sel;
-       unsigned cpl, rpl;
+       unsigned cpl;
 
        la = seg_base(ctxt, addr.seg) + addr.ea;
        switch (ctxt->mode) {
@@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
                        goto bad;
                }
                cpl = ctxt->ops->cpl(ctxt);
-               if (ctxt->mode == X86EMUL_MODE_REAL)
-                       rpl = 0;
-               else
-                       rpl = sel & 3;
-               cpl = max(cpl, rpl);
                if (!(desc.type & 8)) {
                        /* data segment */
                        if (cpl > desc.dpl)
@@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
        return rc;
 }
 
-static int test_cc(unsigned int condition, unsigned int flags)
-{
-       int rc = 0;
-
-       switch ((condition & 15) >> 1) {
-       case 0: /* o */
-               rc |= (flags & EFLG_OF);
-               break;
-       case 1: /* b/c/nae */
-               rc |= (flags & EFLG_CF);
-               break;
-       case 2: /* z/e */
-               rc |= (flags & EFLG_ZF);
-               break;
-       case 3: /* be/na */
-               rc |= (flags & (EFLG_CF|EFLG_ZF));
-               break;
-       case 4: /* s */
-               rc |= (flags & EFLG_SF);
-               break;
-       case 5: /* p/pe */
-               rc |= (flags & EFLG_PF);
-               break;
-       case 7: /* le/ng */
-               rc |= (flags & EFLG_ZF);
-               /* fall through */
-       case 6: /* l/nge */
-               rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
-               break;
-       }
-
-       /* Odd condition identifiers (lsb == 1) have inverted sense. */
-       return (!!rc ^ (condition & 1));
+FASTOP2(add);
+FASTOP2(or);
+FASTOP2(adc);
+FASTOP2(sbb);
+FASTOP2(and);
+FASTOP2(sub);
+FASTOP2(xor);
+FASTOP2(cmp);
+FASTOP2(test);
+
+FASTOP3WCL(shld);
+FASTOP3WCL(shrd);
+
+FASTOP2W(imul);
+
+FASTOP1(not);
+FASTOP1(neg);
+FASTOP1(inc);
+FASTOP1(dec);
+
+FASTOP2CL(rol);
+FASTOP2CL(ror);
+FASTOP2CL(rcl);
+FASTOP2CL(rcr);
+FASTOP2CL(shl);
+FASTOP2CL(shr);
+FASTOP2CL(sar);
+
+FASTOP2W(bsf);
+FASTOP2W(bsr);
+FASTOP2W(bt);
+FASTOP2W(bts);
+FASTOP2W(btr);
+FASTOP2W(btc);
+
+static u8 test_cc(unsigned int condition, unsigned long flags)
+{
+       u8 rc;
+       void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+
+       flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
+       asm("push %[flags]; popf; call *%[fastop]"
+           : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
+       return rc;
 }
 
 static void fetch_register_operand(struct operand *op)
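The new test_cc() leans on the em_setcc block defined earlier: each "setcc %al; ret" pair is padded to 4 bytes and emitted in hardware condition-code order, so the handler for condition cc sits at em_setcc + 4*cc. An illustrative dump of that mapping (plain C, not kernel code):

#include <stdio.h>

static const char *cc_name[16] = {
        "o", "no", "b/c", "nb/nc", "z/e", "nz/ne", "be/na", "nbe/a",
        "s", "ns", "p/pe", "np/po", "l/nge", "nl/ge", "le/ng", "nle/g",
};

int main(void)
{
        for (unsigned int cc = 0; cc < 16; cc++)
                printf("cc %2u (%-6s) -> em_setcc + %2u\n",
                       cc, cc_name[cc], 4 * cc);
        return 0;
}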
@@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
        ctxt->ops->put_fpu(ctxt);
 }
 
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+       if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+               return emulate_nm(ctxt);
+
+       ctxt->ops->get_fpu(ctxt);
+       asm volatile("fninit");
+       ctxt->ops->put_fpu(ctxt);
+       return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+       u16 fcw;
+
+       if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+               return emulate_nm(ctxt);
+
+       ctxt->ops->get_fpu(ctxt);
+       asm volatile("fnstcw %0": "+m"(fcw));
+       ctxt->ops->put_fpu(ctxt);
+
+       /* force 2 byte destination */
+       ctxt->dst.bytes = 2;
+       ctxt->dst.val = fcw;
+
+       return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+       u16 fsw;
+
+       if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+               return emulate_nm(ctxt);
+
+       ctxt->ops->get_fpu(ctxt);
+       asm volatile("fnstsw %0": "+m"(fsw));
+       ctxt->ops->put_fpu(ctxt);
+
+       /* force 2 byte destination */
+       ctxt->dst.bytes = 2;
+       ctxt->dst.val = fsw;
+
+       return X86EMUL_CONTINUE;
+}
+
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
                                    struct operand *op)
 {
@@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 {
        int rc;
 
+       if (ctxt->d & NoWrite)
+               return X86EMUL_CONTINUE;
+
        switch (ctxt->dst.type) {
        case OP_REG:
                write_register_operand(&ctxt->dst);
@@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_grp2(struct x86_emulate_ctxt *ctxt)
-{
-       switch (ctxt->modrm_reg) {
-       case 0: /* rol */
-               emulate_2op_SrcB(ctxt, "rol");
-               break;
-       case 1: /* ror */
-               emulate_2op_SrcB(ctxt, "ror");
-               break;
-       case 2: /* rcl */
-               emulate_2op_SrcB(ctxt, "rcl");
-               break;
-       case 3: /* rcr */
-               emulate_2op_SrcB(ctxt, "rcr");
-               break;
-       case 4: /* sal/shl */
-       case 6: /* sal/shl */
-               emulate_2op_SrcB(ctxt, "sal");
-               break;
-       case 5: /* shr */
-               emulate_2op_SrcB(ctxt, "shr");
-               break;
-       case 7: /* sar */
-               emulate_2op_SrcB(ctxt, "sar");
-               break;
-       }
-       return X86EMUL_CONTINUE;
-}
-
-static int em_not(struct x86_emulate_ctxt *ctxt)
-{
-       ctxt->dst.val = ~ctxt->dst.val;
-       return X86EMUL_CONTINUE;
-}
-
-static int em_neg(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_1op(ctxt, "neg");
-       return X86EMUL_CONTINUE;
-}
-
 static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
 {
        u8 ex = 0;
@@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
        int rc = X86EMUL_CONTINUE;
 
        switch (ctxt->modrm_reg) {
-       case 0: /* inc */
-               emulate_1op(ctxt, "inc");
-               break;
-       case 1: /* dec */
-               emulate_1op(ctxt, "dec");
-               break;
        case 2: /* call near abs */ {
                long int old_eip;
                old_eip = ctxt->_eip;
@@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
        /* Save real source value, then compare EAX against destination. */
        ctxt->src.orig_val = ctxt->src.val;
        ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
-       emulate_2op_SrcV(ctxt, "cmp");
+       fastop(ctxt, em_cmp);
 
        if (ctxt->eflags & EFLG_ZF) {
                /* Success: write back to memory. */
@@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
        ctxt->src.type = OP_IMM;
        ctxt->src.val = 0;
        ctxt->src.bytes = 1;
-       emulate_2op_SrcV(ctxt, "or");
+       fastop(ctxt, em_or);
        ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
        if (cf)
                ctxt->eflags |= X86_EFLAGS_CF;
@@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+       u8 al = ctxt->dst.val & 0xff;
+       u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+       al = (al + (ah * ctxt->src.val)) & 0xff;
+
+       ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+       /* Set PF, ZF, SF */
+       ctxt->src.type = OP_IMM;
+       ctxt->src.val = 0;
+       ctxt->src.bytes = 1;
+       fastop(ctxt, em_or);
+
+       return X86EMUL_CONTINUE;
+}
+
 static int em_call(struct x86_emulate_ctxt *ctxt)
 {
        long rel = ctxt->src.val;
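For reference, a worked example of the AAD semantics em_aad() implements (AL = AL + AH*imm8, AH = 0, with PF/ZF/SF derived from the new AL, which is the parity fix called out in the shortlog). Stand-alone illustration, not kernel code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint16_t ax = 0x0207; /* unpacked BCD "27": AH=2, AL=7 */
        uint8_t imm = 10;     /* default AAD base */
        uint8_t al = (uint8_t)((ax & 0xff) + ((ax >> 8) & 0xff) * imm);

        ax = al;              /* AH cleared, AL = 27 = 0x1b */
        printf("AX after AAD: 0x%04x\n", ax);
        return 0;
}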
@@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_add(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "add");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_or(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "or");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_adc(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "adc");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_sbb(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "sbb");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_and(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "and");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_sub(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "sub");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_xor(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "xor");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_cmp(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "cmp");
-       /* Disable writeback. */
-       ctxt->dst.type = OP_NONE;
-       return X86EMUL_CONTINUE;
-}
-
-static int em_test(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV(ctxt, "test");
-       /* Disable writeback. */
-       ctxt->dst.type = OP_NONE;
-       return X86EMUL_CONTINUE;
-}
-
 static int em_xchg(struct x86_emulate_ctxt *ctxt)
 {
        /* Write back the register source. */
@@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_imul(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV_nobyte(ctxt, "imul");
-       return X86EMUL_CONTINUE;
-}
-
 static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
 {
        ctxt->dst.val = ctxt->src2.val;
-       return em_imul(ctxt);
+       return fastop(ctxt, em_imul);
 }
 
 static int em_cwd(struct x86_emulate_ctxt *ctxt)
@@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_bt(struct x86_emulate_ctxt *ctxt)
-{
-       /* Disable writeback. */
-       ctxt->dst.type = OP_NONE;
-       /* only subword offset */
-       ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-
-       emulate_2op_SrcV_nobyte(ctxt, "bt");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_bts(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV_nobyte(ctxt, "bts");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_btr(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV_nobyte(ctxt, "btr");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_btc(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV_nobyte(ctxt, "btc");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_bsf(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV_nobyte(ctxt, "bsf");
-       return X86EMUL_CONTINUE;
-}
-
-static int em_bsr(struct x86_emulate_ctxt *ctxt)
-{
-       emulate_2op_SrcV_nobyte(ctxt, "bsr");
-       return X86EMUL_CONTINUE;
-}
-
 static int em_cpuid(struct x86_emulate_ctxt *ctxt)
 {
        u32 eax, ebx, ecx, edx;
@@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
 #define II(_f, _e, _i) \
        { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
 #define IIP(_f, _e, _i, _p) \
@@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define D2bv(_f)      D((_f) | ByteOp), D(_f)
 #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
 #define I2bv(_f, _e)  I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e)  F((_f) | ByteOp, _e), F(_f, _e)
 #define I2bvIP(_f, _e, _i, _p) \
        IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
 
-#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
-               I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
-               I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+               F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+               F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
 
 static const struct opcode group7_rm1[] = {
        DI(SrcNone | Priv, monitor),
@@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = {
 };
 
 static const struct opcode group1[] = {
-       I(Lock, em_add),
-       I(Lock | PageTable, em_or),
-       I(Lock, em_adc),
-       I(Lock, em_sbb),
-       I(Lock | PageTable, em_and),
-       I(Lock, em_sub),
-       I(Lock, em_xor),
-       I(0, em_cmp),
+       F(Lock, em_add),
+       F(Lock | PageTable, em_or),
+       F(Lock, em_adc),
+       F(Lock, em_sbb),
+       F(Lock | PageTable, em_and),
+       F(Lock, em_sub),
+       F(Lock, em_xor),
+       F(NoWrite, em_cmp),
 };
 
 static const struct opcode group1A[] = {
        I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
+static const struct opcode group2[] = {
+       F(DstMem | ModRM, em_rol),
+       F(DstMem | ModRM, em_ror),
+       F(DstMem | ModRM, em_rcl),
+       F(DstMem | ModRM, em_rcr),
+       F(DstMem | ModRM, em_shl),
+       F(DstMem | ModRM, em_shr),
+       F(DstMem | ModRM, em_shl),
+       F(DstMem | ModRM, em_sar),
+};
+
 static const struct opcode group3[] = {
-       I(DstMem | SrcImm, em_test),
-       I(DstMem | SrcImm, em_test),
-       I(DstMem | SrcNone | Lock, em_not),
-       I(DstMem | SrcNone | Lock, em_neg),
+       F(DstMem | SrcImm | NoWrite, em_test),
+       F(DstMem | SrcImm | NoWrite, em_test),
+       F(DstMem | SrcNone | Lock, em_not),
+       F(DstMem | SrcNone | Lock, em_neg),
        I(SrcMem, em_mul_ex),
        I(SrcMem, em_imul_ex),
        I(SrcMem, em_div_ex),
@@ -3640,14 +3701,14 @@ static const struct opcode group3[] = {
 };
 
 static const struct opcode group4[] = {
-       I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
-       I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+       F(ByteOp | DstMem | SrcNone | Lock, em_inc),
+       F(ByteOp | DstMem | SrcNone | Lock, em_dec),
        N, N, N, N, N, N,
 };
 
 static const struct opcode group5[] = {
-       I(DstMem | SrcNone | Lock, em_grp45),
-       I(DstMem | SrcNone | Lock, em_grp45),
+       F(DstMem | SrcNone | Lock, em_inc),
+       F(DstMem | SrcNone | Lock, em_dec),
        I(SrcMem | Stack, em_grp45),
        I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
        I(SrcMem | Stack, em_grp45),
@@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { {
 
 static const struct opcode group8[] = {
        N, N, N, N,
-       I(DstMem | SrcImmByte, em_bt),
-       I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
-       I(DstMem | SrcImmByte | Lock, em_btr),
-       I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+       F(DstMem | SrcImmByte | NoWrite, em_bt),
+       F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+       F(DstMem | SrcImmByte | Lock, em_btr),
+       F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
 };
 
 static const struct group_dual group9 = { {
@@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = {
        I(0, em_mov), N, N, N,
 };
 
+static const struct escape escape_d9 = { {
+       N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+}, {
+       /* 0xC0 - 0xC7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xC8 - 0xCF */
+       N, N, N, N, N, N, N, N,
+       /* 0xD0 - 0xD7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xD8 - 0xDF */
+       N, N, N, N, N, N, N, N,
+       /* 0xE0 - 0xE7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xE8 - 0xEF */
+       N, N, N, N, N, N, N, N,
+       /* 0xF0 - 0xF7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xF8 - 0xFF */
+       N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+       N, N, N, N, N, N, N, N,
+}, {
+       /* 0xC0 - 0xC7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xC8 - 0xCF */
+       N, N, N, N, N, N, N, N,
+       /* 0xD0 - 0xD7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xD8 - 0xDF */
+       N, N, N, N, N, N, N, N,
+       /* 0xE0 - 0xE7 */
+       N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+       /* 0xE8 - 0xEF */
+       N, N, N, N, N, N, N, N,
+       /* 0xF0 - 0xF7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xF8 - 0xFF */
+       N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+       N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+}, {
+       /* 0xC0 - 0xC7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xC8 - 0xCF */
+       N, N, N, N, N, N, N, N,
+       /* 0xD0 - 0xD7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xD8 - 0xDF */
+       N, N, N, N, N, N, N, N,
+       /* 0xE0 - 0xE7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xE8 - 0xEF */
+       N, N, N, N, N, N, N, N,
+       /* 0xF0 - 0xF7 */
+       N, N, N, N, N, N, N, N,
+       /* 0xF8 - 0xFF */
+       N, N, N, N, N, N, N, N,
+} };
+
 static const struct opcode opcode_table[256] = {
        /* 0x00 - 0x07 */
-       I6ALU(Lock, em_add),
+       F6ALU(Lock, em_add),
        I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
        I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
        /* 0x08 - 0x0F */
-       I6ALU(Lock | PageTable, em_or),
+       F6ALU(Lock | PageTable, em_or),
        I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
        N,
        /* 0x10 - 0x17 */
-       I6ALU(Lock, em_adc),
+       F6ALU(Lock, em_adc),
        I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
        I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
        /* 0x18 - 0x1F */
-       I6ALU(Lock, em_sbb),
+       F6ALU(Lock, em_sbb),
        I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
        I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
        /* 0x20 - 0x27 */
-       I6ALU(Lock | PageTable, em_and), N, N,
+       F6ALU(Lock | PageTable, em_and), N, N,
        /* 0x28 - 0x2F */
-       I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+       F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
        /* 0x30 - 0x37 */
-       I6ALU(Lock, em_xor), N, N,
+       F6ALU(Lock, em_xor), N, N,
        /* 0x38 - 0x3F */
-       I6ALU(0, em_cmp), N, N,
+       F6ALU(NoWrite, em_cmp), N, N,
        /* 0x40 - 0x4F */
-       X16(D(DstReg)),
+       X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
        /* 0x50 - 0x57 */
        X8(I(SrcReg | Stack, em_push)),
        /* 0x58 - 0x5F */
@@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = {
        G(DstMem | SrcImm, group1),
        G(ByteOp | DstMem | SrcImm | No64, group1),
        G(DstMem | SrcImmByte, group1),
-       I2bv(DstMem | SrcReg | ModRM, em_test),
+       F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
        I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
        /* 0x88 - 0x8F */
        I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
@@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = {
        I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
        I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
        I2bv(SrcSI | DstDI | Mov | String, em_mov),
-       I2bv(SrcSI | DstDI | String, em_cmp),
+       F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
        /* 0xA8 - 0xAF */
-       I2bv(DstAcc | SrcImm, em_test),
+       F2bv(DstAcc | SrcImm | NoWrite, em_test),
        I2bv(SrcAcc | DstDI | Mov | String, em_mov),
        I2bv(SrcSI | DstAcc | Mov | String, em_mov),
-       I2bv(SrcAcc | DstDI | String, em_cmp),
+       F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
        /* 0xB0 - 0xB7 */
        X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
        /* 0xB8 - 0xBF */
-       X8(I(DstReg | SrcImm | Mov, em_mov)),
+       X8(I(DstReg | SrcImm64 | Mov, em_mov)),
        /* 0xC0 - 0xC7 */
-       D2bv(DstMem | SrcImmByte | ModRM),
+       G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
        I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
        I(ImplicitOps | Stack, em_ret),
        I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
@@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = {
        D(ImplicitOps), DI(SrcImmByte, intn),
        D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
        /* 0xD0 - 0xD7 */
-       D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
-       N, N, N, N,
+       G(Src2One | ByteOp, group2), G(Src2One, group2),
+       G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+       N, I(DstAcc | SrcImmByte | No64, em_aad), N, N,
        /* 0xD8 - 0xDF */
-       N, N, N, N, N, N, N, N,
+       N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
        /* 0xE0 - 0xE7 */
        X3(I(SrcImmByte, em_loop)),
        I(SrcImmByte, em_jcxz),
@@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = { | |||
3870 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), | 3995 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), |
3871 | /* 0xA0 - 0xA7 */ | 3996 | /* 0xA0 - 0xA7 */ |
3872 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), | 3997 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), |
3873 | II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), | 3998 | II(ImplicitOps, em_cpuid, cpuid), |
3874 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | 3999 | F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt), |
3875 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, | 4000 | F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld), |
4001 | F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, | ||
3876 | /* 0xA8 - 0xAF */ | 4002 | /* 0xA8 - 0xAF */ |
3877 | I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), | 4003 | I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), |
3878 | DI(ImplicitOps, rsm), | 4004 | DI(ImplicitOps, rsm), |
3879 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), | 4005 | F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), |
3880 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | 4006 | F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), |
3881 | D(DstMem | SrcReg | Src2CL | ModRM), | 4007 | F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), |
3882 | D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), | 4008 | D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), |
3883 | /* 0xB0 - 0xB7 */ | 4009 | /* 0xB0 - 0xB7 */ |
3884 | I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), | 4010 | I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), |
3885 | I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), | 4011 | I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), |
3886 | I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), | 4012 | F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), |
3887 | I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), | 4013 | I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), |
3888 | I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), | 4014 | I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), |
3889 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 4015 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
3890 | /* 0xB8 - 0xBF */ | 4016 | /* 0xB8 - 0xBF */ |
3891 | N, N, | 4017 | N, N, |
3892 | G(BitOp, group8), | 4018 | G(BitOp, group8), |
3893 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), | 4019 | F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), |
3894 | I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), | 4020 | F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), |
3895 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 4021 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
3896 | /* 0xC0 - 0xC7 */ | 4022 | /* 0xC0 - 0xC7 */ |
3897 | D2bv(DstMem | SrcReg | ModRM | Lock), | 4023 | D2bv(DstMem | SrcReg | ModRM | Lock), |
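Throughout this table the D()/I() entries for bt/bts/btr/btc, shld/shrd, imul, and bsf/bsr become F() entries, i.e. fastop stubs (see the fastop() trampoline added further down), so the arithmetic flags come from executing the real instruction rather than from hand-written flag math. The bt entry also gains NoWrite, since BT only reads its destination. A sketch of what the register-operand form of BT computes, assuming a 32-bit operand (the bit offset is reduced modulo the operand width):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t val = 0x10;
        unsigned int off = 36;                  /* 36 % 32 == 4 */
        int cf = (val >> (off & 31)) & 1;       /* CF gets the selected bit */

        printf("CF=%d, destination unchanged: %#x\n", cf, val);
        return 0;
}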
@@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
3950 | case 4: | 4076 | case 4: |
3951 | op->val = insn_fetch(s32, ctxt); | 4077 | op->val = insn_fetch(s32, ctxt); |
3952 | break; | 4078 | break; |
4079 | case 8: | ||
4080 | op->val = insn_fetch(s64, ctxt); | ||
4081 | break; | ||
3953 | } | 4082 | } |
3954 | if (!sign_extension) { | 4083 | if (!sign_extension) { |
3955 | switch (op->bytes) { | 4084 | switch (op->bytes) { |
@@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
4028 | case OpImm: | 4157 | case OpImm: |
4029 | rc = decode_imm(ctxt, op, imm_size(ctxt), true); | 4158 | rc = decode_imm(ctxt, op, imm_size(ctxt), true); |
4030 | break; | 4159 | break; |
4160 | case OpImm64: | ||
4161 | rc = decode_imm(ctxt, op, ctxt->op_bytes, true); | ||
4162 | break; | ||
4031 | case OpMem8: | 4163 | case OpMem8: |
4032 | ctxt->memop.bytes = 1; | 4164 | ctxt->memop.bytes = 1; |
4033 | goto mem_common; | 4165 | goto mem_common; |
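The new OpImm64 case pairs with the 8-byte branch added to decode_imm() above: the immediate is decoded at ctxt->op_bytes wide, which is 8 under REX.W, presumably for the mov r64, imm64 encodings. A sketch of the 64-bit immediate fetch (little-endian host assumed; the byte stream is illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* illustrative instruction-stream bytes, least significant first */
        const uint8_t stream[8] = { 0xef, 0xbe, 0xad, 0xde, 0, 0, 0, 0x80 };
        int64_t imm;

        memcpy(&imm, stream, sizeof(imm));      /* like insn_fetch(s64, ctxt) */
        printf("imm = %#llx\n", (unsigned long long)imm);
        return 0;
}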
@@ -4222,6 +4354,12 @@ done_prefixes: | |||
4222 | case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; | 4354 | case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; |
4223 | } | 4355 | } |
4224 | break; | 4356 | break; |
4357 | case Escape: | ||
4358 | if (ctxt->modrm > 0xbf) | ||
4359 | opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; | ||
4360 | else | ||
4361 | opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; | ||
4362 | break; | ||
4225 | default: | 4363 | default: |
4226 | return EMULATION_FAILED; | 4364 | return EMULATION_FAILED; |
4227 | } | 4365 | } |
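The Escape case resolves the FPU escape opcodes (0xd8-0xdf) wired into the table earlier: a ModRM byte of 0xc0 or above selects one of 64 register-form entries, anything lower uses the reg field (bits 5:3) to pick one of 8 memory-form entries. A toy model of that two-level lookup (struct layout and names are illustrative, not the kernel's):

#include <stdio.h>

struct toy_escape {
        const char *op[8];      /* memory forms, indexed by ModRM reg field */
        const char *high[64];   /* register forms, ModRM 0xc0..0xff */
};

static const char *toy_decode(const struct toy_escape *esc, unsigned char modrm)
{
        if (modrm > 0xbf)
                return esc->high[modrm - 0xc0];
        return esc->op[(modrm >> 3) & 7];
}

int main(void)
{
        struct toy_escape esc = {
                .op   = { [0] = "fld m32", [2] = "fst m32" },
                .high = { [0] = "fld st(0)" },
        };

        printf("%s / %s\n", toy_decode(&esc, 0x10), toy_decode(&esc, 0xc0));
        return 0;
}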
@@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, | |||
4354 | read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); | 4492 | read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); |
4355 | } | 4493 | } |
4356 | 4494 | ||
4495 | static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) | ||
4496 | { | ||
4497 | ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; | ||
4498 | fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; | ||
4499 | asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" | ||
4500 | : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) | ||
4501 | : "c"(ctxt->src2.val), [fastop]"S"(fop)); | ||
4502 | ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); | ||
4503 | return X86EMUL_CONTINUE; | ||
4504 | } | ||
4357 | 4505 | ||
4358 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | 4506 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) |
4359 | { | 4507 | { |
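The fastop() trampoline above swaps the guest's arithmetic flags in with push/popf, calls a size-specific stub at fop + __ffs(dst.bytes) * FASTOP_SIZE (the asm stubs are laid out at a fixed stride, one per operand size), and captures the resulting flags the same way. A portable sketch of just the size-indexed dispatch, with ordinary function pointers standing in for the fixed-stride stubs:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t (*op_fn)(uint64_t dst, uint64_t src);

static uint64_t add8(uint64_t d, uint64_t s)  { return (uint8_t)(d + s); }
static uint64_t add16(uint64_t d, uint64_t s) { return (uint16_t)(d + s); }
static uint64_t add32(uint64_t d, uint64_t s) { return (uint32_t)(d + s); }
static uint64_t add64(uint64_t d, uint64_t s) { return d + s; }

int main(void)
{
        /* __ffs(bytes) maps operand sizes 1,2,4,8 to indices 0,1,2,3 */
        op_fn table[] = { add8, add16, add32, add64 };
        unsigned int bytes = 4;
        uint64_t r = table[__builtin_ctz(bytes)](0xfffffff0ULL, 0x20);

        printf("result = %#llx\n", (unsigned long long)r); /* wraps at 32 bits */
        return 0;
}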
@@ -4483,6 +4631,13 @@ special_insn: | |||
4483 | } | 4631 | } |
4484 | 4632 | ||
4485 | if (ctxt->execute) { | 4633 | if (ctxt->execute) { |
4634 | if (ctxt->d & Fastop) { | ||
4635 | void (*fop)(struct fastop *) = (void *)ctxt->execute; | ||
4636 | rc = fastop(ctxt, fop); | ||
4637 | if (rc != X86EMUL_CONTINUE) | ||
4638 | goto done; | ||
4639 | goto writeback; | ||
4640 | } | ||
4486 | rc = ctxt->execute(ctxt); | 4641 | rc = ctxt->execute(ctxt); |
4487 | if (rc != X86EMUL_CONTINUE) | 4642 | if (rc != X86EMUL_CONTINUE) |
4488 | goto done; | 4643 | goto done; |
@@ -4493,12 +4648,6 @@ special_insn: | |||
4493 | goto twobyte_insn; | 4648 | goto twobyte_insn; |
4494 | 4649 | ||
4495 | switch (ctxt->b) { | 4650 | switch (ctxt->b) { |
4496 | case 0x40 ... 0x47: /* inc r16/r32 */ | ||
4497 | emulate_1op(ctxt, "inc"); | ||
4498 | break; | ||
4499 | case 0x48 ... 0x4f: /* dec r16/r32 */ | ||
4500 | emulate_1op(ctxt, "dec"); | ||
4501 | break; | ||
4502 | case 0x63: /* movsxd */ | 4651 | case 0x63: /* movsxd */ |
4503 | if (ctxt->mode != X86EMUL_MODE_PROT64) | 4652 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
4504 | goto cannot_emulate; | 4653 | goto cannot_emulate; |
@@ -4523,9 +4672,6 @@ special_insn: | |||
4523 | case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; | 4672 | case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; |
4524 | } | 4673 | } |
4525 | break; | 4674 | break; |
4526 | case 0xc0 ... 0xc1: | ||
4527 | rc = em_grp2(ctxt); | ||
4528 | break; | ||
4529 | case 0xcc: /* int3 */ | 4675 | case 0xcc: /* int3 */ |
4530 | rc = emulate_int(ctxt, 3); | 4676 | rc = emulate_int(ctxt, 3); |
4531 | break; | 4677 | break; |
@@ -4536,13 +4682,6 @@ special_insn: | |||
4536 | if (ctxt->eflags & EFLG_OF) | 4682 | if (ctxt->eflags & EFLG_OF) |
4537 | rc = emulate_int(ctxt, 4); | 4683 | rc = emulate_int(ctxt, 4); |
4538 | break; | 4684 | break; |
4539 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
4540 | rc = em_grp2(ctxt); | ||
4541 | break; | ||
4542 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
4543 | ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX); | ||
4544 | rc = em_grp2(ctxt); | ||
4545 | break; | ||
4546 | case 0xe9: /* jmp rel */ | 4685 | case 0xe9: /* jmp rel */ |
4547 | case 0xeb: /* jmp rel short */ | 4686 | case 0xeb: /* jmp rel short */ |
4548 | jmp_rel(ctxt, ctxt->src.val); | 4687 | jmp_rel(ctxt, ctxt->src.val); |
@@ -4661,14 +4800,6 @@ twobyte_insn: | |||
4661 | case 0x90 ... 0x9f: /* setcc r/m8 */ | 4800 | case 0x90 ... 0x9f: /* setcc r/m8 */ |
4662 | ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); | 4801 | ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); |
4663 | break; | 4802 | break; |
4664 | case 0xa4: /* shld imm8, r, r/m */ | ||
4665 | case 0xa5: /* shld cl, r, r/m */ | ||
4666 | emulate_2op_cl(ctxt, "shld"); | ||
4667 | break; | ||
4668 | case 0xac: /* shrd imm8, r, r/m */ | ||
4669 | case 0xad: /* shrd cl, r, r/m */ | ||
4670 | emulate_2op_cl(ctxt, "shrd"); | ||
4671 | break; | ||
4672 | case 0xae: /* clflush */ | 4803 | case 0xae: /* clflush */ |
4673 | break; | 4804 | break; |
4674 | case 0xb6 ... 0xb7: /* movzx */ | 4805 | case 0xb6 ... 0xb7: /* movzx */ |
@@ -4682,7 +4813,7 @@ twobyte_insn: | |||
4682 | (s16) ctxt->src.val; | 4813 | (s16) ctxt->src.val; |
4683 | break; | 4814 | break; |
4684 | case 0xc0 ... 0xc1: /* xadd */ | 4815 | case 0xc0 ... 0xc1: /* xadd */ |
4685 | emulate_2op_SrcV(ctxt, "add"); | 4816 | fastop(ctxt, em_add); |
4686 | /* Write back the register source. */ | 4817 | /* Write back the register source. */ |
4687 | ctxt->src.val = ctxt->dst.orig_val; | 4818 | ctxt->src.val = ctxt->dst.orig_val; |
4688 | write_register_operand(&ctxt->src); | 4819 | write_register_operand(&ctxt->src); |
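The xadd path now computes the sum and its flags through fastop(ctxt, em_add) and then writes the saved original destination back into the source operand, as the hunk above shows. The equivalent scalar sequence:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t dst = 7, src = 5;
        uint32_t orig = dst;    /* ctxt->dst.orig_val */

        dst += src;             /* em_add: sum plus arithmetic flags */
        src = orig;             /* old destination goes back to the source reg */
        printf("dst=%u src=%u\n", dst, src);    /* dst=12 src=7 */
        return 0;
}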
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 11300d2fa714..c1d30b2fc9bb 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm) | |||
122 | */ | 122 | */ |
123 | remaining = hrtimer_get_remaining(&ps->timer); | 123 | remaining = hrtimer_get_remaining(&ps->timer); |
124 | elapsed = ps->period - ktime_to_ns(remaining); | 124 | elapsed = ps->period - ktime_to_ns(remaining); |
125 | elapsed = mod_64(elapsed, ps->period); | ||
126 | 125 | ||
127 | return elapsed; | 126 | return elapsed; |
128 | } | 127 | } |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 848206df0967..cc31f7c06d3d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
241 | int irq, irq2, intno; | 241 | int irq, irq2, intno; |
242 | struct kvm_pic *s = pic_irqchip(kvm); | 242 | struct kvm_pic *s = pic_irqchip(kvm); |
243 | 243 | ||
244 | s->output = 0; | ||
245 | |||
244 | pic_lock(s); | 246 | pic_lock(s); |
245 | irq = pic_get_irq(&s->pics[0]); | 247 | irq = pic_get_irq(&s->pics[0]); |
246 | if (irq >= 0) { | 248 | if (irq >= 0) { |
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7e06ba1618bd..484bc874688b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) | |||
38 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); | 38 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); |
39 | 39 | ||
40 | /* | 40 | /* |
41 | * check if there is a pending interrupt from a | ||
42 | * non-APIC source, without intack. | ||
43 | */ | ||
44 | static int kvm_cpu_has_extint(struct kvm_vcpu *v) | ||
45 | { | ||
46 | if (kvm_apic_accept_pic_intr(v)) | ||
47 | return pic_irqchip(v->kvm)->output; /* PIC */ | ||
48 | else | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * check if there is an injectable interrupt: | ||
54 | * when virtual interrupt delivery is enabled, | ||
55 | * interrupts from the APIC are handled by hardware, | ||
56 | * so we don't need to check them here. | ||
57 | */ | ||
58 | int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) | ||
59 | { | ||
60 | if (!irqchip_in_kernel(v->kvm)) | ||
61 | return v->arch.interrupt.pending; | ||
62 | |||
63 | if (kvm_cpu_has_extint(v)) | ||
64 | return 1; | ||
65 | |||
66 | if (kvm_apic_vid_enabled(v->kvm)) | ||
67 | return 0; | ||
68 | |||
69 | return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ | ||
70 | } | ||
71 | |||
72 | /* | ||
41 | * check if there is pending interrupt without | 73 | * check if there is pending interrupt without |
42 | * intack. | 74 | * intack. |
43 | */ | 75 | */ |
44 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v) | 76 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v) |
45 | { | 77 | { |
46 | struct kvm_pic *s; | ||
47 | |||
48 | if (!irqchip_in_kernel(v->kvm)) | 78 | if (!irqchip_in_kernel(v->kvm)) |
49 | return v->arch.interrupt.pending; | 79 | return v->arch.interrupt.pending; |
50 | 80 | ||
51 | if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ | 81 | if (kvm_cpu_has_extint(v)) |
52 | if (kvm_apic_accept_pic_intr(v)) { | 82 | return 1; |
53 | s = pic_irqchip(v->kvm); /* PIC */ | 83 | |
54 | return s->output; | 84 | return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ |
55 | } else | ||
56 | return 0; | ||
57 | } | ||
58 | return 1; | ||
59 | } | 85 | } |
60 | EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); | 86 | EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); |
61 | 87 | ||
62 | /* | 88 | /* |
89 | * Read the pending interrupt (from a non-APIC source) | ||
90 | * vector and intack. | ||
91 | */ | ||
92 | static int kvm_cpu_get_extint(struct kvm_vcpu *v) | ||
93 | { | ||
94 | if (kvm_cpu_has_extint(v)) | ||
95 | return kvm_pic_read_irq(v->kvm); /* PIC */ | ||
96 | return -1; | ||
97 | } | ||
98 | |||
99 | /* | ||
63 | * Read pending interrupt vector and intack. | 100 | * Read pending interrupt vector and intack. |
64 | */ | 101 | */ |
65 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | 102 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v) |
66 | { | 103 | { |
67 | struct kvm_pic *s; | ||
68 | int vector; | 104 | int vector; |
69 | 105 | ||
70 | if (!irqchip_in_kernel(v->kvm)) | 106 | if (!irqchip_in_kernel(v->kvm)) |
71 | return v->arch.interrupt.nr; | 107 | return v->arch.interrupt.nr; |
72 | 108 | ||
73 | vector = kvm_get_apic_interrupt(v); /* APIC */ | 109 | vector = kvm_cpu_get_extint(v); |
74 | if (vector == -1) { | 110 | |
75 | if (kvm_apic_accept_pic_intr(v)) { | 111 | if (kvm_apic_vid_enabled(v->kvm) || vector != -1) |
76 | s = pic_irqchip(v->kvm); | 112 | return vector; /* PIC */ |
77 | s->output = 0; /* PIC */ | 113 | |
78 | vector = kvm_pic_read_irq(v->kvm); | 114 | return kvm_get_apic_interrupt(v); /* APIC */ |
79 | } | ||
80 | } | ||
81 | return vector; | ||
82 | } | 115 | } |
83 | EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); | ||
84 | 116 | ||
85 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) | 117 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) |
86 | { | 118 | { |
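The irq.c rework splits the PIC ("extint") checks into kvm_cpu_has_extint()/kvm_cpu_get_extint() and adds kvm_cpu_has_injectable_intr(), which skips the LAPIC when APICv virtual interrupt delivery lets hardware deliver those interrupts itself. A self-contained sketch of the resulting decision ladder (the three leaf functions are stand-ins for the kernel queries, and the userspace-irqchip branch is omitted):

#include <stdio.h>

static int has_extint(void)    { return 0; }   /* PIC output pending? */
static int vid_enabled(void)   { return 1; }   /* APICv virt. intr. delivery? */
static int lapic_has_irq(void) { return 1; }   /* highest IRR vector found? */

static int has_injectable_intr(void)
{
        if (has_extint())
                return 1;       /* ExtINT always needs software injection */
        if (vid_enabled())
                return 0;       /* hardware delivers APIC interrupts itself */
        return lapic_has_irq();
}

int main(void)
{
        printf("injectable: %d\n", has_injectable_intr()); /* 0 under APICv */
        return 0;
}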
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9392f527f107..02b51dd4e4ad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic) | |||
140 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ | 140 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ |
141 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) | 141 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) |
142 | 142 | ||
143 | static inline int apic_x2apic_mode(struct kvm_lapic *apic) | ||
144 | { | ||
145 | return apic->vcpu->arch.apic_base & X2APIC_ENABLE; | ||
146 | } | ||
147 | |||
148 | static inline int kvm_apic_id(struct kvm_lapic *apic) | 143 | static inline int kvm_apic_id(struct kvm_lapic *apic) |
149 | { | 144 | { |
150 | return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; | 145 | return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; |
151 | } | 146 | } |
152 | 147 | ||
153 | static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) | 148 | void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, |
149 | struct kvm_lapic_irq *irq, | ||
150 | u64 *eoi_exit_bitmap) | ||
154 | { | 151 | { |
155 | u16 cid; | 152 | struct kvm_lapic **dst; |
156 | ldr >>= 32 - map->ldr_bits; | 153 | struct kvm_apic_map *map; |
157 | cid = (ldr >> map->cid_shift) & map->cid_mask; | 154 | unsigned long bitmap = 1; |
155 | int i; | ||
158 | 156 | ||
159 | BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); | 157 | rcu_read_lock(); |
158 | map = rcu_dereference(vcpu->kvm->arch.apic_map); | ||
160 | 159 | ||
161 | return cid; | 160 | if (unlikely(!map)) { |
162 | } | 161 | __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); |
162 | goto out; | ||
163 | } | ||
163 | 164 | ||
164 | static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) | 165 | if (irq->dest_mode == 0) { /* physical mode */ |
165 | { | 166 | if (irq->delivery_mode == APIC_DM_LOWEST || |
166 | ldr >>= (32 - map->ldr_bits); | 167 | irq->dest_id == 0xff) { |
167 | return ldr & map->lid_mask; | 168 | __set_bit(irq->vector, |
169 | (unsigned long *)eoi_exit_bitmap); | ||
170 | goto out; | ||
171 | } | ||
172 | dst = &map->phys_map[irq->dest_id & 0xff]; | ||
173 | } else { | ||
174 | u32 mda = irq->dest_id << (32 - map->ldr_bits); | ||
175 | |||
176 | dst = map->logical_map[apic_cluster_id(map, mda)]; | ||
177 | |||
178 | bitmap = apic_logical_id(map, mda); | ||
179 | } | ||
180 | |||
181 | for_each_set_bit(i, &bitmap, 16) { | ||
182 | if (!dst[i]) | ||
183 | continue; | ||
184 | if (dst[i]->vcpu == vcpu) { | ||
185 | __set_bit(irq->vector, | ||
186 | (unsigned long *)eoi_exit_bitmap); | ||
187 | break; | ||
188 | } | ||
189 | } | ||
190 | |||
191 | out: | ||
192 | rcu_read_unlock(); | ||
168 | } | 193 | } |
169 | 194 | ||
170 | static void recalculate_apic_map(struct kvm *kvm) | 195 | static void recalculate_apic_map(struct kvm *kvm) |
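kvm_calculate_eoi_exitmap() sets one bit per interrupt vector in eoi_exit_bitmap, a 256-bit map stored as four u64 words, and it is recomputed via kvm_ioapic_make_eoibitmap_request() whenever the apic map changes (see the next hunk). The word/bit arithmetic behind setting a vector in such a map:

#include <stdint.h>
#include <stdio.h>

static void set_vector(uint64_t bitmap[4], unsigned int vector)
{
        bitmap[vector / 64] |= 1ULL << (vector % 64);
}

int main(void)
{
        uint64_t eoi_exit_bitmap[4] = { 0 };

        set_vector(eoi_exit_bitmap, 0x21);      /* word 0, bit 33 */
        set_vector(eoi_exit_bitmap, 0xec);      /* word 3, bit 44 */
        printf("word0=%#llx word3=%#llx\n",
               (unsigned long long)eoi_exit_bitmap[0],
               (unsigned long long)eoi_exit_bitmap[3]);
        return 0;
}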
@@ -230,6 +255,8 @@ out: | |||
230 | 255 | ||
231 | if (old) | 256 | if (old) |
232 | kfree_rcu(old, rcu); | 257 | kfree_rcu(old, rcu); |
258 | |||
259 | kvm_ioapic_make_eoibitmap_request(kvm); | ||
233 | } | 260 | } |
234 | 261 | ||
235 | static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) | 262 | static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) |
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) | |||
345 | { | 372 | { |
346 | int result; | 373 | int result; |
347 | 374 | ||
375 | /* | ||
376 | * Note that irr_pending is just a hint. It will always | ||
377 | * be true with virtual interrupt delivery enabled. | ||
378 | */ | ||
348 | if (!apic->irr_pending) | 379 | if (!apic->irr_pending) |
349 | return -1; | 380 | return -1; |
350 | 381 | ||
@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) | |||
461 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) | 492 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) |
462 | { | 493 | { |
463 | int result; | 494 | int result; |
495 | |||
496 | /* Note that isr_count is always 1 with vid enabled */ | ||
464 | if (!apic->isr_count) | 497 | if (!apic->isr_count) |
465 | return -1; | 498 | return -1; |
466 | if (likely(apic->highest_isr_cache != -1)) | 499 | if (likely(apic->highest_isr_cache != -1)) |
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) | |||
740 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; | 773 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; |
741 | } | 774 | } |
742 | 775 | ||
776 | static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) | ||
777 | { | ||
778 | if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && | ||
779 | kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { | ||
780 | int trigger_mode; | ||
781 | if (apic_test_vector(vector, apic->regs + APIC_TMR)) | ||
782 | trigger_mode = IOAPIC_LEVEL_TRIG; | ||
783 | else | ||
784 | trigger_mode = IOAPIC_EDGE_TRIG; | ||
785 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
786 | } | ||
787 | } | ||
788 | |||
743 | static int apic_set_eoi(struct kvm_lapic *apic) | 789 | static int apic_set_eoi(struct kvm_lapic *apic) |
744 | { | 790 | { |
745 | int vector = apic_find_highest_isr(apic); | 791 | int vector = apic_find_highest_isr(apic); |
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic) | |||
756 | apic_clear_isr(vector, apic); | 802 | apic_clear_isr(vector, apic); |
757 | apic_update_ppr(apic); | 803 | apic_update_ppr(apic); |
758 | 804 | ||
759 | if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && | 805 | kvm_ioapic_send_eoi(apic, vector); |
760 | kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { | ||
761 | int trigger_mode; | ||
762 | if (apic_test_vector(vector, apic->regs + APIC_TMR)) | ||
763 | trigger_mode = IOAPIC_LEVEL_TRIG; | ||
764 | else | ||
765 | trigger_mode = IOAPIC_EDGE_TRIG; | ||
766 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
767 | } | ||
768 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | 806 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); |
769 | return vector; | 807 | return vector; |
770 | } | 808 | } |
771 | 809 | ||
810 | /* | ||
811 | * this interface assumes a trap-like exit, whose desired side | ||
812 | * effects, including the vISR and vPPR updates, have already completed. | ||
813 | */ | ||
814 | void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) | ||
815 | { | ||
816 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
817 | |||
818 | trace_kvm_eoi(apic, vector); | ||
819 | |||
820 | kvm_ioapic_send_eoi(apic, vector); | ||
821 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
822 | } | ||
823 | EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); | ||
824 | |||
772 | static void apic_send_ipi(struct kvm_lapic *apic) | 825 | static void apic_send_ipi(struct kvm_lapic *apic) |
773 | { | 826 | { |
774 | u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); | 827 | u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); |
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) | |||
1212 | } | 1265 | } |
1213 | EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); | 1266 | EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); |
1214 | 1267 | ||
1268 | /* emulate APIC access in a trap manner */ | ||
1269 | void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) | ||
1270 | { | ||
1271 | u32 val = 0; | ||
1272 | |||
1273 | /* hardware has already done the conditional check and instruction decode */ | ||
1274 | offset &= 0xff0; | ||
1275 | |||
1276 | apic_reg_read(vcpu->arch.apic, offset, 4, &val); | ||
1277 | |||
1278 | /* TODO: optimize to just emulate side effect w/o one more write */ | ||
1279 | apic_reg_write(vcpu->arch.apic, offset, val); | ||
1280 | } | ||
1281 | EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); | ||
1282 | |||
1215 | void kvm_free_lapic(struct kvm_vcpu *vcpu) | 1283 | void kvm_free_lapic(struct kvm_vcpu *vcpu) |
1216 | { | 1284 | { |
1217 | struct kvm_lapic *apic = vcpu->arch.apic; | 1285 | struct kvm_lapic *apic = vcpu->arch.apic; |
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | |||
1288 | 1356 | ||
1289 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | 1357 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) |
1290 | { | 1358 | { |
1359 | u64 old_value = vcpu->arch.apic_base; | ||
1291 | struct kvm_lapic *apic = vcpu->arch.apic; | 1360 | struct kvm_lapic *apic = vcpu->arch.apic; |
1292 | 1361 | ||
1293 | if (!apic) { | 1362 | if (!apic) { |
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
1309 | value &= ~MSR_IA32_APICBASE_BSP; | 1378 | value &= ~MSR_IA32_APICBASE_BSP; |
1310 | 1379 | ||
1311 | vcpu->arch.apic_base = value; | 1380 | vcpu->arch.apic_base = value; |
1312 | if (apic_x2apic_mode(apic)) { | 1381 | if ((old_value ^ value) & X2APIC_ENABLE) { |
1313 | u32 id = kvm_apic_id(apic); | 1382 | if (value & X2APIC_ENABLE) { |
1314 | u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); | 1383 | u32 id = kvm_apic_id(apic); |
1315 | kvm_apic_set_ldr(apic, ldr); | 1384 | u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); |
1385 | kvm_apic_set_ldr(apic, ldr); | ||
1386 | kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); | ||
1387 | } else | ||
1388 | kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); | ||
1316 | } | 1389 | } |
1390 | |||
1317 | apic->base_address = apic->vcpu->arch.apic_base & | 1391 | apic->base_address = apic->vcpu->arch.apic_base & |
1318 | MSR_IA32_APICBASE_BASE; | 1392 | MSR_IA32_APICBASE_BASE; |
1319 | 1393 | ||
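When X2APIC_ENABLE is toggled on, the logical destination register is derived from the APIC ID exactly as in the hunk above: the cluster (id >> 4) goes in the upper 16 bits and a one-hot bit (id & 0xf) in the lower half. A standalone check of that derivation:

#include <stdint.h>
#include <stdio.h>

static uint32_t x2apic_ldr(uint32_t id)
{
        return ((id >> 4) << 16) | (1U << (id & 0xf));
}

int main(void)
{
        printf("id 0  -> ldr %#x\n", x2apic_ldr(0));    /* 0x1 */
        printf("id 19 -> ldr %#x\n", x2apic_ldr(19));   /* cluster 1, bit 3 */
        return 0;
}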
@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
1359 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); | 1433 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); |
1360 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | 1434 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); |
1361 | } | 1435 | } |
1362 | apic->irr_pending = false; | 1436 | apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); |
1363 | apic->isr_count = 0; | 1437 | apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); |
1364 | apic->highest_isr_cache = -1; | 1438 | apic->highest_isr_cache = -1; |
1365 | update_divide_count(apic); | 1439 | update_divide_count(apic); |
1366 | atomic_set(&apic->lapic_timer.pending, 0); | 1440 | atomic_set(&apic->lapic_timer.pending, 0); |
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, | |||
1575 | update_divide_count(apic); | 1649 | update_divide_count(apic); |
1576 | start_apic_timer(apic); | 1650 | start_apic_timer(apic); |
1577 | apic->irr_pending = true; | 1651 | apic->irr_pending = true; |
1578 | apic->isr_count = count_vectors(apic->regs + APIC_ISR); | 1652 | apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? |
1653 | 1 : count_vectors(apic->regs + APIC_ISR); | ||
1579 | apic->highest_isr_cache = -1; | 1654 | apic->highest_isr_cache = -1; |
1655 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); | ||
1580 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 1656 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
1581 | } | 1657 | } |
1582 | 1658 | ||
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e5ebf9f3571f..1676d34ddb4e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | |||
64 | u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); | 64 | u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); |
65 | void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); | 65 | void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); |
66 | 66 | ||
67 | void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); | ||
68 | void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); | ||
69 | |||
67 | void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); | 70 | void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); |
68 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); | 71 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); |
69 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); | 72 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); |
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | |||
124 | return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); | 127 | return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); |
125 | } | 128 | } |
126 | 129 | ||
130 | static inline int apic_x2apic_mode(struct kvm_lapic *apic) | ||
131 | { | ||
132 | return apic->vcpu->arch.apic_base & X2APIC_ENABLE; | ||
133 | } | ||
134 | |||
135 | static inline bool kvm_apic_vid_enabled(struct kvm *kvm) | ||
136 | { | ||
137 | return kvm_x86_ops->vm_has_apicv(kvm); | ||
138 | } | ||
139 | |||
140 | static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) | ||
141 | { | ||
142 | u16 cid; | ||
143 | ldr >>= 32 - map->ldr_bits; | ||
144 | cid = (ldr >> map->cid_shift) & map->cid_mask; | ||
145 | |||
146 | BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); | ||
147 | |||
148 | return cid; | ||
149 | } | ||
150 | |||
151 | static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) | ||
152 | { | ||
153 | ldr >>= (32 - map->ldr_bits); | ||
154 | return ldr & map->lid_mask; | ||
155 | } | ||
156 | |||
157 | void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, | ||
158 | struct kvm_lapic_irq *irq, | ||
159 | u64 *eoi_bitmap); | ||
160 | |||
127 | #endif | 161 | #endif |
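apic_cluster_id() and apic_logical_id(), now shared via lapic.h, slice a logical destination according to the geometry recorded in the apic map. A sketch using what I take to be the x2apic geometry (ldr_bits = 32, cid_shift = 16, 16-bit masks; assumed values, not verified against recalculate_apic_map()):

#include <stdint.h>
#include <stdio.h>

/* Assumed map geometry; the real values live in recalculate_apic_map(). */
struct toy_map { int ldr_bits, cid_shift; uint32_t cid_mask, lid_mask; };

int main(void)
{
        struct toy_map map = { 32, 16, 0xffff, 0xffff };
        uint32_t ldr = 0x10008;                 /* cluster 1, logical bit 3 */
        uint32_t v = ldr >> (32 - map.ldr_bits);

        printf("cid=%u lid=%#x\n",
               (v >> map.cid_shift) & map.cid_mask, v & map.lid_mask);
        return 0;
}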
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 01d7c2ad05f5..4ed3edbe06bd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte) | |||
448 | 448 | ||
449 | static bool spte_is_locklessly_modifiable(u64 spte) | 449 | static bool spte_is_locklessly_modifiable(u64 spte) |
450 | { | 450 | { |
451 | return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); | 451 | return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == |
452 | (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); | ||
452 | } | 453 | } |
453 | 454 | ||
454 | static bool spte_has_volatile_bits(u64 spte) | 455 | static bool spte_has_volatile_bits(u64 spte) |
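The rewritten spte_is_locklessly_modifiable() is the same predicate in clearer form: both writability bits must be set. A quick exhaustive check of the equivalence (the bit positions are illustrative, not the kernel's SPTE_* values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define HOST_W (1ULL << 57)     /* illustrative bit positions only, */
#define MMU_W  (1ULL << 58)     /* not the kernel's SPTE_* values   */

int main(void)
{
        for (int h = 0; h < 2; h++)
                for (int m = 0; m < 2; m++) {
                        uint64_t spte = (h ? HOST_W : 0) | (m ? MMU_W : 0);
                        int old_form = !(~spte & (HOST_W | MMU_W));
                        int new_form = (spte & (HOST_W | MMU_W)) ==
                                       (HOST_W | MMU_W);

                        assert(old_form == new_form);
                }
        puts("equivalent for all four combinations");
        return 0;
}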
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
831 | if (host_level == PT_PAGE_TABLE_LEVEL) | 832 | if (host_level == PT_PAGE_TABLE_LEVEL) |
832 | return host_level; | 833 | return host_level; |
833 | 834 | ||
834 | max_level = kvm_x86_ops->get_lpage_level() < host_level ? | 835 | max_level = min(kvm_x86_ops->get_lpage_level(), host_level); |
835 | kvm_x86_ops->get_lpage_level() : host_level; | ||
836 | 836 | ||
837 | for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) | 837 | for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) |
838 | if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) | 838 | if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) |
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) | |||
1142 | } | 1142 | } |
1143 | 1143 | ||
1144 | static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, | 1144 | static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, |
1145 | int level, bool pt_protect) | 1145 | bool pt_protect) |
1146 | { | 1146 | { |
1147 | u64 *sptep; | 1147 | u64 *sptep; |
1148 | struct rmap_iterator iter; | 1148 | struct rmap_iterator iter; |
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | |||
1180 | while (mask) { | 1180 | while (mask) { |
1181 | rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), | 1181 | rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), |
1182 | PT_PAGE_TABLE_LEVEL, slot); | 1182 | PT_PAGE_TABLE_LEVEL, slot); |
1183 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); | 1183 | __rmap_write_protect(kvm, rmapp, false); |
1184 | 1184 | ||
1185 | /* clear the first set bit */ | 1185 | /* clear the first set bit */ |
1186 | mask &= mask - 1; | 1186 | mask &= mask - 1; |
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
1199 | for (i = PT_PAGE_TABLE_LEVEL; | 1199 | for (i = PT_PAGE_TABLE_LEVEL; |
1200 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 1200 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
1201 | rmapp = __gfn_to_rmap(gfn, i, slot); | 1201 | rmapp = __gfn_to_rmap(gfn, i, slot); |
1202 | write_protected |= __rmap_write_protect(kvm, rmapp, i, true); | 1202 | write_protected |= __rmap_write_protect(kvm, rmapp, true); |
1203 | } | 1203 | } |
1204 | 1204 | ||
1205 | return write_protected; | 1205 | return write_protected; |
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | |||
1460 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); | 1460 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); |
1461 | } | 1461 | } |
1462 | 1462 | ||
1463 | /* | 1463 | static void kvm_mmu_free_page(struct kvm_mmu_page *sp) |
1464 | * Remove the sp from shadow page cache, after call it, | ||
1465 | * we can not find this sp from the cache, and the shadow | ||
1466 | * page table is still valid. | ||
1467 | * It should be under the protection of mmu lock. | ||
1468 | */ | ||
1469 | static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) | ||
1470 | { | 1464 | { |
1471 | ASSERT(is_empty_shadow_page(sp->spt)); | 1465 | ASSERT(is_empty_shadow_page(sp->spt)); |
1472 | hlist_del(&sp->hash_link); | 1466 | hlist_del(&sp->hash_link); |
1473 | if (!sp->role.direct) | ||
1474 | free_page((unsigned long)sp->gfns); | ||
1475 | } | ||
1476 | |||
1477 | /* | ||
1478 | * Free the shadow page table and the sp, we can do it | ||
1479 | * out of the protection of mmu lock. | ||
1480 | */ | ||
1481 | static void kvm_mmu_free_page(struct kvm_mmu_page *sp) | ||
1482 | { | ||
1483 | list_del(&sp->link); | 1467 | list_del(&sp->link); |
1484 | free_page((unsigned long)sp->spt); | 1468 | free_page((unsigned long)sp->spt); |
1469 | if (!sp->role.direct) | ||
1470 | free_page((unsigned long)sp->gfns); | ||
1485 | kmem_cache_free(mmu_page_header_cache, sp); | 1471 | kmem_cache_free(mmu_page_header_cache, sp); |
1486 | } | 1472 | } |
1487 | 1473 | ||
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
1522 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); | 1508 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
1523 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 1509 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
1524 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 1510 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
1525 | bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); | ||
1526 | sp->parent_ptes = 0; | 1511 | sp->parent_ptes = 0; |
1527 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1512 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
1528 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); | 1513 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); |
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | |||
1973 | { | 1958 | { |
1974 | u64 spte; | 1959 | u64 spte; |
1975 | 1960 | ||
1976 | spte = __pa(sp->spt) | 1961 | spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | |
1977 | | PT_PRESENT_MASK | PT_ACCESSED_MASK | 1962 | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; |
1978 | | PT_WRITABLE_MASK | PT_USER_MASK; | 1963 | |
1979 | mmu_spte_set(sptep, spte); | 1964 | mmu_spte_set(sptep, spte); |
1980 | } | 1965 | } |
1981 | 1966 | ||
@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
2126 | do { | 2111 | do { |
2127 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | 2112 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); |
2128 | WARN_ON(!sp->role.invalid || sp->root_count); | 2113 | WARN_ON(!sp->role.invalid || sp->root_count); |
2129 | kvm_mmu_isolate_page(sp); | ||
2130 | kvm_mmu_free_page(sp); | 2114 | kvm_mmu_free_page(sp); |
2131 | } while (!list_empty(invalid_list)); | 2115 | } while (!list_empty(invalid_list)); |
2132 | } | 2116 | } |
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) | |||
2144 | * change the value | 2128 | * change the value |
2145 | */ | 2129 | */ |
2146 | 2130 | ||
2131 | spin_lock(&kvm->mmu_lock); | ||
2132 | |||
2147 | if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { | 2133 | if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { |
2148 | while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && | 2134 | while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && |
2149 | !list_empty(&kvm->arch.active_mmu_pages)) { | 2135 | !list_empty(&kvm->arch.active_mmu_pages)) { |
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) | |||
2158 | } | 2144 | } |
2159 | 2145 | ||
2160 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; | 2146 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; |
2147 | |||
2148 | spin_unlock(&kvm->mmu_lock); | ||
2161 | } | 2149 | } |
2162 | 2150 | ||
2163 | int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | 2151 | int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) |
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
2183 | } | 2171 | } |
2184 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); | 2172 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); |
2185 | 2173 | ||
2186 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | ||
2187 | { | ||
2188 | int slot = memslot_id(kvm, gfn); | ||
2189 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | ||
2190 | |||
2191 | __set_bit(slot, sp->slot_bitmap); | ||
2192 | } | ||
2193 | |||
2194 | /* | 2174 | /* |
2195 | * The function is based on mtrr_type_lookup() in | 2175 | * The function is based on mtrr_type_lookup() in |
2196 | * arch/x86/kernel/cpu/mtrr/generic.c | 2176 | * arch/x86/kernel/cpu/mtrr/generic.c |
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2332 | if (s->role.level != PT_PAGE_TABLE_LEVEL) | 2312 | if (s->role.level != PT_PAGE_TABLE_LEVEL) |
2333 | return 1; | 2313 | return 1; |
2334 | 2314 | ||
2335 | if (!need_unsync && !s->unsync) { | 2315 | if (!s->unsync) |
2336 | need_unsync = true; | 2316 | need_unsync = true; |
2337 | } | ||
2338 | } | 2317 | } |
2339 | if (need_unsync) | 2318 | if (need_unsync) |
2340 | kvm_unsync_pages(vcpu, gfn); | 2319 | kvm_unsync_pages(vcpu, gfn); |
@@ -2342,8 +2321,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2342 | } | 2321 | } |
2343 | 2322 | ||
2344 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2323 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2345 | unsigned pte_access, int user_fault, | 2324 | unsigned pte_access, int level, |
2346 | int write_fault, int level, | ||
2347 | gfn_t gfn, pfn_t pfn, bool speculative, | 2325 | gfn_t gfn, pfn_t pfn, bool speculative, |
2348 | bool can_unsync, bool host_writable) | 2326 | bool can_unsync, bool host_writable) |
2349 | { | 2327 | { |
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2378 | 2356 | ||
2379 | spte |= (u64)pfn << PAGE_SHIFT; | 2357 | spte |= (u64)pfn << PAGE_SHIFT; |
2380 | 2358 | ||
2381 | if ((pte_access & ACC_WRITE_MASK) | 2359 | if (pte_access & ACC_WRITE_MASK) { |
2382 | || (!vcpu->arch.mmu.direct_map && write_fault | ||
2383 | && !is_write_protection(vcpu) && !user_fault)) { | ||
2384 | 2360 | ||
2385 | /* | 2361 | /* |
2386 | * There are two cases: | 2362 | * Other vcpu creates new sp in the window between |
2387 | * - the one is other vcpu creates new sp in the window | 2363 | * mapping_level() and acquiring mmu-lock. We can |
2388 | * between mapping_level() and acquiring mmu-lock. | 2364 | * allow guest to retry the access, the mapping can |
2389 | * - the another case is the new sp is created by itself | 2365 | * be fixed if guest refault. |
2390 | * (page-fault path) when guest uses the target gfn as | ||
2391 | * its page table. | ||
2392 | * Both of these cases can be fixed by allowing guest to | ||
2393 | * retry the access, it will refault, then we can establish | ||
2394 | * the mapping by using small page. | ||
2395 | */ | 2366 | */ |
2396 | if (level > PT_PAGE_TABLE_LEVEL && | 2367 | if (level > PT_PAGE_TABLE_LEVEL && |
2397 | has_wrprotected_page(vcpu->kvm, gfn, level)) | 2368 | has_wrprotected_page(vcpu->kvm, gfn, level)) |
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2399 | 2370 | ||
2400 | spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; | 2371 | spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; |
2401 | 2372 | ||
2402 | if (!vcpu->arch.mmu.direct_map | ||
2403 | && !(pte_access & ACC_WRITE_MASK)) { | ||
2404 | spte &= ~PT_USER_MASK; | ||
2405 | /* | ||
2406 | * If we converted a user page to a kernel page, | ||
2407 | * so that the kernel can write to it when cr0.wp=0, | ||
2408 | * then we should prevent the kernel from executing it | ||
2409 | * if SMEP is enabled. | ||
2410 | */ | ||
2411 | if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
2412 | spte |= PT64_NX_MASK; | ||
2413 | } | ||
2414 | |||
2415 | /* | 2373 | /* |
2416 | * Optimization: for pte sync, if spte was writable the hash | 2374 | * Optimization: for pte sync, if spte was writable the hash |
2417 | * lookup is unnecessary (and expensive). Write protection | 2375 | * lookup is unnecessary (and expensive). Write protection |
@@ -2441,19 +2399,15 @@ done: | |||
2441 | } | 2399 | } |
2442 | 2400 | ||
2443 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2401 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2444 | unsigned pt_access, unsigned pte_access, | 2402 | unsigned pte_access, int write_fault, int *emulate, |
2445 | int user_fault, int write_fault, | 2403 | int level, gfn_t gfn, pfn_t pfn, bool speculative, |
2446 | int *emulate, int level, gfn_t gfn, | ||
2447 | pfn_t pfn, bool speculative, | ||
2448 | bool host_writable) | 2404 | bool host_writable) |
2449 | { | 2405 | { |
2450 | int was_rmapped = 0; | 2406 | int was_rmapped = 0; |
2451 | int rmap_count; | 2407 | int rmap_count; |
2452 | 2408 | ||
2453 | pgprintk("%s: spte %llx access %x write_fault %d" | 2409 | pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, |
2454 | " user_fault %d gfn %llx\n", | 2410 | *sptep, write_fault, gfn); |
2455 | __func__, *sptep, pt_access, | ||
2456 | write_fault, user_fault, gfn); | ||
2457 | 2411 | ||
2458 | if (is_rmap_spte(*sptep)) { | 2412 | if (is_rmap_spte(*sptep)) { |
2459 | /* | 2413 | /* |
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2477 | was_rmapped = 1; | 2431 | was_rmapped = 1; |
2478 | } | 2432 | } |
2479 | 2433 | ||
2480 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2434 | if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, |
2481 | level, gfn, pfn, speculative, true, | 2435 | true, host_writable)) { |
2482 | host_writable)) { | ||
2483 | if (write_fault) | 2436 | if (write_fault) |
2484 | *emulate = 1; | 2437 | *emulate = 1; |
2485 | kvm_mmu_flush_tlb(vcpu); | 2438 | kvm_mmu_flush_tlb(vcpu); |
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2497 | ++vcpu->kvm->stat.lpages; | 2450 | ++vcpu->kvm->stat.lpages; |
2498 | 2451 | ||
2499 | if (is_shadow_present_pte(*sptep)) { | 2452 | if (is_shadow_present_pte(*sptep)) { |
2500 | page_header_update_slot(vcpu->kvm, sptep, gfn); | ||
2501 | if (!was_rmapped) { | 2453 | if (!was_rmapped) { |
2502 | rmap_count = rmap_add(vcpu, sptep, gfn); | 2454 | rmap_count = rmap_add(vcpu, sptep, gfn); |
2503 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | 2455 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) |
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | |||
2571 | return -1; | 2523 | return -1; |
2572 | 2524 | ||
2573 | for (i = 0; i < ret; i++, gfn++, start++) | 2525 | for (i = 0; i < ret; i++, gfn++, start++) |
2574 | mmu_set_spte(vcpu, start, ACC_ALL, | 2526 | mmu_set_spte(vcpu, start, access, 0, NULL, |
2575 | access, 0, 0, NULL, | 2527 | sp->role.level, gfn, page_to_pfn(pages[i]), |
2576 | sp->role.level, gfn, | 2528 | true, true); |
2577 | page_to_pfn(pages[i]), true, true); | ||
2578 | 2529 | ||
2579 | return 0; | 2530 | return 0; |
2580 | } | 2531 | } |
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2633 | 2584 | ||
2634 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2585 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
2635 | if (iterator.level == level) { | 2586 | if (iterator.level == level) { |
2636 | unsigned pte_access = ACC_ALL; | 2587 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, |
2637 | 2588 | write, &emulate, level, gfn, pfn, | |
2638 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | 2589 | prefault, map_writable); |
2639 | 0, write, &emulate, | ||
2640 | level, gfn, pfn, prefault, map_writable); | ||
2641 | direct_pte_prefetch(vcpu, iterator.sptep); | 2590 | direct_pte_prefetch(vcpu, iterator.sptep); |
2642 | ++vcpu->stat.pf_fixed; | 2591 | ++vcpu->stat.pf_fixed; |
2643 | break; | 2592 | break; |
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2652 | iterator.level - 1, | 2601 | iterator.level - 1, |
2653 | 1, ACC_ALL, iterator.sptep); | 2602 | 1, ACC_ALL, iterator.sptep); |
2654 | 2603 | ||
2655 | mmu_spte_set(iterator.sptep, | 2604 | link_shadow_page(iterator.sptep, sp); |
2656 | __pa(sp->spt) | ||
2657 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | ||
2658 | | shadow_user_mask | shadow_x_mask | ||
2659 | | shadow_accessed_mask); | ||
2660 | } | 2605 | } |
2661 | } | 2606 | } |
2662 | return emulate; | 2607 | return emulate; |
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | |||
3719 | else | 3664 | else |
3720 | r = paging32_init_context(vcpu, context); | 3665 | r = paging32_init_context(vcpu, context); |
3721 | 3666 | ||
3667 | vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); | ||
3722 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 3668 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); |
3723 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 3669 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); |
3724 | vcpu->arch.mmu.base_role.smep_andnot_wp | 3670 | vcpu->arch.mmu.base_role.smep_andnot_wp |
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, | |||
3885 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | 3831 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ |
3886 | *gpa &= ~(gpa_t)7; | 3832 | *gpa &= ~(gpa_t)7; |
3887 | *bytes = 8; | 3833 | *bytes = 8; |
3888 | r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); | 3834 | r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); |
3889 | if (r) | 3835 | if (r) |
3890 | gentry = 0; | 3836 | gentry = 0; |
3891 | new = (const u8 *)&gentry; | 3837 | new = (const u8 *)&gentry; |
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
4039 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) | 3985 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) |
4040 | & mask.word) && rmap_can_add(vcpu)) | 3986 | & mask.word) && rmap_can_add(vcpu)) |
4041 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); | 3987 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); |
4042 | if (!remote_flush && need_remote_flush(entry, *spte)) | 3988 | if (need_remote_flush(entry, *spte)) |
4043 | remote_flush = true; | 3989 | remote_flush = true; |
4044 | ++spte; | 3990 | ++spte; |
4045 | } | 3991 | } |
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |||
4198 | 4144 | ||
4199 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 4145 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) |
4200 | { | 4146 | { |
4201 | struct kvm_mmu_page *sp; | 4147 | struct kvm_memory_slot *memslot; |
4202 | bool flush = false; | 4148 | gfn_t last_gfn; |
4149 | int i; | ||
4203 | 4150 | ||
4204 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 4151 | memslot = id_to_memslot(kvm->memslots, slot); |
4205 | int i; | 4152 | last_gfn = memslot->base_gfn + memslot->npages - 1; |
4206 | u64 *pt; | ||
4207 | 4153 | ||
4208 | if (!test_bit(slot, sp->slot_bitmap)) | 4154 | spin_lock(&kvm->mmu_lock); |
4209 | continue; | ||
4210 | 4155 | ||
4211 | pt = sp->spt; | 4156 | for (i = PT_PAGE_TABLE_LEVEL; |
4212 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 4157 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
4213 | if (!is_shadow_present_pte(pt[i]) || | 4158 | unsigned long *rmapp; |
4214 | !is_last_spte(pt[i], sp->role.level)) | 4159 | unsigned long last_index, index; |
4215 | continue; | ||
4216 | 4160 | ||
4217 | spte_write_protect(kvm, &pt[i], &flush, false); | 4161 | rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; |
4162 | last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); | ||
4163 | |||
4164 | for (index = 0; index <= last_index; ++index, ++rmapp) { | ||
4165 | if (*rmapp) | ||
4166 | __rmap_write_protect(kvm, rmapp, false); | ||
4167 | |||
4168 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | ||
4169 | kvm_flush_remote_tlbs(kvm); | ||
4170 | cond_resched_lock(&kvm->mmu_lock); | ||
4171 | } | ||
4218 | } | 4172 | } |
4219 | } | 4173 | } |
4174 | |||
4220 | kvm_flush_remote_tlbs(kvm); | 4175 | kvm_flush_remote_tlbs(kvm); |
4176 | spin_unlock(&kvm->mmu_lock); | ||
4221 | } | 4177 | } |
4222 | 4178 | ||
4223 | void kvm_mmu_zap_all(struct kvm *kvm) | 4179 | void kvm_mmu_zap_all(struct kvm *kvm) |
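The rewritten kvm_mmu_slot_remove_write_access() walks the memslot's per-level rmap arrays directly instead of scanning every shadow page's slot_bitmap (removed elsewhere in this patch), and periodically yields mmu_lock via cond_resched_lock(). A sketch of the per-level indexing the loop bounds rely on, assuming gfn_to_index() scales (gfn - base_gfn) down by 9 bits per paging level above 4K (my reading of the helper, stated as an assumption):

#include <stdio.h>

typedef unsigned long long gfn_t;

static unsigned long long gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
        int shift = (level - 1) * 9;    /* 4K: 0, 2M: 9, 1G: 18 (assumed) */

        return (gfn >> shift) - (base_gfn >> shift);
}

int main(void)
{
        gfn_t base = 0x100000, last = base + 0x80000 - 1;       /* 2G slot */

        for (int level = 1; level <= 3; level++)
                printf("level %d: %llu rmap entries\n",
                       level, gfn_to_index(last, base, level) + 1);
        return 0;
}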
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index cd6e98333ba3..b8f6172f4174 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, | |||
195 | TP_ARGS(sp) | 195 | TP_ARGS(sp) |
196 | ); | 196 | ); |
197 | 197 | ||
198 | DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, | ||
199 | TP_PROTO(struct kvm_mmu_page *sp), | ||
200 | |||
201 | TP_ARGS(sp) | ||
202 | ); | ||
203 | |||
204 | TRACE_EVENT( | 198 | TRACE_EVENT( |
205 | mark_mmio_spte, | 199 | mark_mmio_spte, |
206 | TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), | 200 | TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 891eb6d93b8b..105dd5bd550e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, | |||
151 | pt_element_t pte; | 151 | pt_element_t pte; |
152 | pt_element_t __user *uninitialized_var(ptep_user); | 152 | pt_element_t __user *uninitialized_var(ptep_user); |
153 | gfn_t table_gfn; | 153 | gfn_t table_gfn; |
154 | unsigned index, pt_access, pte_access, accessed_dirty, shift; | 154 | unsigned index, pt_access, pte_access, accessed_dirty; |
155 | gpa_t pte_gpa; | 155 | gpa_t pte_gpa; |
156 | int offset; | 156 | int offset; |
157 | const int write_fault = access & PFERR_WRITE_MASK; | 157 | const int write_fault = access & PFERR_WRITE_MASK; |
@@ -249,16 +249,12 @@ retry_walk: | |||
249 | 249 | ||
250 | if (!write_fault) | 250 | if (!write_fault) |
251 | protect_clean_gpte(&pte_access, pte); | 251 | protect_clean_gpte(&pte_access, pte); |
252 | 252 | else | |
253 | /* | 253 | /* |
254 | * On a write fault, fold the dirty bit into accessed_dirty by shifting it one | 254 | * On a write fault, fold the dirty bit into accessed_dirty by |
255 | * place right. | 255 | * shifting it one place right. |
256 | * | 256 | */ |
257 | * On a read fault, do nothing. | 257 | accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); |
258 | */ | ||
259 | shift = write_fault >> ilog2(PFERR_WRITE_MASK); | ||
260 | shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; | ||
261 | accessed_dirty &= pte >> shift; | ||
262 | 258 | ||
263 | if (unlikely(!accessed_dirty)) { | 259 | if (unlikely(!accessed_dirty)) { |
264 | ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); | 260 | ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); |
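The simplified write-fault path above folds the PTE dirty bit (bit 6) into accessed_dirty, which tracks state at the accessed-bit position (bit 5), using a single right shift instead of the old branch-free mask dance:

#include <stdint.h>
#include <stdio.h>

#define PT_ACCESSED_SHIFT 5
#define PT_DIRTY_SHIFT    6

int main(void)
{
        uint64_t pte = (1 << PT_DIRTY_SHIFT) | (1 << PT_ACCESSED_SHIFT);
        uint64_t accessed_dirty = 1 << PT_ACCESSED_SHIFT;

        accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
        printf("still set after fold: %d\n", !!accessed_dirty); /* 1 iff dirty */
        return 0;
}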
@@ -330,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
330 | * we call mmu_set_spte() with host_writable = true because | 326 | * we call mmu_set_spte() with host_writable = true because |
331 | * pte_prefetch_gfn_to_pfn always gets a writable pfn. | 327 | * pte_prefetch_gfn_to_pfn always gets a writable pfn. |
332 | */ | 328 | */ |
333 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 329 | mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL, |
334 | NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); | 330 | gfn, pfn, true, true); |
335 | 331 | ||
336 | return true; | 332 | return true; |
337 | } | 333 | } |
@@ -405,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
405 | */ | 401 | */ |
406 | static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 402 | static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
407 | struct guest_walker *gw, | 403 | struct guest_walker *gw, |
408 | int user_fault, int write_fault, int hlevel, | 404 | int write_fault, int hlevel, |
409 | pfn_t pfn, bool map_writable, bool prefault) | 405 | pfn_t pfn, bool map_writable, bool prefault) |
410 | { | 406 | { |
411 | struct kvm_mmu_page *sp = NULL; | 407 | struct kvm_mmu_page *sp = NULL; |
@@ -413,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
413 | unsigned direct_access, access = gw->pt_access; | 409 | unsigned direct_access, access = gw->pt_access; |
414 | int top_level, emulate = 0; | 410 | int top_level, emulate = 0; |
415 | 411 | ||
416 | if (!is_present_gpte(gw->ptes[gw->level - 1])) | ||
417 | return 0; | ||
418 | |||
419 | direct_access = gw->pte_access; | 412 | direct_access = gw->pte_access; |
420 | 413 | ||
421 | top_level = vcpu->arch.mmu.root_level; | 414 | top_level = vcpu->arch.mmu.root_level; |
@@ -477,9 +470,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
477 | } | 470 | } |
478 | 471 | ||
479 | clear_sp_write_flooding_count(it.sptep); | 472 | clear_sp_write_flooding_count(it.sptep); |
480 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, | 473 | mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate, |
481 | user_fault, write_fault, &emulate, it.level, | 474 | it.level, gw->gfn, pfn, prefault, map_writable); |
482 | gw->gfn, pfn, prefault, map_writable); | ||
483 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | 475 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); |
484 | 476 | ||
485 | return emulate; | 477 | return emulate; |
@@ -491,6 +483,46 @@ out_gpte_changed: | |||
491 | return 0; | 483 | return 0; |
492 | } | 484 | } |
493 | 485 | ||
486 | /* | ||
487 | * Check whether the mapped gfn can write its own page table in the | ||
488 | * current mapping. | ||
489 | * | ||
490 | * This is a helper for FNAME(page_fault). When the guest uses a large | ||
491 | * page to map a writable gfn that is itself in use as a page table, we | ||
492 | * should force kvm to map it with small pages: a new shadow page will | ||
493 | * be created anyway when kvm shadows that page table, which stops kvm | ||
494 | * from using the large page size. Doing this early avoids unnecessary | ||
495 | * #PFs and emulation. | ||
496 | * | ||
497 | * @write_fault_to_shadow_pgtable is set to true if the faulting gfn is | ||
498 | * currently in use as a page table. | ||
499 | * | ||
500 | * Note: the PDPT is not checked for PAE 32-bit guests. That is fine: | ||
501 | * the PDPT is always shadowed, so no large page can ever map it. | ||
502 | */ | ||
503 | static bool | ||
504 | FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, | ||
505 | struct guest_walker *walker, int user_fault, | ||
506 | bool *write_fault_to_shadow_pgtable) | ||
507 | { | ||
508 | int level; | ||
509 | gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); | ||
510 | bool self_changed = false; | ||
511 | |||
512 | if (!(walker->pte_access & ACC_WRITE_MASK || | ||
513 | (!is_write_protection(vcpu) && !user_fault))) | ||
514 | return false; | ||
515 | |||
516 | for (level = walker->level; level <= walker->max_level; level++) { | ||
517 | gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; | ||
518 | |||
519 | self_changed |= !(gfn & mask); | ||
520 | *write_fault_to_shadow_pgtable |= !gfn; | ||
521 | } | ||
522 | |||
523 | return self_changed; | ||
524 | } | ||
525 | |||
494 | /* | 526 | /* |
495 | * Page fault handler. There are several causes for a page fault: | 527 | * Page fault handler. There are several causes for a page fault: |
496 | * - there is no shadow pte for the guest pte | 528 | * - there is no shadow pte for the guest pte |
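FNAME(is_self_change_mapping) asks whether any guest page-table gfn touched during the walk lands in the same large-page frame as the faulting gfn. The mask test at its core, assuming 2M large pages (512 gfns per frame; KVM_PAGES_PER_HPAGE(2) is my assumed stand-in):

#include <stdio.h>

typedef unsigned long long gfn_t;

int main(void)
{
        gfn_t pages_per_hpage = 512;    /* KVM_PAGES_PER_HPAGE(2), assumed */
        gfn_t mask = ~(pages_per_hpage - 1);
        gfn_t fault_gfn = 0x1234, table_gfn = 0x1300;

        int self_changed = !((fault_gfn ^ table_gfn) & mask);
        int fault_to_pgtable = !(fault_gfn ^ table_gfn);

        printf("self_changed=%d write_fault_to_shadow_pgtable=%d\n",
               self_changed, fault_to_pgtable);
        return 0;
}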
@@ -516,7 +548,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
516 | int level = PT_PAGE_TABLE_LEVEL; | 548 | int level = PT_PAGE_TABLE_LEVEL; |
517 | int force_pt_level; | 549 | int force_pt_level; |
518 | unsigned long mmu_seq; | 550 | unsigned long mmu_seq; |
519 | bool map_writable; | 551 | bool map_writable, is_self_change_mapping; |
520 | 552 | ||
521 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 553 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
522 | 554 | ||
@@ -544,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
544 | return 0; | 576 | return 0; |
545 | } | 577 | } |
546 | 578 | ||
579 | vcpu->arch.write_fault_to_shadow_pgtable = false; | ||
580 | |||
581 | is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, | ||
582 | &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); | ||
583 | |||
547 | if (walker.level >= PT_DIRECTORY_LEVEL) | 584 | if (walker.level >= PT_DIRECTORY_LEVEL) |
548 | force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); | 585 | force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) |
586 | || is_self_change_mapping; | ||
549 | else | 587 | else |
550 | force_pt_level = 1; | 588 | force_pt_level = 1; |
551 | if (!force_pt_level) { | 589 | if (!force_pt_level) { |
@@ -564,6 +602,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
564 | walker.gfn, pfn, walker.pte_access, &r)) | 602 | walker.gfn, pfn, walker.pte_access, &r)) |
565 | return r; | 603 | return r; |
566 | 604 | ||
605 | /* | ||
606 | * Do not change pte_access if the pfn is a mmio page, otherwise | ||
607 | * we will cache the incorrect access into mmio spte. | ||
608 | */ | ||
609 | if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && | ||
610 | !is_write_protection(vcpu) && !user_fault && | ||
611 | !is_noslot_pfn(pfn)) { | ||
612 | walker.pte_access |= ACC_WRITE_MASK; | ||
613 | walker.pte_access &= ~ACC_USER_MASK; | ||
614 | |||
615 | /* | ||
616 | * If we converted a user page to a kernel page, | ||
617 | * so that the kernel can write to it when cr0.wp=0, | ||
618 | * then we should prevent the kernel from executing it | ||
619 | * if SMEP is enabled. | ||
620 | */ | ||
621 | if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
622 | walker.pte_access &= ~ACC_EXEC_MASK; | ||
623 | } | ||
624 | |||
567 | spin_lock(&vcpu->kvm->mmu_lock); | 625 | spin_lock(&vcpu->kvm->mmu_lock); |
568 | if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) | 626 | if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) |
569 | goto out_unlock; | 627 | goto out_unlock; |
@@ -572,7 +630,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
572 | kvm_mmu_free_some_pages(vcpu); | 630 | kvm_mmu_free_some_pages(vcpu); |
573 | if (!force_pt_level) | 631 | if (!force_pt_level) |
574 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | 632 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); |
575 | r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 633 | r = FNAME(fetch)(vcpu, addr, &walker, write_fault, |
576 | level, pfn, map_writable, prefault); | 634 | level, pfn, map_writable, prefault); |
577 | ++vcpu->stat.pf_fixed; | 635 | ++vcpu->stat.pf_fixed; |
578 | kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); | 636 | kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
@@ -747,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
747 | 805 | ||
748 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; | 806 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; |
749 | 807 | ||
750 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 808 | set_spte(vcpu, &sp->spt[i], pte_access, |
751 | PT_PAGE_TABLE_LEVEL, gfn, | 809 | PT_PAGE_TABLE_LEVEL, gfn, |
752 | spte_to_pfn(sp->spt[i]), true, false, | 810 | spte_to_pfn(sp->spt[i]), true, false, |
753 | host_writable); | 811 | host_writable); |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d29d3cd1c156..e1b1ce21bc00 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
3571 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); | 3571 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
3572 | } | 3572 | } |
3573 | 3573 | ||
3574 | static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) | ||
3575 | { | ||
3576 | return; | ||
3577 | } | ||
3578 | |||
3579 | static int svm_vm_has_apicv(struct kvm *kvm) | ||
3580 | { | ||
3581 | return 0; | ||
3582 | } | ||
3583 | |||
3584 | static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) | ||
3585 | { | ||
3586 | return; | ||
3587 | } | ||
3588 | |||
3589 | static void svm_hwapic_isr_update(struct kvm *kvm, int isr) | ||
3590 | { | ||
3591 | return; | ||
3592 | } | ||
3593 | |||
3574 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) | 3594 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) |
3575 | { | 3595 | { |
3576 | struct vcpu_svm *svm = to_svm(vcpu); | 3596 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
4290 | .enable_nmi_window = enable_nmi_window, | 4310 | .enable_nmi_window = enable_nmi_window, |
4291 | .enable_irq_window = enable_irq_window, | 4311 | .enable_irq_window = enable_irq_window, |
4292 | .update_cr8_intercept = update_cr8_intercept, | 4312 | .update_cr8_intercept = update_cr8_intercept, |
4313 | .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, | ||
4314 | .vm_has_apicv = svm_vm_has_apicv, | ||
4315 | .load_eoi_exitmap = svm_load_eoi_exitmap, | ||
4316 | .hwapic_isr_update = svm_hwapic_isr_update, | ||
4293 | 4317 | ||
4294 | .set_tss_addr = svm_set_tss_addr, | 4318 | .set_tss_addr = svm_set_tss_addr, |
4295 | .get_tdp_level = get_npt_level, | 4319 | .get_tdp_level = get_npt_level, |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9120ae1901e4..6667042714cc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -84,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); | |||
84 | static bool __read_mostly fasteoi = 1; | 84 | static bool __read_mostly fasteoi = 1; |
85 | module_param(fasteoi, bool, S_IRUGO); | 85 | module_param(fasteoi, bool, S_IRUGO); |
86 | 86 | ||
87 | static bool __read_mostly enable_apicv_reg_vid; | ||
88 | |||
87 | /* | 89 | /* |
88 | * If nested=1, nested virtualization is supported, i.e., guests may use | 90 | * If nested=1, nested virtualization is supported, i.e., guests may use |
89 | * VMX and be a hypervisor for its own guests. If nested=0, guests may not | 91 | * VMX and be a hypervisor for its own guests. If nested=0, guests may not |
@@ -92,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO); | |||
92 | static bool __read_mostly nested = 0; | 94 | static bool __read_mostly nested = 0; |
93 | module_param(nested, bool, S_IRUGO); | 95 | module_param(nested, bool, S_IRUGO); |
94 | 96 | ||
95 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 97 | #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) |
96 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 98 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) |
97 | #define KVM_GUEST_CR0_MASK \ | ||
98 | (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) | ||
99 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ | ||
100 | (X86_CR0_WP | X86_CR0_NE) | ||
101 | #define KVM_VM_CR0_ALWAYS_ON \ | 99 | #define KVM_VM_CR0_ALWAYS_ON \ |
102 | (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) | 100 | (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) |
103 | #define KVM_CR4_GUEST_OWNED_BITS \ | 101 | #define KVM_CR4_GUEST_OWNED_BITS \ |
@@ -624,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
624 | struct kvm_segment *var, int seg); | 622 | struct kvm_segment *var, int seg); |
625 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | 623 | static void vmx_get_segment(struct kvm_vcpu *vcpu, |
626 | struct kvm_segment *var, int seg); | 624 | struct kvm_segment *var, int seg); |
625 | static bool guest_state_valid(struct kvm_vcpu *vcpu); | ||
626 | static u32 vmx_segment_access_rights(struct kvm_segment *var); | ||
627 | 627 | ||
628 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 628 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
629 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 629 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
@@ -638,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a; | |||
638 | static unsigned long *vmx_io_bitmap_b; | 638 | static unsigned long *vmx_io_bitmap_b; |
639 | static unsigned long *vmx_msr_bitmap_legacy; | 639 | static unsigned long *vmx_msr_bitmap_legacy; |
640 | static unsigned long *vmx_msr_bitmap_longmode; | 640 | static unsigned long *vmx_msr_bitmap_longmode; |
641 | static unsigned long *vmx_msr_bitmap_legacy_x2apic; | ||
642 | static unsigned long *vmx_msr_bitmap_longmode_x2apic; | ||
641 | 643 | ||
642 | static bool cpu_has_load_ia32_efer; | 644 | static bool cpu_has_load_ia32_efer; |
643 | static bool cpu_has_load_perf_global_ctrl; | 645 | static bool cpu_has_load_perf_global_ctrl; |
@@ -762,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | |||
762 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | 764 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; |
763 | } | 765 | } |
764 | 766 | ||
767 | static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) | ||
768 | { | ||
769 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
770 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
771 | } | ||
772 | |||
773 | static inline bool cpu_has_vmx_apic_register_virt(void) | ||
774 | { | ||
775 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
776 | SECONDARY_EXEC_APIC_REGISTER_VIRT; | ||
777 | } | ||
778 | |||
779 | static inline bool cpu_has_vmx_virtual_intr_delivery(void) | ||
780 | { | ||
781 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
782 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; | ||
783 | } | ||
784 | |||
765 | static inline bool cpu_has_vmx_flexpriority(void) | 785 | static inline bool cpu_has_vmx_flexpriority(void) |
766 | { | 786 | { |
767 | return cpu_has_vmx_tpr_shadow() && | 787 | return cpu_has_vmx_tpr_shadow() && |
@@ -1694,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | |||
1694 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 1714 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
1695 | { | 1715 | { |
1696 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); | 1716 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); |
1697 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
1698 | to_vmx(vcpu)->rflags = rflags; | 1717 | to_vmx(vcpu)->rflags = rflags; |
1699 | if (to_vmx(vcpu)->rmode.vm86_active) { | 1718 | if (to_vmx(vcpu)->rmode.vm86_active) { |
1700 | to_vmx(vcpu)->rmode.save_rflags = rflags; | 1719 | to_vmx(vcpu)->rmode.save_rflags = rflags; |
@@ -1820,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | |||
1820 | vmx->guest_msrs[from] = tmp; | 1839 | vmx->guest_msrs[from] = tmp; |
1821 | } | 1840 | } |
1822 | 1841 | ||
1842 | static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) | ||
1843 | { | ||
1844 | unsigned long *msr_bitmap; | ||
1845 | |||
1846 | if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { | ||
1847 | if (is_long_mode(vcpu)) | ||
1848 | msr_bitmap = vmx_msr_bitmap_longmode_x2apic; | ||
1849 | else | ||
1850 | msr_bitmap = vmx_msr_bitmap_legacy_x2apic; | ||
1851 | } else { | ||
1852 | if (is_long_mode(vcpu)) | ||
1853 | msr_bitmap = vmx_msr_bitmap_longmode; | ||
1854 | else | ||
1855 | msr_bitmap = vmx_msr_bitmap_legacy; | ||
1856 | } | ||
1857 | |||
1858 | vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); | ||
1859 | } | ||
1860 | |||
1823 | /* | 1861 | /* |
1824 | * Set up the vmcs to automatically save and restore system | 1862 | * Set up the vmcs to automatically save and restore system |
1825 | * msrs. Don't touch the 64-bit msrs if the guest is in legacy | 1863 | * msrs. Don't touch the 64-bit msrs if the guest is in legacy |
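vmx_set_msr_bitmap() above picks one of four bitmaps from the (x2apic, long mode) pair, where the x2apic leg additionally requires irqchip_in_kernel(). The same selection written as a 2x2 lookup, purely as an illustration over the pointers this patch introduces:

    /* Sketch: bitmap choice as a table; x2apic stands for the combined
     * irqchip_in_kernel() && apic_x2apic_mode() predicate. */
    static unsigned long *pick_msr_bitmap(bool x2apic, bool longmode)
    {
            unsigned long *tbl[2][2] = {
                    { vmx_msr_bitmap_legacy,        vmx_msr_bitmap_longmode },
                    { vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_longmode_x2apic },
            };
            return tbl[x2apic][longmode];
    }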
@@ -1828,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | |||
1828 | static void setup_msrs(struct vcpu_vmx *vmx) | 1866 | static void setup_msrs(struct vcpu_vmx *vmx) |
1829 | { | 1867 | { |
1830 | int save_nmsrs, index; | 1868 | int save_nmsrs, index; |
1831 | unsigned long *msr_bitmap; | ||
1832 | 1869 | ||
1833 | save_nmsrs = 0; | 1870 | save_nmsrs = 0; |
1834 | #ifdef CONFIG_X86_64 | 1871 | #ifdef CONFIG_X86_64 |
@@ -1860,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) | |||
1860 | 1897 | ||
1861 | vmx->save_nmsrs = save_nmsrs; | 1898 | vmx->save_nmsrs = save_nmsrs; |
1862 | 1899 | ||
1863 | if (cpu_has_vmx_msr_bitmap()) { | 1900 | if (cpu_has_vmx_msr_bitmap()) |
1864 | if (is_long_mode(&vmx->vcpu)) | 1901 | vmx_set_msr_bitmap(&vmx->vcpu); |
1865 | msr_bitmap = vmx_msr_bitmap_longmode; | ||
1866 | else | ||
1867 | msr_bitmap = vmx_msr_bitmap_legacy; | ||
1868 | |||
1869 | vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); | ||
1870 | } | ||
1871 | } | 1902 | } |
1872 | 1903 | ||
1873 | /* | 1904 | /* |
@@ -2533,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
2533 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | 2564 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { |
2534 | min2 = 0; | 2565 | min2 = 0; |
2535 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 2566 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
2567 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
2536 | SECONDARY_EXEC_WBINVD_EXITING | | 2568 | SECONDARY_EXEC_WBINVD_EXITING | |
2537 | SECONDARY_EXEC_ENABLE_VPID | | 2569 | SECONDARY_EXEC_ENABLE_VPID | |
2538 | SECONDARY_EXEC_ENABLE_EPT | | 2570 | SECONDARY_EXEC_ENABLE_EPT | |
2539 | SECONDARY_EXEC_UNRESTRICTED_GUEST | | 2571 | SECONDARY_EXEC_UNRESTRICTED_GUEST | |
2540 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | | 2572 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | |
2541 | SECONDARY_EXEC_RDTSCP | | 2573 | SECONDARY_EXEC_RDTSCP | |
2542 | SECONDARY_EXEC_ENABLE_INVPCID; | 2574 | SECONDARY_EXEC_ENABLE_INVPCID | |
2575 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
2576 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; | ||
2543 | if (adjust_vmx_controls(min2, opt2, | 2577 | if (adjust_vmx_controls(min2, opt2, |
2544 | MSR_IA32_VMX_PROCBASED_CTLS2, | 2578 | MSR_IA32_VMX_PROCBASED_CTLS2, |
2545 | &_cpu_based_2nd_exec_control) < 0) | 2579 | &_cpu_based_2nd_exec_control) < 0) |
@@ -2550,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
2550 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | 2584 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) |
2551 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | 2585 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; |
2552 | #endif | 2586 | #endif |
2587 | |||
2588 | if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
2589 | _cpu_based_2nd_exec_control &= ~( | ||
2590 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
2591 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
2592 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
2593 | |||
2553 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | 2594 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { |
2554 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT | 2595 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT |
2555 | enabled */ | 2596 | enabled */ |
@@ -2747,6 +2788,15 @@ static __init int hardware_setup(void) | |||
2747 | if (!cpu_has_vmx_ple()) | 2788 | if (!cpu_has_vmx_ple()) |
2748 | ple_gap = 0; | 2789 | ple_gap = 0; |
2749 | 2790 | ||
2791 | if (!cpu_has_vmx_apic_register_virt() || | ||
2792 | !cpu_has_vmx_virtual_intr_delivery()) | ||
2793 | enable_apicv_reg_vid = 0; | ||
2794 | |||
2795 | if (enable_apicv_reg_vid) | ||
2796 | kvm_x86_ops->update_cr8_intercept = NULL; | ||
2797 | else | ||
2798 | kvm_x86_ops->hwapic_irr_update = NULL; | ||
2799 | |||
2750 | if (nested) | 2800 | if (nested) |
2751 | nested_vmx_setup_ctls_msrs(); | 2801 | nested_vmx_setup_ctls_msrs(); |
2752 | 2802 | ||
@@ -2758,18 +2808,28 @@ static __exit void hardware_unsetup(void) | |||
2758 | free_kvm_area(); | 2808 | free_kvm_area(); |
2759 | } | 2809 | } |
2760 | 2810 | ||
2761 | static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) | 2811 | static bool emulation_required(struct kvm_vcpu *vcpu) |
2762 | { | 2812 | { |
2763 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2813 | return emulate_invalid_guest_state && !guest_state_valid(vcpu); |
2764 | struct kvm_segment tmp = *save; | 2814 | } |
2765 | 2815 | ||
2766 | if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { | 2816 | static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, |
2767 | tmp.base = vmcs_readl(sf->base); | 2817 | struct kvm_segment *save) |
2768 | tmp.selector = vmcs_read16(sf->selector); | 2818 | { |
2769 | tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; | 2819 | if (!emulate_invalid_guest_state) { |
2770 | tmp.s = 1; | 2820 | /* |
2821 | * CS and SS RPL should be equal during guest entry according ||
2822 | * to the VMX spec, but in reality that is not always so. Since the ||
2823 | * vcpu is in the middle of the transition from real mode to ||
2824 | * protected mode, it is safe to assume that RPL 0 is a good ||
2825 | * default value. ||
2826 | */ | ||
2827 | if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) | ||
2828 | save->selector &= ~SELECTOR_RPL_MASK; | ||
2829 | save->dpl = save->selector & SELECTOR_RPL_MASK; | ||
2830 | save->s = 1; | ||
2771 | } | 2831 | } |
2772 | vmx_set_segment(vcpu, &tmp, seg); | 2832 | vmx_set_segment(vcpu, save, seg); |
2773 | } | 2833 | } |
2774 | 2834 | ||
2775 | static void enter_pmode(struct kvm_vcpu *vcpu) | 2835 | static void enter_pmode(struct kvm_vcpu *vcpu) |
@@ -2777,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
2777 | unsigned long flags; | 2837 | unsigned long flags; |
2778 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2838 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2779 | 2839 | ||
2780 | vmx->emulation_required = 1; | 2840 | /* |
2841 | * Update the real mode segment cache. It may be out of date if a segment ||
2842 | * register was written while the vcpu was in guest mode. ||
2843 | */ | ||
2844 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | ||
2845 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | ||
2846 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | ||
2847 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | ||
2848 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); | ||
2849 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); | ||
2850 | |||
2781 | vmx->rmode.vm86_active = 0; | 2851 | vmx->rmode.vm86_active = 0; |
2782 | 2852 | ||
2783 | vmx_segment_cache_clear(vmx); | 2853 | vmx_segment_cache_clear(vmx); |
@@ -2794,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
2794 | 2864 | ||
2795 | update_exception_bitmap(vcpu); | 2865 | update_exception_bitmap(vcpu); |
2796 | 2866 | ||
2797 | if (emulate_invalid_guest_state) | 2867 | fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); |
2798 | return; | 2868 | fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); |
2799 | 2869 | fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | |
2800 | fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | 2870 | fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); |
2801 | fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); | 2871 | fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); |
2802 | fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | 2872 | fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); |
2803 | fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); | ||
2804 | |||
2805 | vmx_segment_cache_clear(vmx); | ||
2806 | 2873 | ||
2807 | vmcs_write16(GUEST_SS_SELECTOR, 0); | 2874 | /* CPL is always 0 when CPU enters protected mode */ |
2808 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | 2875 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); |
2809 | 2876 | vmx->cpl = 0; | |
2810 | vmcs_write16(GUEST_CS_SELECTOR, | ||
2811 | vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); | ||
2812 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
2813 | } | 2877 | } |
2814 | 2878 | ||
2815 | static gva_t rmode_tss_base(struct kvm *kvm) | 2879 | static gva_t rmode_tss_base(struct kvm *kvm) |
@@ -2831,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm) | |||
2831 | static void fix_rmode_seg(int seg, struct kvm_segment *save) | 2895 | static void fix_rmode_seg(int seg, struct kvm_segment *save) |
2832 | { | 2896 | { |
2833 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2897 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
2834 | 2898 | struct kvm_segment var = *save; | |
2835 | vmcs_write16(sf->selector, save->base >> 4); | 2899 | |
2836 | vmcs_write32(sf->base, save->base & 0xffff0); | 2900 | var.dpl = 0x3; |
2837 | vmcs_write32(sf->limit, 0xffff); | 2901 | if (seg == VCPU_SREG_CS) |
2838 | vmcs_write32(sf->ar_bytes, 0xf3); | 2902 | var.type = 0x3; |
2839 | if (save->base & 0xf) | 2903 | |
2840 | printk_once(KERN_WARNING "kvm: segment base is not paragraph" | 2904 | if (!emulate_invalid_guest_state) { |
2841 | " aligned when entering protected mode (seg=%d)", | 2905 | var.selector = var.base >> 4; |
2842 | seg); | 2906 | var.base = var.base & 0xffff0; |
2907 | var.limit = 0xffff; | ||
2908 | var.g = 0; | ||
2909 | var.db = 0; | ||
2910 | var.present = 1; | ||
2911 | var.s = 1; | ||
2912 | var.l = 0; | ||
2913 | var.unusable = 0; | ||
2914 | var.type = 0x3; | ||
2915 | var.avl = 0; | ||
2916 | if (save->base & 0xf) | ||
2917 | printk_once(KERN_WARNING "kvm: segment base is not " | ||
2918 | "paragraph aligned when entering " | ||
2919 | "protected mode (seg=%d)", seg); | ||
2920 | } | ||
2921 | |||
2922 | vmcs_write16(sf->selector, var.selector); | ||
2923 | vmcs_write32(sf->base, var.base); | ||
2924 | vmcs_write32(sf->limit, var.limit); | ||
2925 | vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); | ||
2843 | } | 2926 | } |
2844 | 2927 | ||
2845 | static void enter_rmode(struct kvm_vcpu *vcpu) | 2928 | static void enter_rmode(struct kvm_vcpu *vcpu) |
2846 | { | 2929 | { |
2847 | unsigned long flags; | 2930 | unsigned long flags; |
2848 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2931 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2849 | struct kvm_segment var; | ||
2850 | |||
2851 | if (enable_unrestricted_guest) | ||
2852 | return; | ||
2853 | 2932 | ||
2854 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); | 2933 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); |
2855 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | 2934 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); |
2856 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | 2935 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); |
2857 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | 2936 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); |
2858 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | 2937 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); |
2938 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); | ||
2939 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); | ||
2859 | 2940 | ||
2860 | vmx->emulation_required = 1; | ||
2861 | vmx->rmode.vm86_active = 1; | 2941 | vmx->rmode.vm86_active = 1; |
2862 | 2942 | ||
2863 | |||
2864 | /* | 2943 | /* |
2865 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering | 2944 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering |
2866 | * vcpu. Call it here with phys address pointing 16M below 4G. | 2945 | * vcpu. Call it here with phys address pointing 16M below 4G. |
@@ -2888,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
2888 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | 2967 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); |
2889 | update_exception_bitmap(vcpu); | 2968 | update_exception_bitmap(vcpu); |
2890 | 2969 | ||
2891 | if (emulate_invalid_guest_state) | 2970 | fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); |
2892 | goto continue_rmode; | 2971 | fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); |
2893 | 2972 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | |
2894 | vmx_get_segment(vcpu, &var, VCPU_SREG_SS); | 2973 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); |
2895 | vmx_set_segment(vcpu, &var, VCPU_SREG_SS); | 2974 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); |
2896 | 2975 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | |
2897 | vmx_get_segment(vcpu, &var, VCPU_SREG_CS); | ||
2898 | vmx_set_segment(vcpu, &var, VCPU_SREG_CS); | ||
2899 | |||
2900 | vmx_get_segment(vcpu, &var, VCPU_SREG_ES); | ||
2901 | vmx_set_segment(vcpu, &var, VCPU_SREG_ES); | ||
2902 | |||
2903 | vmx_get_segment(vcpu, &var, VCPU_SREG_DS); | ||
2904 | vmx_set_segment(vcpu, &var, VCPU_SREG_DS); | ||
2905 | 2976 | ||
2906 | vmx_get_segment(vcpu, &var, VCPU_SREG_GS); | ||
2907 | vmx_set_segment(vcpu, &var, VCPU_SREG_GS); | ||
2908 | |||
2909 | vmx_get_segment(vcpu, &var, VCPU_SREG_FS); | ||
2910 | vmx_set_segment(vcpu, &var, VCPU_SREG_FS); | ||
2911 | |||
2912 | continue_rmode: | ||
2913 | kvm_mmu_reset_context(vcpu); | 2977 | kvm_mmu_reset_context(vcpu); |
2914 | } | 2978 | } |
2915 | 2979 | ||
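fix_rmode_seg() above synthesizes vm86-compatible segments from the saved protected-mode state, relying on the real-mode rule that a selector addresses memory at selector * 16. A small standalone sketch of that arithmetic and of why paragraph alignment matters:

    #include <stdint.h>
    #include <stdbool.h>

    /* Real mode resolves selector:offset as (selector << 4) + offset, so a
     * vm86 segment is consistent only when base == selector << 4. */
    static uint16_t rmode_selector(uint32_t base)
    {
            return (uint16_t)(base >> 4);      /* as in fix_rmode_seg() */
    }

    static bool paragraph_aligned(uint32_t base)
    {
            return (base & 0xf) == 0;  /* the low nibble is lost by >> 4 */
    }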
@@ -3068,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
3068 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3132 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3069 | unsigned long hw_cr0; | 3133 | unsigned long hw_cr0; |
3070 | 3134 | ||
3135 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); | ||
3071 | if (enable_unrestricted_guest) | 3136 | if (enable_unrestricted_guest) |
3072 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) | 3137 | hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; |
3073 | | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; | 3138 | else { |
3074 | else | 3139 | hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; |
3075 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; | ||
3076 | 3140 | ||
3077 | if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) | 3141 | if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) |
3078 | enter_pmode(vcpu); | 3142 | enter_pmode(vcpu); |
3079 | 3143 | ||
3080 | if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) | 3144 | if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) |
3081 | enter_rmode(vcpu); | 3145 | enter_rmode(vcpu); |
3146 | } | ||
3082 | 3147 | ||
3083 | #ifdef CONFIG_X86_64 | 3148 | #ifdef CONFIG_X86_64 |
3084 | if (vcpu->arch.efer & EFER_LME) { | 3149 | if (vcpu->arch.efer & EFER_LME) { |
@@ -3098,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
3098 | vmcs_writel(CR0_READ_SHADOW, cr0); | 3163 | vmcs_writel(CR0_READ_SHADOW, cr0); |
3099 | vmcs_writel(GUEST_CR0, hw_cr0); | 3164 | vmcs_writel(GUEST_CR0, hw_cr0); |
3100 | vcpu->arch.cr0 = cr0; | 3165 | vcpu->arch.cr0 = cr0; |
3101 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | 3166 | |
3167 | /* depends on vcpu->arch.cr0 being set to the new value */ ||
3168 | vmx->emulation_required = emulation_required(vcpu); | ||
3102 | } | 3169 | } |
3103 | 3170 | ||
3104 | static u64 construct_eptp(unsigned long root_hpa) | 3171 | static u64 construct_eptp(unsigned long root_hpa) |
@@ -3155,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
3155 | if (!is_paging(vcpu)) { | 3222 | if (!is_paging(vcpu)) { |
3156 | hw_cr4 &= ~X86_CR4_PAE; | 3223 | hw_cr4 &= ~X86_CR4_PAE; |
3157 | hw_cr4 |= X86_CR4_PSE; | 3224 | hw_cr4 |= X86_CR4_PSE; |
3225 | /* | ||
3226 | * SMEP is disabled if the CPU is in non-paging mode in ||
3227 | * hardware. However, KVM always uses paging mode to ||
3228 | * emulate guest non-paging mode with TDP. ||
3229 | * To emulate this behavior, SMEP needs to be manually ||
3230 | * disabled when the guest switches to non-paging mode. ||
3231 | */ | ||
3232 | hw_cr4 &= ~X86_CR4_SMEP; | ||
3158 | } else if (!(cr4 & X86_CR4_PAE)) { | 3233 | } else if (!(cr4 & X86_CR4_PAE)) { |
3159 | hw_cr4 &= ~X86_CR4_PAE; | 3234 | hw_cr4 &= ~X86_CR4_PAE; |
3160 | } | 3235 | } |
@@ -3171,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
3171 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3246 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3172 | u32 ar; | 3247 | u32 ar; |
3173 | 3248 | ||
3174 | if (vmx->rmode.vm86_active | 3249 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { |
3175 | && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES | ||
3176 | || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS | ||
3177 | || seg == VCPU_SREG_GS)) { | ||
3178 | *var = vmx->rmode.segs[seg]; | 3250 | *var = vmx->rmode.segs[seg]; |
3179 | if (seg == VCPU_SREG_TR | 3251 | if (seg == VCPU_SREG_TR |
3180 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) | 3252 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) |
@@ -3187,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
3187 | var->limit = vmx_read_guest_seg_limit(vmx, seg); | 3259 | var->limit = vmx_read_guest_seg_limit(vmx, seg); |
3188 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | 3260 | var->selector = vmx_read_guest_seg_selector(vmx, seg); |
3189 | ar = vmx_read_guest_seg_ar(vmx, seg); | 3261 | ar = vmx_read_guest_seg_ar(vmx, seg); |
3190 | if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) | ||
3191 | ar = 0; | ||
3192 | var->type = ar & 15; | 3262 | var->type = ar & 15; |
3193 | var->s = (ar >> 4) & 1; | 3263 | var->s = (ar >> 4) & 1; |
3194 | var->dpl = (ar >> 5) & 3; | 3264 | var->dpl = (ar >> 5) & 3; |
@@ -3211,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | |||
3211 | return vmx_read_guest_seg_base(to_vmx(vcpu), seg); | 3281 | return vmx_read_guest_seg_base(to_vmx(vcpu), seg); |
3212 | } | 3282 | } |
3213 | 3283 | ||
3214 | static int __vmx_get_cpl(struct kvm_vcpu *vcpu) | 3284 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) |
3215 | { | 3285 | { |
3286 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
3287 | |||
3216 | if (!is_protmode(vcpu)) | 3288 | if (!is_protmode(vcpu)) |
3217 | return 0; | 3289 | return 0; |
3218 | 3290 | ||
@@ -3220,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) | |||
3220 | && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ | 3292 | && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ |
3221 | return 3; | 3293 | return 3; |
3222 | 3294 | ||
3223 | return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; | ||
3224 | } | ||
3225 | |||
3226 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | ||
3227 | { | ||
3228 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
3229 | |||
3230 | /* | ||
3231 | * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations | ||
3232 | * fail; use the cache instead. | ||
3233 | */ | ||
3234 | if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { | ||
3235 | return vmx->cpl; | ||
3236 | } | ||
3237 | |||
3238 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { | 3295 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { |
3239 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | 3296 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); |
3240 | vmx->cpl = __vmx_get_cpl(vcpu); | 3297 | vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; |
3241 | } | 3298 | } |
3242 | 3299 | ||
3243 | return vmx->cpl; | 3300 | return vmx->cpl; |
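The simplified vmx_get_cpl() above is a read-through cache: a set VCPU_EXREG_CPL bit in regs_avail marks vmx->cpl as current, and writers of CS (see vmx_set_segment() below) clear the bit to force a re-read of the CS selector's RPL. A sketch of the pattern, with illustrative names:

    #include <stdbool.h>

    struct cpl_cache {
            bool avail;     /* stands in for the VCPU_EXREG_CPL bit */
            int cpl;
    };

    static int get_cpl(struct cpl_cache *c, int (*read_cs_selector)(void))
    {
            if (!c->avail) {
                    c->cpl = read_cs_selector() & 3;  /* selector RPL */
                    c->avail = true;
            }
            return c->cpl;
    }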
@@ -3269,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3269 | { | 3326 | { |
3270 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3327 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3271 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 3328 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
3272 | u32 ar; | ||
3273 | 3329 | ||
3274 | vmx_segment_cache_clear(vmx); | 3330 | vmx_segment_cache_clear(vmx); |
3331 | if (seg == VCPU_SREG_CS) | ||
3332 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
3275 | 3333 | ||
3276 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { | 3334 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { |
3277 | vmcs_write16(sf->selector, var->selector); | 3335 | vmx->rmode.segs[seg] = *var; |
3278 | vmx->rmode.segs[VCPU_SREG_TR] = *var; | 3336 | if (seg == VCPU_SREG_TR) |
3279 | return; | 3337 | vmcs_write16(sf->selector, var->selector); |
3338 | else if (var->s) | ||
3339 | fix_rmode_seg(seg, &vmx->rmode.segs[seg]); | ||
3340 | goto out; | ||
3280 | } | 3341 | } |
3342 | |||
3281 | vmcs_writel(sf->base, var->base); | 3343 | vmcs_writel(sf->base, var->base); |
3282 | vmcs_write32(sf->limit, var->limit); | 3344 | vmcs_write32(sf->limit, var->limit); |
3283 | vmcs_write16(sf->selector, var->selector); | 3345 | vmcs_write16(sf->selector, var->selector); |
3284 | if (vmx->rmode.vm86_active && var->s) { | ||
3285 | vmx->rmode.segs[seg] = *var; | ||
3286 | /* | ||
3287 | * Hack real-mode segments into vm86 compatibility. | ||
3288 | */ | ||
3289 | if (var->base == 0xffff0000 && var->selector == 0xf000) | ||
3290 | vmcs_writel(sf->base, 0xf0000); | ||
3291 | ar = 0xf3; | ||
3292 | } else | ||
3293 | ar = vmx_segment_access_rights(var); | ||
3294 | 3346 | ||
3295 | /* | 3347 | /* |
3296 | * Fix the "Accessed" bit in AR field of segment registers for older | 3348 | * Fix the "Accessed" bit in AR field of segment registers for older |
@@ -3304,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3304 | * kvm hack. | 3356 | * kvm hack. |
3305 | */ | 3357 | */ |
3306 | if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) | 3358 | if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) |
3307 | ar |= 0x1; /* Accessed */ | 3359 | var->type |= 0x1; /* Accessed */ |
3308 | 3360 | ||
3309 | vmcs_write32(sf->ar_bytes, ar); | 3361 | vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); |
3310 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
3311 | 3362 | ||
3312 | /* | 3363 | out: |
3313 | * Fix segments for real mode guest in hosts that don't have | 3364 | vmx->emulation_required |= emulation_required(vcpu); |
3314 | * "unrestricted_mode" or it was disabled. | ||
3315 | * This is done to allow migration of the guests from hosts with | ||
3316 | * unrestricted guest like Westmere to older host that don't have | ||
3317 | * unrestricted guest like Nehelem. | ||
3318 | */ | ||
3319 | if (vmx->rmode.vm86_active) { | ||
3320 | switch (seg) { | ||
3321 | case VCPU_SREG_CS: | ||
3322 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | ||
3323 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
3324 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
3325 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
3326 | vmcs_write16(GUEST_CS_SELECTOR, | ||
3327 | vmcs_readl(GUEST_CS_BASE) >> 4); | ||
3328 | break; | ||
3329 | case VCPU_SREG_ES: | ||
3330 | case VCPU_SREG_DS: | ||
3331 | case VCPU_SREG_GS: | ||
3332 | case VCPU_SREG_FS: | ||
3333 | fix_rmode_seg(seg, &vmx->rmode.segs[seg]); | ||
3334 | break; | ||
3335 | case VCPU_SREG_SS: | ||
3336 | vmcs_write16(GUEST_SS_SELECTOR, | ||
3337 | vmcs_readl(GUEST_SS_BASE) >> 4); | ||
3338 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | ||
3339 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | ||
3340 | break; | ||
3341 | } | ||
3342 | } | ||
3343 | } | 3365 | } |
3344 | 3366 | ||
3345 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 3367 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
@@ -3380,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | |||
3380 | u32 ar; | 3402 | u32 ar; |
3381 | 3403 | ||
3382 | vmx_get_segment(vcpu, &var, seg); | 3404 | vmx_get_segment(vcpu, &var, seg); |
3405 | var.dpl = 0x3; | ||
3406 | if (seg == VCPU_SREG_CS) | ||
3407 | var.type = 0x3; | ||
3383 | ar = vmx_segment_access_rights(&var); | 3408 | ar = vmx_segment_access_rights(&var); |
3384 | 3409 | ||
3385 | if (var.base != (var.selector << 4)) | 3410 | if (var.base != (var.selector << 4)) |
3386 | return false; | 3411 | return false; |
3387 | if (var.limit < 0xffff) | 3412 | if (var.limit != 0xffff) |
3388 | return false; | 3413 | return false; |
3389 | if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) | 3414 | if (ar != 0xf3) |
3390 | return false; | 3415 | return false; |
3391 | 3416 | ||
3392 | return true; | 3417 | return true; |
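With DPL and type normalized first, the check above accepts exactly one access-rights value, 0xf3. Decoding it with the same bit positions vmx_get_segment() reads (a sketch):

    #include <stdio.h>

    /* Decode the low byte of a VMX access-rights field, using the same
     * bit positions vmx_get_segment() reads above. */
    static void decode_ar(unsigned int ar)
    {
            printf("type=%u s=%u dpl=%u present=%u\n",
                   ar & 15, (ar >> 4) & 1, (ar >> 5) & 3, (ar >> 7) & 1);
    }
    /* decode_ar(0xf3) -> type=3 s=1 dpl=3 present=1: an accessed,
     * writable data segment at DPL 3, the only shape a vm86 real-mode
     * segment may take. */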
@@ -3521,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | |||
3521 | */ | 3546 | */ |
3522 | static bool guest_state_valid(struct kvm_vcpu *vcpu) | 3547 | static bool guest_state_valid(struct kvm_vcpu *vcpu) |
3523 | { | 3548 | { |
3549 | if (enable_unrestricted_guest) | ||
3550 | return true; | ||
3551 | |||
3524 | /* real mode guest state checks */ | 3552 | /* real mode guest state checks */ |
3525 | if (!is_protmode(vcpu)) { | 3553 | if (!is_protmode(vcpu)) { |
3526 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) | 3554 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) |
@@ -3644,12 +3672,9 @@ static void seg_setup(int seg) | |||
3644 | vmcs_write16(sf->selector, 0); | 3672 | vmcs_write16(sf->selector, 0); |
3645 | vmcs_writel(sf->base, 0); | 3673 | vmcs_writel(sf->base, 0); |
3646 | vmcs_write32(sf->limit, 0xffff); | 3674 | vmcs_write32(sf->limit, 0xffff); |
3647 | if (enable_unrestricted_guest) { | 3675 | ar = 0x93; |
3648 | ar = 0x93; | 3676 | if (seg == VCPU_SREG_CS) |
3649 | if (seg == VCPU_SREG_CS) | 3677 | ar |= 0x08; /* code segment */ |
3650 | ar |= 0x08; /* code segment */ | ||
3651 | } else | ||
3652 | ar = 0xf3; | ||
3653 | 3678 | ||
3654 | vmcs_write32(sf->ar_bytes, ar); | 3679 | vmcs_write32(sf->ar_bytes, ar); |
3655 | } | 3680 | } |
@@ -3667,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm) | |||
3667 | kvm_userspace_mem.flags = 0; | 3692 | kvm_userspace_mem.flags = 0; |
3668 | kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; | 3693 | kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; |
3669 | kvm_userspace_mem.memory_size = PAGE_SIZE; | 3694 | kvm_userspace_mem.memory_size = PAGE_SIZE; |
3670 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | 3695 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); |
3671 | if (r) | 3696 | if (r) |
3672 | goto out; | 3697 | goto out; |
3673 | 3698 | ||
@@ -3697,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) | |||
3697 | kvm_userspace_mem.guest_phys_addr = | 3722 | kvm_userspace_mem.guest_phys_addr = |
3698 | kvm->arch.ept_identity_map_addr; | 3723 | kvm->arch.ept_identity_map_addr; |
3699 | kvm_userspace_mem.memory_size = PAGE_SIZE; | 3724 | kvm_userspace_mem.memory_size = PAGE_SIZE; |
3700 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | 3725 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); |
3701 | if (r) | 3726 | if (r) |
3702 | goto out; | 3727 | goto out; |
3703 | 3728 | ||
@@ -3739,7 +3764,10 @@ static void free_vpid(struct vcpu_vmx *vmx) | |||
3739 | spin_unlock(&vmx_vpid_lock); | 3764 | spin_unlock(&vmx_vpid_lock); |
3740 | } | 3765 | } |
3741 | 3766 | ||
3742 | static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) | 3767 | #define MSR_TYPE_R 1 |
3768 | #define MSR_TYPE_W 2 | ||
3769 | static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, | ||
3770 | u32 msr, int type) | ||
3743 | { | 3771 | { |
3744 | int f = sizeof(unsigned long); | 3772 | int f = sizeof(unsigned long); |
3745 | 3773 | ||
@@ -3752,20 +3780,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) | |||
3752 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | 3780 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. |
3753 | */ | 3781 | */ |
3754 | if (msr <= 0x1fff) { | 3782 | if (msr <= 0x1fff) { |
3755 | __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ | 3783 | if (type & MSR_TYPE_R) |
3756 | __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ | 3784 | /* read-low */ |
3785 | __clear_bit(msr, msr_bitmap + 0x000 / f); | ||
3786 | |||
3787 | if (type & MSR_TYPE_W) | ||
3788 | /* write-low */ | ||
3789 | __clear_bit(msr, msr_bitmap + 0x800 / f); | ||
3790 | |||
3757 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | 3791 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { |
3758 | msr &= 0x1fff; | 3792 | msr &= 0x1fff; |
3759 | __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ | 3793 | if (type & MSR_TYPE_R) |
3760 | __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ | 3794 | /* read-high */ |
3795 | __clear_bit(msr, msr_bitmap + 0x400 / f); | ||
3796 | |||
3797 | if (type & MSR_TYPE_W) | ||
3798 | /* write-high */ | ||
3799 | __clear_bit(msr, msr_bitmap + 0xc00 / f); | ||
3800 | |||
3801 | } | ||
3802 | } | ||
3803 | |||
3804 | static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, | ||
3805 | u32 msr, int type) | ||
3806 | { | ||
3807 | int f = sizeof(unsigned long); | ||
3808 | |||
3809 | if (!cpu_has_vmx_msr_bitmap()) | ||
3810 | return; | ||
3811 | |||
3812 | /* | ||
3813 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
3814 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
3815 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
3816 | */ | ||
3817 | if (msr <= 0x1fff) { | ||
3818 | if (type & MSR_TYPE_R) | ||
3819 | /* read-low */ | ||
3820 | __set_bit(msr, msr_bitmap + 0x000 / f); | ||
3821 | |||
3822 | if (type & MSR_TYPE_W) | ||
3823 | /* write-low */ | ||
3824 | __set_bit(msr, msr_bitmap + 0x800 / f); | ||
3825 | |||
3826 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
3827 | msr &= 0x1fff; | ||
3828 | if (type & MSR_TYPE_R) | ||
3829 | /* read-high */ | ||
3830 | __set_bit(msr, msr_bitmap + 0x400 / f); | ||
3831 | |||
3832 | if (type & MSR_TYPE_W) | ||
3833 | /* write-high */ | ||
3834 | __set_bit(msr, msr_bitmap + 0xc00 / f); | ||
3835 | |||
3761 | } | 3836 | } |
3762 | } | 3837 | } |
3763 | 3838 | ||
3764 | static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) | 3839 | static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) |
3765 | { | 3840 | { |
3766 | if (!longmode_only) | 3841 | if (!longmode_only) |
3767 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); | 3842 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, |
3768 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); | 3843 | msr, MSR_TYPE_R | MSR_TYPE_W); |
3844 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, | ||
3845 | msr, MSR_TYPE_R | MSR_TYPE_W); | ||
3846 | } | ||
3847 | |||
3848 | static void vmx_enable_intercept_msr_read_x2apic(u32 msr) | ||
3849 | { | ||
3850 | __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, | ||
3851 | msr, MSR_TYPE_R); | ||
3852 | __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, | ||
3853 | msr, MSR_TYPE_R); | ||
3854 | } | ||
3855 | |||
3856 | static void vmx_disable_intercept_msr_read_x2apic(u32 msr) | ||
3857 | { | ||
3858 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, | ||
3859 | msr, MSR_TYPE_R); | ||
3860 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, | ||
3861 | msr, MSR_TYPE_R); | ||
3862 | } | ||
3863 | |||
3864 | static void vmx_disable_intercept_msr_write_x2apic(u32 msr) | ||
3865 | { | ||
3866 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, | ||
3867 | msr, MSR_TYPE_W); | ||
3868 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, | ||
3869 | msr, MSR_TYPE_W); | ||
3769 | } | 3870 | } |
3770 | 3871 | ||
3771 | /* | 3872 | /* |
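The helpers above index a one-page bitmap split into four 1K regions: read-low at 0x000, read-high at 0x400, write-low at 0x800, and write-high at 0xc00. A sketch of the offset computation under that layout:

    #include <stdint.h>

    /* Byte offset of the 1K region covering one (msr, access) pair. */
    static int msr_bitmap_region(uint32_t msr, int is_write)
    {
            if (msr <= 0x1fff)                      /* low MSR range  */
                    return is_write ? 0x800 : 0x000;
            if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                    return is_write ? 0xc00 : 0x400;
            return -1;      /* outside the bitmap: always intercepted */
    }
    /* Within a region, the bit index is msr & 0x1fff. */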
@@ -3844,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) | |||
3844 | return exec_control; | 3945 | return exec_control; |
3845 | } | 3946 | } |
3846 | 3947 | ||
3948 | static int vmx_vm_has_apicv(struct kvm *kvm) | ||
3949 | { | ||
3950 | return enable_apicv_reg_vid && irqchip_in_kernel(kvm); | ||
3951 | } | ||
3952 | |||
3847 | static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | 3953 | static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) |
3848 | { | 3954 | { |
3849 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | 3955 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; |
@@ -3861,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | |||
3861 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | 3967 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; |
3862 | if (!ple_gap) | 3968 | if (!ple_gap) |
3863 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | 3969 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; |
3970 | if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) | ||
3971 | exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
3972 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
3973 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
3864 | return exec_control; | 3974 | return exec_control; |
3865 | } | 3975 | } |
3866 | 3976 | ||
@@ -3905,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
3905 | vmx_secondary_exec_control(vmx)); | 4015 | vmx_secondary_exec_control(vmx)); |
3906 | } | 4016 | } |
3907 | 4017 | ||
4018 | if (enable_apicv_reg_vid) { | ||
4019 | vmcs_write64(EOI_EXIT_BITMAP0, 0); | ||
4020 | vmcs_write64(EOI_EXIT_BITMAP1, 0); | ||
4021 | vmcs_write64(EOI_EXIT_BITMAP2, 0); | ||
4022 | vmcs_write64(EOI_EXIT_BITMAP3, 0); | ||
4023 | |||
4024 | vmcs_write16(GUEST_INTR_STATUS, 0); | ||
4025 | } | ||
4026 | |||
3908 | if (ple_gap) { | 4027 | if (ple_gap) { |
3909 | vmcs_write32(PLE_GAP, ple_gap); | 4028 | vmcs_write32(PLE_GAP, ple_gap); |
3910 | vmcs_write32(PLE_WINDOW, ple_window); | 4029 | vmcs_write32(PLE_WINDOW, ple_window); |
@@ -3990,14 +4109,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
3990 | vmx_segment_cache_clear(vmx); | 4109 | vmx_segment_cache_clear(vmx); |
3991 | 4110 | ||
3992 | seg_setup(VCPU_SREG_CS); | 4111 | seg_setup(VCPU_SREG_CS); |
3993 | /* | 4112 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) |
3994 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
3995 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
3996 | */ | ||
3997 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) { | ||
3998 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | 4113 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); |
3999 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | 4114 | else { |
4000 | } else { | ||
4001 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | 4115 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); |
4002 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | 4116 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); |
4003 | } | 4117 | } |
@@ -4073,9 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
4073 | 4187 | ||
4074 | ret = 0; | 4188 | ret = 0; |
4075 | 4189 | ||
4076 | /* HACK: Don't enable emulation on guest boot/reset */ | ||
4077 | vmx->emulation_required = 0; | ||
4078 | |||
4079 | return ret; | 4190 | return ret; |
4080 | } | 4191 | } |
4081 | 4192 | ||
@@ -4251,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | |||
4251 | .flags = 0, | 4362 | .flags = 0, |
4252 | }; | 4363 | }; |
4253 | 4364 | ||
4254 | ret = kvm_set_memory_region(kvm, &tss_mem, 0); | 4365 | ret = kvm_set_memory_region(kvm, &tss_mem, false); |
4255 | if (ret) | 4366 | if (ret) |
4256 | return ret; | 4367 | return ret; |
4257 | kvm->arch.tss_addr = addr; | 4368 | kvm->arch.tss_addr = addr; |
@@ -4261,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | |||
4261 | return 0; | 4372 | return 0; |
4262 | } | 4373 | } |
4263 | 4374 | ||
4264 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | 4375 | static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) |
4265 | int vec, u32 err_code) | ||
4266 | { | 4376 | { |
4267 | /* | ||
4268 | * Instruction with address size override prefix opcode 0x67 | ||
4269 | * Cause the #SS fault with 0 error code in VM86 mode. | ||
4270 | */ | ||
4271 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | ||
4272 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) | ||
4273 | return 1; | ||
4274 | /* | ||
4275 | * Forward all other exceptions that are valid in real mode. | ||
4276 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
4277 | * the required debugging infrastructure rework. | ||
4278 | */ | ||
4279 | switch (vec) { | 4377 | switch (vec) { |
4280 | case DB_VECTOR: | ||
4281 | if (vcpu->guest_debug & | ||
4282 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
4283 | return 0; | ||
4284 | kvm_queue_exception(vcpu, vec); | ||
4285 | return 1; | ||
4286 | case BP_VECTOR: | 4378 | case BP_VECTOR: |
4287 | /* | 4379 | /* |
4288 | * Update instruction length as we may reinject the exception | 4380 | * Update instruction length as we may reinject the exception |
@@ -4291,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
4291 | to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = | 4383 | to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = |
4292 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 4384 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
4293 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 4385 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
4294 | return 0; | 4386 | return false; |
4387 | /* fall through */ | ||
4388 | case DB_VECTOR: | ||
4389 | if (vcpu->guest_debug & | ||
4390 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
4391 | return false; | ||
4295 | /* fall through */ | 4392 | /* fall through */ |
4296 | case DE_VECTOR: | 4393 | case DE_VECTOR: |
4297 | case OF_VECTOR: | 4394 | case OF_VECTOR: |
@@ -4301,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
4301 | case SS_VECTOR: | 4398 | case SS_VECTOR: |
4302 | case GP_VECTOR: | 4399 | case GP_VECTOR: |
4303 | case MF_VECTOR: | 4400 | case MF_VECTOR: |
4304 | kvm_queue_exception(vcpu, vec); | 4401 | return true; |
4305 | return 1; | 4402 | break; |
4306 | } | 4403 | } |
4307 | return 0; | 4404 | return false; |
4405 | } | ||
4406 | |||
4407 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
4408 | int vec, u32 err_code) | ||
4409 | { | ||
4410 | /* | ||
4411 | * Instructions with the address size override prefix (opcode 0x67) ||
4412 | * cause a #SS fault with error code 0 in VM86 mode. ||
4413 | */ | ||
4414 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { | ||
4415 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { | ||
4416 | if (vcpu->arch.halt_request) { | ||
4417 | vcpu->arch.halt_request = 0; | ||
4418 | return kvm_emulate_halt(vcpu); | ||
4419 | } | ||
4420 | return 1; | ||
4421 | } | ||
4422 | return 0; | ||
4423 | } | ||
4424 | |||
4425 | /* | ||
4426 | * Forward all other exceptions that are valid in real mode. | ||
4427 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
4428 | * the required debugging infrastructure rework. | ||
4429 | */ | ||
4430 | kvm_queue_exception(vcpu, vec); | ||
4431 | return 1; | ||
4308 | } | 4432 | } |
4309 | 4433 | ||
4310 | /* | 4434 | /* |
@@ -4392,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
4392 | return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); | 4516 | return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); |
4393 | } | 4517 | } |
4394 | 4518 | ||
4395 | if (vmx->rmode.vm86_active && | ||
4396 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | ||
4397 | error_code)) { | ||
4398 | if (vcpu->arch.halt_request) { | ||
4399 | vcpu->arch.halt_request = 0; | ||
4400 | return kvm_emulate_halt(vcpu); | ||
4401 | } | ||
4402 | return 1; | ||
4403 | } | ||
4404 | |||
4405 | ex_no = intr_info & INTR_INFO_VECTOR_MASK; | 4519 | ex_no = intr_info & INTR_INFO_VECTOR_MASK; |
4520 | |||
4521 | if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) | ||
4522 | return handle_rmode_exception(vcpu, ex_no, error_code); | ||
4523 | |||
4406 | switch (ex_no) { | 4524 | switch (ex_no) { |
4407 | case DB_VECTOR: | 4525 | case DB_VECTOR: |
4408 | dr6 = vmcs_readl(EXIT_QUALIFICATION); | 4526 | dr6 = vmcs_readl(EXIT_QUALIFICATION); |
@@ -4820,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) | |||
4820 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | 4938 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
4821 | } | 4939 | } |
4822 | 4940 | ||
4941 | static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) | ||
4942 | { | ||
4943 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
4944 | int vector = exit_qualification & 0xff; | ||
4945 | |||
4946 | /* EOI-induced VM exit is trap-like, so there is no need to adjust the IP */ ||
4947 | kvm_apic_set_eoi_accelerated(vcpu, vector); | ||
4948 | return 1; | ||
4949 | } | ||
4950 | |||
4951 | static int handle_apic_write(struct kvm_vcpu *vcpu) | ||
4952 | { | ||
4953 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
4954 | u32 offset = exit_qualification & 0xfff; | ||
4955 | |||
4956 | /* APIC-write VM exit is trap-like, so there is no need to adjust the IP */ ||
4957 | kvm_apic_write_nodecode(vcpu, offset); | ||
4958 | return 1; | ||
4959 | } | ||
4960 | |||
4823 | static int handle_task_switch(struct kvm_vcpu *vcpu) | 4961 | static int handle_task_switch(struct kvm_vcpu *vcpu) |
4824 | { | 4962 | { |
4825 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 4963 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -5065,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
5065 | schedule(); | 5203 | schedule(); |
5066 | } | 5204 | } |
5067 | 5205 | ||
5068 | vmx->emulation_required = !guest_state_valid(vcpu); | 5206 | vmx->emulation_required = emulation_required(vcpu); |
5069 | out: | 5207 | out: |
5070 | return ret; | 5208 | return ret; |
5071 | } | 5209 | } |
@@ -5754,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
5754 | [EXIT_REASON_VMON] = handle_vmon, | 5892 | [EXIT_REASON_VMON] = handle_vmon, |
5755 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 5893 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
5756 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 5894 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
5895 | [EXIT_REASON_APIC_WRITE] = handle_apic_write, | ||
5896 | [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, | ||
5757 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 5897 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
5758 | [EXIT_REASON_XSETBV] = handle_xsetbv, | 5898 | [EXIT_REASON_XSETBV] = handle_xsetbv, |
5759 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | 5899 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, |
@@ -5780,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | |||
5780 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | 5920 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; |
5781 | gpa_t bitmap; | 5921 | gpa_t bitmap; |
5782 | 5922 | ||
5783 | if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) | 5923 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) |
5784 | return 1; | 5924 | return 1; |
5785 | 5925 | ||
5786 | /* | 5926 | /* |
@@ -6008,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
6008 | u32 vectoring_info = vmx->idt_vectoring_info; | 6148 | u32 vectoring_info = vmx->idt_vectoring_info; |
6009 | 6149 | ||
6010 | /* If guest state is invalid, start emulating */ | 6150 | /* If guest state is invalid, start emulating */ |
6011 | if (vmx->emulation_required && emulate_invalid_guest_state) | 6151 | if (vmx->emulation_required) |
6012 | return handle_invalid_guest_state(vcpu); | 6152 | return handle_invalid_guest_state(vcpu); |
6013 | 6153 | ||
6014 | /* | 6154 | /* |
@@ -6103,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
6103 | vmcs_write32(TPR_THRESHOLD, irr); | 6243 | vmcs_write32(TPR_THRESHOLD, irr); |
6104 | } | 6244 | } |
6105 | 6245 | ||
6246 | static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) | ||
6247 | { | ||
6248 | u32 sec_exec_control; | ||
6249 | |||
6250 | /* | ||
6251 | * There is no point in enabling virtualize x2apic mode ||
6252 | * without apicv enabled. ||
6253 | */ | ||
6254 | if (!cpu_has_vmx_virtualize_x2apic_mode() || | ||
6255 | !vmx_vm_has_apicv(vcpu->kvm)) | ||
6256 | return; | ||
6257 | |||
6258 | if (!vm_need_tpr_shadow(vcpu->kvm)) | ||
6259 | return; | ||
6260 | |||
6261 | sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
6262 | |||
6263 | if (set) { | ||
6264 | sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6265 | sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
6266 | } else { | ||
6267 | sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
6268 | sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6269 | } | ||
6270 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); | ||
6271 | |||
6272 | vmx_set_msr_bitmap(vcpu); | ||
6273 | } | ||
6274 | |||
6275 | static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) | ||
6276 | { | ||
6277 | u16 status; | ||
6278 | u8 old; | ||
6279 | |||
6280 | if (!vmx_vm_has_apicv(kvm)) | ||
6281 | return; | ||
6282 | |||
6283 | if (isr == -1) | ||
6284 | isr = 0; | ||
6285 | |||
6286 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
6287 | old = status >> 8; | ||
6288 | if (isr != old) { | ||
6289 | status &= 0xff; | ||
6290 | status |= isr << 8; | ||
6291 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
6292 | } | ||
6293 | } | ||
6294 | |||
6295 | static void vmx_set_rvi(int vector) | ||
6296 | { | ||
6297 | u16 status; | ||
6298 | u8 old; | ||
6299 | |||
6300 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
6301 | old = (u8)status & 0xff; | ||
6302 | if ((u8)vector != old) { | ||
6303 | status &= ~0xff; | ||
6304 | status |= (u8)vector; | ||
6305 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
6306 | } | ||
6307 | } | ||
6308 | |||
6309 | static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) | ||
6310 | { | ||
6311 | if (max_irr == -1) | ||
6312 | return; | ||
6313 | |||
6314 | vmx_set_rvi(max_irr); | ||
6315 | } | ||
6316 | |||
6317 | static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) | ||
6318 | { | ||
6319 | vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); | ||
6320 | vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); | ||
6321 | vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); | ||
6322 | vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); | ||
6323 | } | ||
6324 | |||
6106 | static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) | 6325 | static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) |
6107 | { | 6326 | { |
6108 | u32 exit_intr_info; | 6327 | u32 exit_intr_info; |
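vmx_set_rvi() and vmx_hwapic_isr_update() above each rewrite one half of the 16-bit GUEST_INTR_STATUS field: RVI (the requesting vector) lives in the low byte and SVI (the in-service vector) in the high byte. A sketch of the packing:

    #include <stdint.h>

    /* GUEST_INTR_STATUS: SVI in bits 15:8, RVI in bits 7:0. */
    static uint16_t set_rvi(uint16_t status, uint8_t vector)
    {
            return (status & ~0xffu) | vector;       /* vmx_set_rvi()   */
    }

    static uint16_t set_svi(uint16_t status, uint8_t isr)
    {
            return (status & 0xffu) | ((uint16_t)isr << 8); /* isr update */
    }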
@@ -6291,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6291 | 6510 | ||
6292 | /* Don't enter VMX if guest state is invalid, let the exit handler | 6511 | /* Don't enter VMX if guest state is invalid, let the exit handler |
6293 | start emulation until we arrive back to a valid state */ | 6512 | start emulation until we arrive back to a valid state */ |
6294 | if (vmx->emulation_required && emulate_invalid_guest_state) | 6513 | if (vmx->emulation_required) |
6295 | return; | 6514 | return; |
6296 | 6515 | ||
6297 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) | 6516 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) |
@@ -7366,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
7366 | .enable_nmi_window = enable_nmi_window, | 7585 | .enable_nmi_window = enable_nmi_window, |
7367 | .enable_irq_window = enable_irq_window, | 7586 | .enable_irq_window = enable_irq_window, |
7368 | .update_cr8_intercept = update_cr8_intercept, | 7587 | .update_cr8_intercept = update_cr8_intercept, |
7588 | .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, | ||
7589 | .vm_has_apicv = vmx_vm_has_apicv, | ||
7590 | .load_eoi_exitmap = vmx_load_eoi_exitmap, | ||
7591 | .hwapic_irr_update = vmx_hwapic_irr_update, | ||
7592 | .hwapic_isr_update = vmx_hwapic_isr_update, | ||
7369 | 7593 | ||
7370 | .set_tss_addr = vmx_set_tss_addr, | 7594 | .set_tss_addr = vmx_set_tss_addr, |
7371 | .get_tdp_level = get_ept_level, | 7595 | .get_tdp_level = get_ept_level, |
@@ -7398,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
7398 | 7622 | ||
7399 | static int __init vmx_init(void) | 7623 | static int __init vmx_init(void) |
7400 | { | 7624 | { |
7401 | int r, i; | 7625 | int r, i, msr; |
7402 | 7626 | ||
7403 | rdmsrl_safe(MSR_EFER, &host_efer); | 7627 | rdmsrl_safe(MSR_EFER, &host_efer); |
7404 | 7628 | ||
@@ -7419,11 +7643,19 @@ static int __init vmx_init(void) | |||
7419 | if (!vmx_msr_bitmap_legacy) | 7643 | if (!vmx_msr_bitmap_legacy) |
7420 | goto out1; | 7644 | goto out1; |
7421 | 7645 | ||
7646 | vmx_msr_bitmap_legacy_x2apic = | ||
7647 | (unsigned long *)__get_free_page(GFP_KERNEL); | ||
7648 | if (!vmx_msr_bitmap_legacy_x2apic) | ||
7649 | goto out2; | ||
7422 | 7650 | ||
7423 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); | 7651 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); |
7424 | if (!vmx_msr_bitmap_longmode) | 7652 | if (!vmx_msr_bitmap_longmode) |
7425 | goto out2; | 7653 | goto out3; |
7426 | 7654 | ||
7655 | vmx_msr_bitmap_longmode_x2apic = | ||
7656 | (unsigned long *)__get_free_page(GFP_KERNEL); | ||
7657 | if (!vmx_msr_bitmap_longmode_x2apic) | ||
7658 | goto out4; | ||
7427 | 7659 | ||
7428 | /* | 7660 | /* |
7429 | * Allow direct access to the PC debug port (it is often used for I/O | 7661 | * Allow direct access to the PC debug port (it is often used for I/O |
@@ -7455,6 +7687,28 @@ static int __init vmx_init(void) | |||
7455 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); | 7687 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); |
7456 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); | 7688 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); |
7457 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | 7689 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); |
7690 | memcpy(vmx_msr_bitmap_legacy_x2apic, | ||
7691 | vmx_msr_bitmap_legacy, PAGE_SIZE); | ||
7692 | memcpy(vmx_msr_bitmap_longmode_x2apic, | ||
7693 | vmx_msr_bitmap_longmode, PAGE_SIZE); | ||
7694 | |||
7695 | if (enable_apicv_reg_vid) { | ||
7696 | for (msr = 0x800; msr <= 0x8ff; msr++) | ||
7697 | vmx_disable_intercept_msr_read_x2apic(msr); | ||
7698 | |||
7699 | /* According to the SDM, in x2apic mode the whole id reg is | ||
7700 | * used, but KVM only uses the highest eight bits, so reads | ||
7701 | * must be intercepted. */ | ||
7702 | vmx_enable_intercept_msr_read_x2apic(0x802); | ||
7703 | /* TMCCT */ | ||
7704 | vmx_enable_intercept_msr_read_x2apic(0x839); | ||
7705 | /* TPR */ | ||
7706 | vmx_disable_intercept_msr_write_x2apic(0x808); | ||
7707 | /* EOI */ | ||
7708 | vmx_disable_intercept_msr_write_x2apic(0x80b); | ||
7709 | /* SELF-IPI */ | ||
7710 | vmx_disable_intercept_msr_write_x2apic(0x83f); | ||
7711 | } | ||
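[Editor's note] The MSR constants above follow the architectural x2APIC numbering: the register at xAPIC MMIO offset off becomes MSR 0x800 + (off >> 4). A sketch that reproduces every literal in the block (function name hypothetical):

static unsigned int x2apic_msr(unsigned int mmio_offset)
{
	return 0x800 + (mmio_offset >> 4);
}

/*
 * x2apic_msr(0x020) == 0x802   APIC ID
 * x2apic_msr(0x080) == 0x808   TPR
 * x2apic_msr(0x0b0) == 0x80b   EOI
 * x2apic_msr(0x390) == 0x839   TMCCT
 * x2apic_msr(0x3f0) == 0x83f   SELF-IPI (x2APIC-only register)
 */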
7458 | 7712 | ||
7459 | if (enable_ept) { | 7713 | if (enable_ept) { |
7460 | kvm_mmu_set_mask_ptes(0ull, | 7714 | kvm_mmu_set_mask_ptes(0ull, |
@@ -7468,8 +7722,10 @@ static int __init vmx_init(void) | |||
7468 | 7722 | ||
7469 | return 0; | 7723 | return 0; |
7470 | 7724 | ||
7471 | out3: | 7725 | out4: |
7472 | free_page((unsigned long)vmx_msr_bitmap_longmode); | 7726 | free_page((unsigned long)vmx_msr_bitmap_longmode); |
7727 | out3: | ||
7728 | free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); | ||
7473 | out2: | 7729 | out2: |
7474 | free_page((unsigned long)vmx_msr_bitmap_legacy); | 7730 | free_page((unsigned long)vmx_msr_bitmap_legacy); |
7475 | out1: | 7731 | out1: |
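[Editor's note] The relabelling above (out2 -> out3, out3 -> out4) preserves the kernel's standard error-unwind idiom: one label per acquired resource, released in reverse order of acquisition, so inserting an allocation in the middle renumbers every later label. A minimal sketch of the idiom with placeholder allocations (all names hypothetical):

#include <stdlib.h>
#include <errno.h>

static void *a, *b;

static int init_sketch(void)
{
	a = malloc(64);		/* stand-in for __get_free_page() */
	if (!a)
		goto out;
	b = malloc(64);
	if (!b)
		goto out1;	/* unwind everything acquired so far */
	return 0;		/* success: a and b stay allocated */

out1:
	free(a);
out:
	return -ENOMEM;
}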
@@ -7481,6 +7737,8 @@ out: | |||
7481 | 7737 | ||
7482 | static void __exit vmx_exit(void) | 7738 | static void __exit vmx_exit(void) |
7483 | { | 7739 | { |
7740 | free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); | ||
7741 | free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); | ||
7484 | free_page((unsigned long)vmx_msr_bitmap_legacy); | 7742 | free_page((unsigned long)vmx_msr_bitmap_legacy); |
7485 | free_page((unsigned long)vmx_msr_bitmap_longmode); | 7743 | free_page((unsigned long)vmx_msr_bitmap_longmode); |
7486 | free_page((unsigned long)vmx_io_bitmap_b); | 7744 | free_page((unsigned long)vmx_io_bitmap_b); |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 37040079cd6b..f71500af1f81 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
872 | 872 | ||
873 | kvm_x86_ops->set_efer(vcpu, efer); | 873 | kvm_x86_ops->set_efer(vcpu, efer); |
874 | 874 | ||
875 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; | ||
876 | |||
877 | /* Update reserved bits */ | 875 | /* Update reserved bits */ |
878 | if ((efer ^ old_efer) & EFER_NX) | 876 | if ((efer ^ old_efer) & EFER_NX) |
879 | kvm_mmu_reset_context(vcpu); | 877 | kvm_mmu_reset_context(vcpu); |
@@ -2522,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2522 | r = KVM_MAX_VCPUS; | 2520 | r = KVM_MAX_VCPUS; |
2523 | break; | 2521 | break; |
2524 | case KVM_CAP_NR_MEMSLOTS: | 2522 | case KVM_CAP_NR_MEMSLOTS: |
2525 | r = KVM_MEMORY_SLOTS; | 2523 | r = KVM_USER_MEM_SLOTS; |
2526 | break; | 2524 | break; |
2527 | case KVM_CAP_PV_MMU: /* obsolete */ | 2525 | case KVM_CAP_PV_MMU: /* obsolete */ |
2528 | r = 0; | 2526 | r = 0; |
@@ -3274,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | |||
3274 | return -EINVAL; | 3272 | return -EINVAL; |
3275 | 3273 | ||
3276 | mutex_lock(&kvm->slots_lock); | 3274 | mutex_lock(&kvm->slots_lock); |
3277 | spin_lock(&kvm->mmu_lock); | ||
3278 | 3275 | ||
3279 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); | 3276 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); |
3280 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; | 3277 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; |
3281 | 3278 | ||
3282 | spin_unlock(&kvm->mmu_lock); | ||
3283 | mutex_unlock(&kvm->slots_lock); | 3279 | mutex_unlock(&kvm->slots_lock); |
3284 | return 0; | 3280 | return 0; |
3285 | } | 3281 | } |
@@ -3439,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | |||
3439 | mutex_lock(&kvm->slots_lock); | 3435 | mutex_lock(&kvm->slots_lock); |
3440 | 3436 | ||
3441 | r = -EINVAL; | 3437 | r = -EINVAL; |
3442 | if (log->slot >= KVM_MEMORY_SLOTS) | 3438 | if (log->slot >= KVM_USER_MEM_SLOTS) |
3443 | goto out; | 3439 | goto out; |
3444 | 3440 | ||
3445 | memslot = id_to_memslot(kvm->memslots, log->slot); | 3441 | memslot = id_to_memslot(kvm->memslots, log->slot); |
@@ -4495,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, | |||
4495 | kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); | 4491 | kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); |
4496 | *selector = var.selector; | 4492 | *selector = var.selector; |
4497 | 4493 | ||
4498 | if (var.unusable) | 4494 | if (var.unusable) { |
4495 | memset(desc, 0, sizeof(*desc)); | ||
4499 | return false; | 4496 | return false; |
4497 | } | ||
4500 | 4498 | ||
4501 | if (var.g) | 4499 | if (var.g) |
4502 | var.limit >>= 12; | 4500 | var.limit >>= 12; |
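[Editor's note] The var.limit >>= 12 above undoes descriptor granularity: when a segment's G bit is set, the 20-bit limit field counts 4 KiB units, so converting the byte-granular limit KVM tracks back into descriptor form drops the low 12 bits. A one-line sketch (function name hypothetical):

static unsigned int desc_limit_field(unsigned int byte_limit, int g_bit)
{
	return g_bit ? byte_limit >> 12 : byte_limit;	/* 4 KiB units if G=1 */
}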
@@ -4757,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) | |||
4757 | return r; | 4755 | return r; |
4758 | } | 4756 | } |
4759 | 4757 | ||
4760 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | 4758 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, |
4759 | bool write_fault_to_shadow_pgtable) | ||
4761 | { | 4760 | { |
4762 | gpa_t gpa; | 4761 | gpa_t gpa = cr2; |
4763 | pfn_t pfn; | 4762 | pfn_t pfn; |
4764 | 4763 | ||
4765 | if (tdp_enabled) | 4764 | if (!vcpu->arch.mmu.direct_map) { |
4766 | return false; | 4765 | /* |
4767 | 4766 | * Write permission should be allowed since only | |
4768 | /* | 4767 | * write accesses need to be emulated.
4769 | * if emulation was due to access to shadowed page table | 4768 | */ |
4770 | * and it failed try to unshadow page and re-enter the | 4769 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); |
4771 | * guest to let CPU execute the instruction. | ||
4772 | */ | ||
4773 | if (kvm_mmu_unprotect_page_virt(vcpu, gva)) | ||
4774 | return true; | ||
4775 | |||
4776 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); | ||
4777 | 4770 | ||
4778 | if (gpa == UNMAPPED_GVA) | 4771 | /* |
4779 | return true; /* let cpu generate fault */ | 4772 | * If the mapping is invalid in the guest, let the cpu
4773 | * retry it to generate a fault. | ||
4774 | */ | ||
4775 | if (gpa == UNMAPPED_GVA) | ||
4776 | return true; | ||
4777 | } | ||
4780 | 4778 | ||
4781 | /* | 4779 | /* |
4782 | * Do not retry the unhandleable instruction if it faults on the | 4780 | * Do not retry the unhandleable instruction if it faults on the |
@@ -4785,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
4785 | * instruction -> ... | 4783 | * instruction -> ... |
4786 | */ | 4784 | */ |
4787 | pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); | 4785 | pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); |
4788 | if (!is_error_noslot_pfn(pfn)) { | 4786 | |
4789 | kvm_release_pfn_clean(pfn); | 4787 | /* |
4788 | * If the instruction failed on the error pfn, it cannot | ||
4789 | * be fixed; report the error to userspace. | ||
4790 | */ | ||
4791 | if (is_error_noslot_pfn(pfn)) | ||
4792 | return false; | ||
4793 | |||
4794 | kvm_release_pfn_clean(pfn); | ||
4795 | |||
4796 | /* Instructions are well emulated on the direct mmu. */ | ||
4797 | if (vcpu->arch.mmu.direct_map) { | ||
4798 | unsigned int indirect_shadow_pages; | ||
4799 | |||
4800 | spin_lock(&vcpu->kvm->mmu_lock); | ||
4801 | indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; | ||
4802 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
4803 | |||
4804 | if (indirect_shadow_pages) | ||
4805 | kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); | ||
4806 | |||
4790 | return true; | 4807 | return true; |
4791 | } | 4808 | } |
4792 | 4809 | ||
4793 | return false; | 4810 | /* |
4811 | * if emulation was due to access to shadowed page table | ||
4812 | * and it failed try to unshadow page and re-enter the | ||
4813 | * guest to let CPU execute the instruction. | ||
4814 | */ | ||
4815 | kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); | ||
4816 | |||
4817 | /* | ||
4818 | * If the access faults on its page table, it cannot be | ||
4819 | * fixed by unprotecting the shadow page, and it should | ||
4820 | * be reported to userspace. | ||
4821 | */ | ||
4822 | return !write_fault_to_shadow_pgtable; | ||
4794 | } | 4823 | } |
4795 | 4824 | ||
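[Editor's note] A condensed restatement of the rewritten reexecute_instruction() logic, as an editorial comment block:

/*
 * cr2 unmapped in guest (shadow paging) -> retry; guest takes the fault
 * pfn is error/no-slot                  -> fail; report to userspace
 * direct map (tdp)                      -> unprotect indirect shadow
 *                                          pages, if any, then retry
 * shadow paging otherwise               -> unprotect the page; retry
 *                                          unless the write faulted on
 *                                          the guest's own page tables
 */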
4796 | static bool retry_instruction(struct x86_emulate_ctxt *ctxt, | 4825 | static bool retry_instruction(struct x86_emulate_ctxt *ctxt, |
@@ -4832,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, | |||
4832 | if (!vcpu->arch.mmu.direct_map) | 4861 | if (!vcpu->arch.mmu.direct_map) |
4833 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); | 4862 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); |
4834 | 4863 | ||
4835 | kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 4864 | kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); |
4836 | 4865 | ||
4837 | return true; | 4866 | return true; |
4838 | } | 4867 | } |
@@ -4849,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4849 | int r; | 4878 | int r; |
4850 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 4879 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4851 | bool writeback = true; | 4880 | bool writeback = true; |
4881 | bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; | ||
4852 | 4882 | ||
4883 | /* | ||
4884 | * Clear write_fault_to_shadow_pgtable here to ensure it is | ||
4885 | * never reused. | ||
4886 | */ | ||
4887 | vcpu->arch.write_fault_to_shadow_pgtable = false; | ||
4853 | kvm_clear_exception_queue(vcpu); | 4888 | kvm_clear_exception_queue(vcpu); |
4854 | 4889 | ||
4855 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4890 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
@@ -4868,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4868 | if (r != EMULATION_OK) { | 4903 | if (r != EMULATION_OK) { |
4869 | if (emulation_type & EMULTYPE_TRAP_UD) | 4904 | if (emulation_type & EMULTYPE_TRAP_UD) |
4870 | return EMULATE_FAIL; | 4905 | return EMULATE_FAIL; |
4871 | if (reexecute_instruction(vcpu, cr2)) | 4906 | if (reexecute_instruction(vcpu, cr2, |
4907 | write_fault_to_spt)) | ||
4872 | return EMULATE_DONE; | 4908 | return EMULATE_DONE; |
4873 | if (emulation_type & EMULTYPE_SKIP) | 4909 | if (emulation_type & EMULTYPE_SKIP) |
4874 | return EMULATE_FAIL; | 4910 | return EMULATE_FAIL; |
@@ -4898,7 +4934,7 @@ restart: | |||
4898 | return EMULATE_DONE; | 4934 | return EMULATE_DONE; |
4899 | 4935 | ||
4900 | if (r == EMULATION_FAILED) { | 4936 | if (r == EMULATION_FAILED) { |
4901 | if (reexecute_instruction(vcpu, cr2)) | 4937 | if (reexecute_instruction(vcpu, cr2, write_fault_to_spt)) |
4902 | return EMULATE_DONE; | 4938 | return EMULATE_DONE; |
4903 | 4939 | ||
4904 | return handle_emulation_failure(vcpu); | 4940 | return handle_emulation_failure(vcpu); |
@@ -5541,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) | |||
5541 | vcpu->arch.nmi_injected = true; | 5577 | vcpu->arch.nmi_injected = true; |
5542 | kvm_x86_ops->set_nmi(vcpu); | 5578 | kvm_x86_ops->set_nmi(vcpu); |
5543 | } | 5579 | } |
5544 | } else if (kvm_cpu_has_interrupt(vcpu)) { | 5580 | } else if (kvm_cpu_has_injectable_intr(vcpu)) { |
5545 | if (kvm_x86_ops->interrupt_allowed(vcpu)) { | 5581 | if (kvm_x86_ops->interrupt_allowed(vcpu)) { |
5546 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), | 5582 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), |
5547 | false); | 5583 | false); |
@@ -5609,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) | |||
5609 | #endif | 5645 | #endif |
5610 | } | 5646 | } |
5611 | 5647 | ||
5648 | static void update_eoi_exitmap(struct kvm_vcpu *vcpu) | ||
5649 | { | ||
5650 | u64 eoi_exit_bitmap[4]; | ||
5651 | |||
5652 | memset(eoi_exit_bitmap, 0, 32); | ||
5653 | |||
5654 | kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); | ||
5655 | kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); | ||
5656 | } | ||
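[Editor's note] The literal 32 in the memset above is sizeof(eoi_exit_bitmap): four u64 words, i.e. 256 bits, one per interrupt vector. The same fact as a compile-time check:

#include <stdint.h>

_Static_assert(sizeof(uint64_t[4]) == 32,
	       "EOI exit bitmap must cover all 256 vectors");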
5657 | |||
5612 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | 5658 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
5613 | { | 5659 | { |
5614 | int r; | 5660 | int r; |
@@ -5662,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5662 | kvm_handle_pmu_event(vcpu); | 5708 | kvm_handle_pmu_event(vcpu); |
5663 | if (kvm_check_request(KVM_REQ_PMI, vcpu)) | 5709 | if (kvm_check_request(KVM_REQ_PMI, vcpu)) |
5664 | kvm_deliver_pmi(vcpu); | 5710 | kvm_deliver_pmi(vcpu); |
5711 | if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) | ||
5712 | update_eoi_exitmap(vcpu); | ||
5665 | } | 5713 | } |
5666 | 5714 | ||
5667 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { | 5715 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { |
@@ -5670,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5670 | /* enable NMI/IRQ window open exits if needed */ | 5718 | /* enable NMI/IRQ window open exits if needed */ |
5671 | if (vcpu->arch.nmi_pending) | 5719 | if (vcpu->arch.nmi_pending) |
5672 | kvm_x86_ops->enable_nmi_window(vcpu); | 5720 | kvm_x86_ops->enable_nmi_window(vcpu); |
5673 | else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) | 5721 | else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) |
5674 | kvm_x86_ops->enable_irq_window(vcpu); | 5722 | kvm_x86_ops->enable_irq_window(vcpu); |
5675 | 5723 | ||
5676 | if (kvm_lapic_enabled(vcpu)) { | 5724 | if (kvm_lapic_enabled(vcpu)) { |
5725 | /* | ||
5726 | * Update architecture-specific hints for APIC | ||
5727 | * virtual interrupt delivery. | ||
5728 | */ | ||
5729 | if (kvm_x86_ops->hwapic_irr_update) | ||
5730 | kvm_x86_ops->hwapic_irr_update(vcpu, | ||
5731 | kvm_lapic_find_highest_irr(vcpu)); | ||
5677 | update_cr8_intercept(vcpu); | 5732 | update_cr8_intercept(vcpu); |
5678 | kvm_lapic_sync_to_vapic(vcpu); | 5733 | kvm_lapic_sync_to_vapic(vcpu); |
5679 | } | 5734 | } |
@@ -6853,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, | |||
6853 | struct kvm_memory_slot *memslot, | 6908 | struct kvm_memory_slot *memslot, |
6854 | struct kvm_memory_slot old, | 6909 | struct kvm_memory_slot old, |
6855 | struct kvm_userspace_memory_region *mem, | 6910 | struct kvm_userspace_memory_region *mem, |
6856 | int user_alloc) | 6911 | bool user_alloc) |
6857 | { | 6912 | { |
6858 | int npages = memslot->npages; | 6913 | int npages = memslot->npages; |
6859 | int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; | ||
6860 | 6914 | ||
6861 | /* Prevent internal slot pages from being moved by fork()/COW. */ | 6915 | /* |
6862 | if (memslot->id >= KVM_MEMORY_SLOTS) | 6916 | * Only private memory slots need to be mapped here since |
6863 | map_flags = MAP_SHARED | MAP_ANONYMOUS; | 6917 | * the KVM_SET_MEMORY_REGION ioctl is no longer supported.
6864 | |||
6865 | /*To keep backward compatibility with older userspace, | ||
6866 | *x86 needs to handle !user_alloc case. | ||
6867 | */ | 6918 | */ |
6868 | if (!user_alloc) { | 6919 | if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) { |
6869 | if (npages && !old.npages) { | 6920 | unsigned long userspace_addr; |
6870 | unsigned long userspace_addr; | ||
6871 | 6921 | ||
6872 | userspace_addr = vm_mmap(NULL, 0, | 6922 | /* |
6873 | npages * PAGE_SIZE, | 6923 | * MAP_SHARED to prevent internal slot pages from being moved |
6874 | PROT_READ | PROT_WRITE, | 6924 | * by fork()/COW. |
6875 | map_flags, | 6925 | */ |
6876 | 0); | 6926 | userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, |
6927 | PROT_READ | PROT_WRITE, | ||
6928 | MAP_SHARED | MAP_ANONYMOUS, 0); | ||
6877 | 6929 | ||
6878 | if (IS_ERR((void *)userspace_addr)) | 6930 | if (IS_ERR((void *)userspace_addr)) |
6879 | return PTR_ERR((void *)userspace_addr); | 6931 | return PTR_ERR((void *)userspace_addr); |
6880 | 6932 | ||
6881 | memslot->userspace_addr = userspace_addr; | 6933 | memslot->userspace_addr = userspace_addr; |
6882 | } | ||
6883 | } | 6934 | } |
6884 | 6935 | ||
6885 | |||
6886 | return 0; | 6936 | return 0; |
6887 | } | 6937 | } |
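[Editor's note] A userspace-flavoured sketch of the private-slot mapping above, assuming only POSIX mmap(): MAP_SHARED | MAP_ANONYMOUS keeps the backing pages stable across fork(), whereas MAP_PRIVATE anonymous pages could be duplicated by copy-on-write (function name hypothetical):

#include <stddef.h>
#include <sys/mman.h>

static void *map_internal_slot(size_t npages, size_t page_size)
{
	return mmap(NULL, npages * page_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
}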
6888 | 6938 | ||
6889 | void kvm_arch_commit_memory_region(struct kvm *kvm, | 6939 | void kvm_arch_commit_memory_region(struct kvm *kvm, |
6890 | struct kvm_userspace_memory_region *mem, | 6940 | struct kvm_userspace_memory_region *mem, |
6891 | struct kvm_memory_slot old, | 6941 | struct kvm_memory_slot old, |
6892 | int user_alloc) | 6942 | bool user_alloc) |
6893 | { | 6943 | { |
6894 | 6944 | ||
6895 | int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; | 6945 | int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; |
6896 | 6946 | ||
6897 | if (!user_alloc && !old.user_alloc && old.npages && !npages) { | 6947 | if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) { |
6898 | int ret; | 6948 | int ret; |
6899 | 6949 | ||
6900 | ret = vm_munmap(old.userspace_addr, | 6950 | ret = vm_munmap(old.userspace_addr, |
@@ -6908,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
6908 | if (!kvm->arch.n_requested_mmu_pages) | 6958 | if (!kvm->arch.n_requested_mmu_pages) |
6909 | nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | 6959 | nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); |
6910 | 6960 | ||
6911 | spin_lock(&kvm->mmu_lock); | ||
6912 | if (nr_mmu_pages) | 6961 | if (nr_mmu_pages) |
6913 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | 6962 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
6914 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 6963 | /* |
6915 | spin_unlock(&kvm->mmu_lock); | 6964 | * Write-protect all pages for dirty logging.
6965 | * Existing largepage mappings are destroyed here, and new | ||
6966 | * ones will not be created until the end of the logging. | ||
6967 | */ | ||
6968 | if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) | ||
6969 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | ||
6916 | /* | 6970 | /* |
6917 | * If memory slot is created, or moved, we need to clear all | 6971 | * If memory slot is created, or moved, we need to clear all |
6918 | * mmio sptes. | 6972 | * mmio sptes. |