 arch/x86/kvm/vmx.c | 440 ++++++++++++++++++++++++--------------------------
 1 file changed, 186 insertions(+), 254 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 28942823cc3a..6ef2a7b5ad99 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -112,6 +112,14 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+#define MSR_BITMAP_MODE_LM 4
+
 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
 
 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
@@ -186,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
 extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
 	u32 revision_id;
@@ -211,6 +218,7 @@ struct loaded_vmcs {
 	int soft_vnmi_blocked;
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
+	unsigned long *msr_bitmap;
 	struct list_head loaded_vmcss_on_cpu_link;
 };
 
@@ -227,7 +235,7 @@ struct shared_msr_entry {
  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
  * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
  * underlying hardware which will be used to run L2.
  * This structure is packed to ensure that its layout is identical across
  * machines (necessary for live migration).
@@ -410,13 +418,6 @@ struct __packed vmcs12 {
  */
 #define VMCS12_SIZE 0x1000
 
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
-	struct list_head list;
-	gpa_t vmptr;
-	struct loaded_vmcs vmcs02;
-};
-
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -441,15 +442,15 @@ struct nested_vmx {
 	 */
 	bool sync_shadow_vmcs;
 
-	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
-	struct list_head vmcs02_pool;
-	int vmcs02_num;
 	bool change_vmcs01_virtual_x2apic_mode;
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
+
+	struct loaded_vmcs vmcs02;
+
 	/*
-	 * Guest pages referred to in vmcs02 with host-physical pointers, so
-	 * we must keep them pinned while L2 runs.
+	 * Guest pages referred to in the vmcs02 with host-physical
+	 * pointers, so we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
 	struct page *virtual_apic_page;
@@ -458,8 +459,6 @@ struct nested_vmx {
 	bool pi_pending;
 	u16 posted_intr_nv;
 
-	unsigned long *msr_bitmap;
-
 	struct hrtimer preemption_timer;
 	bool preemption_timer_expired;
 
@@ -582,6 +581,7 @@ struct vcpu_vmx {
 	struct kvm_vcpu vcpu;
 	unsigned long host_rsp;
 	u8 fail;
+	u8 msr_bitmap_mode;
 	u32 exit_intr_info;
 	u32 idt_vectoring_info;
 	ulong rflags;
@@ -933,6 +933,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 					    u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -952,12 +953,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 enum {
 	VMX_IO_BITMAP_A,
 	VMX_IO_BITMAP_B,
-	VMX_MSR_BITMAP_LEGACY,
-	VMX_MSR_BITMAP_LONGMODE,
-	VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
-	VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
-	VMX_MSR_BITMAP_LEGACY_X2APIC,
-	VMX_MSR_BITMAP_LONGMODE_X2APIC,
 	VMX_VMREAD_BITMAP,
 	VMX_VMWRITE_BITMAP,
 	VMX_BITMAP_NR
@@ -967,12 +962,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
 
 #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
 #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
-#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
-#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
-#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
-#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
-#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
 
@@ -2570,36 +2559,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 	vmx->guest_msrs[from] = tmp;
 }
 
-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
-{
-	unsigned long *msr_bitmap;
-
-	if (is_guest_mode(vcpu))
-		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
-	else if (cpu_has_secondary_exec_ctrls() &&
-		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
-		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-		if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
-			if (is_long_mode(vcpu))
-				msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
-			else
-				msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
-		} else {
-			if (is_long_mode(vcpu))
-				msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-			else
-				msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
-		}
-	} else {
-		if (is_long_mode(vcpu))
-			msr_bitmap = vmx_msr_bitmap_longmode;
-		else
-			msr_bitmap = vmx_msr_bitmap_legacy;
-	}
-
-	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-}
-
 /*
  * Set up the vmcs to automatically save and restore system
  * msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -2640,7 +2599,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 	vmx->save_nmsrs = save_nmsrs;
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmx_set_msr_bitmap(&vmx->vcpu);
+		vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
 /*
@@ -3835,11 +3794,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 	return vmcs;
 }
 
-static struct vmcs *alloc_vmcs(void)
-{
-	return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
 static void free_vmcs(struct vmcs *vmcs)
 {
 	free_pages((unsigned long)vmcs, vmcs_config.order);
@@ -3855,9 +3809,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	loaded_vmcs_clear(loaded_vmcs);
 	free_vmcs(loaded_vmcs->vmcs);
 	loaded_vmcs->vmcs = NULL;
+	if (loaded_vmcs->msr_bitmap)
+		free_page((unsigned long)loaded_vmcs->msr_bitmap);
 	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 }
 
+static struct vmcs *alloc_vmcs(void)
+{
+	return alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+	loaded_vmcs->vmcs = alloc_vmcs();
+	if (!loaded_vmcs->vmcs)
+		return -ENOMEM;
+
+	loaded_vmcs->shadow_vmcs = NULL;
+	loaded_vmcs_init(loaded_vmcs);
+
+	if (cpu_has_vmx_msr_bitmap()) {
+		loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+		if (!loaded_vmcs->msr_bitmap)
+			goto out_vmcs;
+		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+	}
+	return 0;
+
+out_vmcs:
+	free_loaded_vmcs(loaded_vmcs);
+	return -ENOMEM;
+}
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -4916,10 +4899,8 @@ static void free_vpid(int vpid)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-					    u32 msr, int type)
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+							  u32 msr, int type)
 {
 	int f = sizeof(unsigned long);
 
@@ -4953,6 +4934,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
 	}
 }
 
+static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+							 u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__set_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__set_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__set_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__set_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+						      u32 msr, int type, bool value)
+{
+	if (value)
+		vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+	else
+		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+}
+
 /*
  * If a msr is allowed by L0, we should check whether it is allowed by L1.
  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -4999,30 +5024,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 	}
 }
 
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 {
-	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
-						msr, MSR_TYPE_R | MSR_TYPE_W);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
-					msr, MSR_TYPE_R | MSR_TYPE_W);
+	u8 mode = 0;
+
+	if (cpu_has_secondary_exec_ctrls() &&
+	    (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+		mode |= MSR_BITMAP_MODE_X2APIC;
+		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+	}
+
+	if (is_long_mode(vcpu))
+		mode |= MSR_BITMAP_MODE_LM;
+
+	return mode;
 }
 
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+					 u8 mode)
 {
-	if (apicv_active) {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
-						msr, type);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
-						msr, type);
-	} else {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-						msr, type);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-						msr, type);
+	int msr;
+
+	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+		unsigned word = msr / BITS_PER_LONG;
+		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+	}
+
+	if (mode & MSR_BITMAP_MODE_X2APIC) {
+		/*
+		 * TPR reads and writes can be virtualized even if virtual interrupt
+		 * delivery is not in use.
+		 */
+		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+		if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+		}
 	}
 }
 
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+	u8 mode = vmx_msr_bitmap_mode(vcpu);
+	u8 changed = mode ^ vmx->msr_bitmap_mode;
+
+	if (!changed)
+		return;
+
+	vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
+				  !(mode & MSR_BITMAP_MODE_LM));
+
+	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+
+	vmx->msr_bitmap_mode = mode;
+}
+
 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
 	return enable_apicv;
@@ -5272,7 +5337,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 	}
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmx_set_msr_bitmap(vcpu);
+		vmx_update_msr_bitmap(vcpu);
 }
 
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -5459,7 +5524,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
 	}
 	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
 
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
@@ -6742,7 +6807,7 @@ void vmx_enable_tdp(void)
 
 static __init int hardware_setup(void)
 {
-	int r = -ENOMEM, i, msr;
+	int r = -ENOMEM, i;
 
 	rdmsrl_safe(MSR_EFER, &host_efer);
 
@@ -6762,9 +6827,6 @@ static __init int hardware_setup(void)
 
 	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
-	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
-	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
 		goto out;
@@ -6833,42 +6895,8 @@ static __init int hardware_setup(void)
 		kvm_tsc_scaling_ratio_frac_bits = 48;
 	}
 
-	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
-	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
-	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
-	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
-	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
-	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-
-	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
-	       vmx_msr_bitmap_legacy, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
-	       vmx_msr_bitmap_longmode, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_legacy_x2apic,
-	       vmx_msr_bitmap_legacy, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_longmode_x2apic,
-	       vmx_msr_bitmap_longmode, PAGE_SIZE);
-
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-	for (msr = 0x800; msr <= 0x8ff; msr++) {
-		if (msr == 0x839 /* TMCCT */)
-			continue;
-		vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
-	}
-
-	/*
-	 * TPR reads and writes can be virtualized even if virtual interrupt
-	 * delivery is not in use.
-	 */
-	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
-	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
-
-	/* EOI */
-	vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
-	/* SELF-IPI */
-	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
-
 	if (enable_ept)
 		vmx_enable_tdp();
 	else
@@ -6972,94 +7000,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 }
 
 /*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
-	struct vmcs02_list *item;
-	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-		if (item->vmptr == vmx->nested.current_vmptr) {
-			list_move(&item->list, &vmx->nested.vmcs02_pool);
-			return &item->vmcs02;
-		}
-
-	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
-		/* Recycle the least recently used VMCS. */
-		item = list_last_entry(&vmx->nested.vmcs02_pool,
-				       struct vmcs02_list, list);
-		item->vmptr = vmx->nested.current_vmptr;
-		list_move(&item->list, &vmx->nested.vmcs02_pool);
-		return &item->vmcs02;
-	}
-
-	/* Create a new VMCS */
-	item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-	if (!item)
-		return NULL;
-	item->vmcs02.vmcs = alloc_vmcs();
-	item->vmcs02.shadow_vmcs = NULL;
-	if (!item->vmcs02.vmcs) {
-		kfree(item);
-		return NULL;
-	}
-	loaded_vmcs_init(&item->vmcs02);
-	item->vmptr = vmx->nested.current_vmptr;
-	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
-	vmx->nested.vmcs02_num++;
-	return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
-	struct vmcs02_list *item;
-	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-		if (item->vmptr == vmptr) {
-			free_loaded_vmcs(&item->vmcs02);
-			list_del(&item->list);
-			kfree(item);
-			vmx->nested.vmcs02_num--;
-			return;
-		}
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
-	struct vmcs02_list *item, *n;
-
-	WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
-	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-		/*
-		 * Something will leak if the above WARN triggers. Better than
-		 * a use-after-free.
-		 */
-		if (vmx->loaded_vmcs == &item->vmcs02)
-			continue;
-
-		free_loaded_vmcs(&item->vmcs02);
-		list_del(&item->list);
-		kfree(item);
-		vmx->nested.vmcs02_num--;
-	}
-}
-
-/*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
  * set the success or error code of an emulated VMX instruction, as specified
  * by Vol 2B, VMX Instruction Reference, "Conventions".
@@ -7239,13 +7179,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs *shadow_vmcs;
+	int r;
 
-	if (cpu_has_vmx_msr_bitmap()) {
-		vmx->nested.msr_bitmap =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-		if (!vmx->nested.msr_bitmap)
-			goto out_msr_bitmap;
-	}
+	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+	if (r < 0)
+		goto out_vmcs02;
 
 	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
 	if (!vmx->nested.cached_vmcs12)
@@ -7262,9 +7200,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
 	}
 
-	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
-	vmx->nested.vmcs02_num = 0;
-
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_REL_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7276,9 +7211,9 @@ out_shadow_vmcs:
 	kfree(vmx->nested.cached_vmcs12);
 
 out_cached_vmcs12:
-	free_page((unsigned long)vmx->nested.msr_bitmap);
+	free_loaded_vmcs(&vmx->nested.vmcs02);
 
-out_msr_bitmap:
+out_vmcs02:
 	return -ENOMEM;
 }
 
@@ -7421,10 +7356,6 @@ static void free_nested(struct vcpu_vmx *vmx)
 	free_vpid(vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
-	if (vmx->nested.msr_bitmap) {
-		free_page((unsigned long)vmx->nested.msr_bitmap);
-		vmx->nested.msr_bitmap = NULL;
-	}
 	if (enable_shadow_vmcs) {
 		vmx_disable_shadow_vmcs(vmx);
 		vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -7432,7 +7363,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->vmcs01.shadow_vmcs = NULL;
 	}
 	kfree(vmx->nested.cached_vmcs12);
-	/* Unpin physical memory we referred to in current vmcs02 */
+	/* Unpin physical memory we referred to in the vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
@@ -7448,7 +7379,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.pi_desc = NULL;
 	}
 
-	nested_free_all_saved_vmcss(vmx);
+	free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
 /* Emulate the VMXOFF instruction */
@@ -7491,8 +7422,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 			vmptr + offsetof(struct vmcs12, launch_state),
 			&zero, sizeof(zero));
 
-	nested_free_vmcs02(vmx, vmptr);
-
 	nested_vmx_succeed(vcpu);
 	return kvm_skip_emulated_instruction(vcpu);
 }
@@ -8404,10 +8333,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 
 	/*
 	 * The host physical addresses of some pages of guest memory
-	 * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
-	 * may write to these pages via their host physical address while
-	 * L2 is running, bypassing any address-translation-based dirty
-	 * tracking (e.g. EPT write protection).
+	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+	 * Page). The CPU may write to these pages via their host
+	 * physical address while L2 is running, bypassing any
+	 * address-translation-based dirty tracking (e.g. EPT write
+	 * protection).
 	 *
 	 * Mark them dirty on every exit from L2 to prevent them from
 	 * getting out of sync with dirty tracking.
@@ -8941,7 +8871,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	}
 	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
-	vmx_set_msr_bitmap(vcpu);
+	vmx_update_msr_bitmap(vcpu);
 }
 
 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@@ -9602,6 +9532,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
 	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	unsigned long *msr_bitmap;
 	int cpu;
 
 	if (!vmx)
@@ -9634,13 +9565,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx->guest_msrs)
 		goto free_pml;
 
-	vmx->loaded_vmcs = &vmx->vmcs01;
-	vmx->loaded_vmcs->vmcs = alloc_vmcs();
-	vmx->loaded_vmcs->shadow_vmcs = NULL;
-	if (!vmx->loaded_vmcs->vmcs)
+	err = alloc_loaded_vmcs(&vmx->vmcs01);
+	if (err < 0)
 		goto free_msrs;
-	loaded_vmcs_init(vmx->loaded_vmcs);
 
+	msr_bitmap = vmx->vmcs01.msr_bitmap;
+	vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+	vmx->msr_bitmap_mode = 0;
+
+	vmx->loaded_vmcs = &vmx->vmcs01;
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
@@ -10103,7 +10041,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 	int msr;
 	struct page *page;
 	unsigned long *msr_bitmap_l1;
-	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
 
 	/* This shortcut is ok because we support only x2APIC MSRs so far. */
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
@@ -10680,6 +10618,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (kvm_has_tsc_control)
 		decache_tsc_multiplier(vmx);
 
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
 	if (enable_vpid) {
 		/*
 		 * There is no direct mapping between vpid02 and vpid12, the
@@ -10901,20 +10842,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	struct loaded_vmcs *vmcs02;
 	u32 msr_entry_idx;
 	u32 exit_qual;
 
-	vmcs02 = nested_get_current_vmcs02(vmx);
-	if (!vmcs02)
-		return -ENOMEM;
-
 	enter_guest_mode(vcpu);
 
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 
-	vmx_switch_vmcs(vcpu, vmcs02);
+	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 	vmx_segment_cache_clear(vmx);
 
 	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11483,7 +11419,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmx_set_msr_bitmap(vcpu);
+		vmx_update_msr_bitmap(vcpu);
 
 	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
 				vmcs12->vm_exit_msr_load_count))
@@ -11532,10 +11468,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	vm_exit_controls_reset_shadow(vmx);
 	vmx_segment_cache_clear(vmx);
 
-	/* if no vmcs02 cache requested, remove the one we used */
-	if (VMCS02_POOL_SIZE == 0)
-		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
 	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);