aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c2784
1 files changed, 2615 insertions, 169 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea421..e65a158dee64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@
43#include "trace.h" 43#include "trace.h"
44 44
45#define __ex(x) __kvm_handle_fault_on_reboot(x) 45#define __ex(x) __kvm_handle_fault_on_reboot(x)
46#define __ex_clear(x, reg) \
47 ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
46 48
47MODULE_AUTHOR("Qumranet"); 49MODULE_AUTHOR("Qumranet");
48MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
49 51
50static int __read_mostly bypass_guest_pf = 1;
51module_param(bypass_guest_pf, bool, S_IRUGO);
52
53static int __read_mostly enable_vpid = 1; 52static int __read_mostly enable_vpid = 1;
54module_param_named(vpid, enable_vpid, bool, 0444); 53module_param_named(vpid, enable_vpid, bool, 0444);
55 54
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
72static int __read_mostly yield_on_hlt = 1; 71static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO); 72module_param(yield_on_hlt, bool, S_IRUGO);
74 73
74/*
75 * If nested=1, nested virtualization is supported, i.e., a guest may use
76 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
77 * use VMX instructions.
78 */
79static int __read_mostly nested = 0;
80module_param(nested, bool, S_IRUGO);
81
75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 82#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 83 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
77#define KVM_GUEST_CR0_MASK \ 84#define KVM_GUEST_CR0_MASK \
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
109module_param(ple_window, int, S_IRUGO); 116module_param(ple_window, int, S_IRUGO);
110 117
111#define NR_AUTOLOAD_MSRS 1 118#define NR_AUTOLOAD_MSRS 1
119#define VMCS02_POOL_SIZE 1
112 120
113struct vmcs { 121struct vmcs {
114 u32 revision_id; 122 u32 revision_id;
@@ -116,17 +124,237 @@ struct vmcs {
116 char data[0]; 124 char data[0];
117}; 125};
118 126
127/*
128 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
129 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
130 * loaded on this CPU (so we can clear them if the CPU goes down).
131 */
132struct loaded_vmcs {
133 struct vmcs *vmcs;
134 int cpu;
135 int launched;
136 struct list_head loaded_vmcss_on_cpu_link;
137};
138
119struct shared_msr_entry { 139struct shared_msr_entry {
120 unsigned index; 140 unsigned index;
121 u64 data; 141 u64 data;
122 u64 mask; 142 u64 mask;
123}; 143};
124 144
145/*
146 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
147 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
148 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
149 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
150 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
151 * More than one of these structures may exist, if L1 runs multiple L2 guests.
152 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
153 * underlying hardware which will be used to run L2.
154 * This structure is packed to ensure that its layout is identical across
155 * machines (necessary for live migration).
156 * If there are changes in this struct, VMCS12_REVISION must be changed.
157 */
158typedef u64 natural_width;
159struct __packed vmcs12 {
160 /* According to the Intel spec, a VMCS region must start with the
161 * following two fields. Then follow implementation-specific data.
162 */
163 u32 revision_id;
164 u32 abort;
165
166 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
167 u32 padding[7]; /* room for future expansion */
168
169 u64 io_bitmap_a;
170 u64 io_bitmap_b;
171 u64 msr_bitmap;
172 u64 vm_exit_msr_store_addr;
173 u64 vm_exit_msr_load_addr;
174 u64 vm_entry_msr_load_addr;
175 u64 tsc_offset;
176 u64 virtual_apic_page_addr;
177 u64 apic_access_addr;
178 u64 ept_pointer;
179 u64 guest_physical_address;
180 u64 vmcs_link_pointer;
181 u64 guest_ia32_debugctl;
182 u64 guest_ia32_pat;
183 u64 guest_ia32_efer;
184 u64 guest_ia32_perf_global_ctrl;
185 u64 guest_pdptr0;
186 u64 guest_pdptr1;
187 u64 guest_pdptr2;
188 u64 guest_pdptr3;
189 u64 host_ia32_pat;
190 u64 host_ia32_efer;
191 u64 host_ia32_perf_global_ctrl;
192 u64 padding64[8]; /* room for future expansion */
193 /*
194 * To allow migration of L1 (complete with its L2 guests) between
195 * machines of different natural widths (32 or 64 bit), we cannot have
196 * unsigned long fields with no explicit size. We use u64 (aliased
197 * natural_width) instead. Luckily, x86 is little-endian.
198 */
199 natural_width cr0_guest_host_mask;
200 natural_width cr4_guest_host_mask;
201 natural_width cr0_read_shadow;
202 natural_width cr4_read_shadow;
203 natural_width cr3_target_value0;
204 natural_width cr3_target_value1;
205 natural_width cr3_target_value2;
206 natural_width cr3_target_value3;
207 natural_width exit_qualification;
208 natural_width guest_linear_address;
209 natural_width guest_cr0;
210 natural_width guest_cr3;
211 natural_width guest_cr4;
212 natural_width guest_es_base;
213 natural_width guest_cs_base;
214 natural_width guest_ss_base;
215 natural_width guest_ds_base;
216 natural_width guest_fs_base;
217 natural_width guest_gs_base;
218 natural_width guest_ldtr_base;
219 natural_width guest_tr_base;
220 natural_width guest_gdtr_base;
221 natural_width guest_idtr_base;
222 natural_width guest_dr7;
223 natural_width guest_rsp;
224 natural_width guest_rip;
225 natural_width guest_rflags;
226 natural_width guest_pending_dbg_exceptions;
227 natural_width guest_sysenter_esp;
228 natural_width guest_sysenter_eip;
229 natural_width host_cr0;
230 natural_width host_cr3;
231 natural_width host_cr4;
232 natural_width host_fs_base;
233 natural_width host_gs_base;
234 natural_width host_tr_base;
235 natural_width host_gdtr_base;
236 natural_width host_idtr_base;
237 natural_width host_ia32_sysenter_esp;
238 natural_width host_ia32_sysenter_eip;
239 natural_width host_rsp;
240 natural_width host_rip;
241 natural_width paddingl[8]; /* room for future expansion */
242 u32 pin_based_vm_exec_control;
243 u32 cpu_based_vm_exec_control;
244 u32 exception_bitmap;
245 u32 page_fault_error_code_mask;
246 u32 page_fault_error_code_match;
247 u32 cr3_target_count;
248 u32 vm_exit_controls;
249 u32 vm_exit_msr_store_count;
250 u32 vm_exit_msr_load_count;
251 u32 vm_entry_controls;
252 u32 vm_entry_msr_load_count;
253 u32 vm_entry_intr_info_field;
254 u32 vm_entry_exception_error_code;
255 u32 vm_entry_instruction_len;
256 u32 tpr_threshold;
257 u32 secondary_vm_exec_control;
258 u32 vm_instruction_error;
259 u32 vm_exit_reason;
260 u32 vm_exit_intr_info;
261 u32 vm_exit_intr_error_code;
262 u32 idt_vectoring_info_field;
263 u32 idt_vectoring_error_code;
264 u32 vm_exit_instruction_len;
265 u32 vmx_instruction_info;
266 u32 guest_es_limit;
267 u32 guest_cs_limit;
268 u32 guest_ss_limit;
269 u32 guest_ds_limit;
270 u32 guest_fs_limit;
271 u32 guest_gs_limit;
272 u32 guest_ldtr_limit;
273 u32 guest_tr_limit;
274 u32 guest_gdtr_limit;
275 u32 guest_idtr_limit;
276 u32 guest_es_ar_bytes;
277 u32 guest_cs_ar_bytes;
278 u32 guest_ss_ar_bytes;
279 u32 guest_ds_ar_bytes;
280 u32 guest_fs_ar_bytes;
281 u32 guest_gs_ar_bytes;
282 u32 guest_ldtr_ar_bytes;
283 u32 guest_tr_ar_bytes;
284 u32 guest_interruptibility_info;
285 u32 guest_activity_state;
286 u32 guest_sysenter_cs;
287 u32 host_ia32_sysenter_cs;
288 u32 padding32[8]; /* room for future expansion */
289 u16 virtual_processor_id;
290 u16 guest_es_selector;
291 u16 guest_cs_selector;
292 u16 guest_ss_selector;
293 u16 guest_ds_selector;
294 u16 guest_fs_selector;
295 u16 guest_gs_selector;
296 u16 guest_ldtr_selector;
297 u16 guest_tr_selector;
298 u16 host_es_selector;
299 u16 host_cs_selector;
300 u16 host_ss_selector;
301 u16 host_ds_selector;
302 u16 host_fs_selector;
303 u16 host_gs_selector;
304 u16 host_tr_selector;
305};
306
307/*
308 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
309 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
310 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
311 */
312#define VMCS12_REVISION 0x11e57ed0
313
314/*
315 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
316 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by the
317 * current implementation, 4K are reserved to avoid future complications.
318 */
319#define VMCS12_SIZE 0x1000
320
321/* Used to remember the last vmcs02 used for some recently used vmcs12s */
322struct vmcs02_list {
323 struct list_head list;
324 gpa_t vmptr;
325 struct loaded_vmcs vmcs02;
326};
327
328/*
329 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
330 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
331 */
332struct nested_vmx {
333 /* Has the level1 guest done vmxon? */
334 bool vmxon;
335
336 /* The guest-physical address of the current VMCS L1 keeps for L2 */
337 gpa_t current_vmptr;
338 /* The host-usable pointer to the above */
339 struct page *current_vmcs12_page;
340 struct vmcs12 *current_vmcs12;
341
342 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
343 struct list_head vmcs02_pool;
344 int vmcs02_num;
345 u64 vmcs01_tsc_offset;
346 /* L2 must run next, and mustn't decide to exit to L1. */
347 bool nested_run_pending;
348 /*
349 * Guest pages referred to in vmcs02 with host-physical pointers, so
350 * we must keep them pinned while L2 runs.
351 */
352 struct page *apic_access_page;
353};
354
125struct vcpu_vmx { 355struct vcpu_vmx {
126 struct kvm_vcpu vcpu; 356 struct kvm_vcpu vcpu;
127 struct list_head local_vcpus_link;
128 unsigned long host_rsp; 357 unsigned long host_rsp;
129 int launched;
130 u8 fail; 358 u8 fail;
131 u8 cpl; 359 u8 cpl;
132 bool nmi_known_unmasked; 360 bool nmi_known_unmasked;
@@ -140,7 +368,14 @@ struct vcpu_vmx {
140 u64 msr_host_kernel_gs_base; 368 u64 msr_host_kernel_gs_base;
141 u64 msr_guest_kernel_gs_base; 369 u64 msr_guest_kernel_gs_base;
142#endif 370#endif
143 struct vmcs *vmcs; 371 /*
372 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
373 * non-nested (L1) guest, it always points to vmcs01. For a nested
374 * guest (L2), it points to a different VMCS.
375 */
376 struct loaded_vmcs vmcs01;
377 struct loaded_vmcs *loaded_vmcs;
378 bool __launched; /* temporary, used in vmx_vcpu_run */
144 struct msr_autoload { 379 struct msr_autoload {
145 unsigned nr; 380 unsigned nr;
146 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; 381 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +411,9 @@ struct vcpu_vmx {
176 u32 exit_reason; 411 u32 exit_reason;
177 412
178 bool rdtscp_enabled; 413 bool rdtscp_enabled;
414
415 /* Support for a guest hypervisor (nested VMX) */
416 struct nested_vmx nested;
179}; 417};
180 418
181enum segment_cache_field { 419enum segment_cache_field {
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
192 return container_of(vcpu, struct vcpu_vmx, vcpu); 430 return container_of(vcpu, struct vcpu_vmx, vcpu);
193} 431}
194 432
433#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
434#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
435#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
436 [number##_HIGH] = VMCS12_OFFSET(name)+4
437
438static unsigned short vmcs_field_to_offset_table[] = {
439 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
440 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
441 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
442 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
443 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
444 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
445 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
446 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
447 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
448 FIELD(HOST_ES_SELECTOR, host_es_selector),
449 FIELD(HOST_CS_SELECTOR, host_cs_selector),
450 FIELD(HOST_SS_SELECTOR, host_ss_selector),
451 FIELD(HOST_DS_SELECTOR, host_ds_selector),
452 FIELD(HOST_FS_SELECTOR, host_fs_selector),
453 FIELD(HOST_GS_SELECTOR, host_gs_selector),
454 FIELD(HOST_TR_SELECTOR, host_tr_selector),
455 FIELD64(IO_BITMAP_A, io_bitmap_a),
456 FIELD64(IO_BITMAP_B, io_bitmap_b),
457 FIELD64(MSR_BITMAP, msr_bitmap),
458 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
459 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
460 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
461 FIELD64(TSC_OFFSET, tsc_offset),
462 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
463 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
464 FIELD64(EPT_POINTER, ept_pointer),
465 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
466 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
467 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
468 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
469 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
470 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
471 FIELD64(GUEST_PDPTR0, guest_pdptr0),
472 FIELD64(GUEST_PDPTR1, guest_pdptr1),
473 FIELD64(GUEST_PDPTR2, guest_pdptr2),
474 FIELD64(GUEST_PDPTR3, guest_pdptr3),
475 FIELD64(HOST_IA32_PAT, host_ia32_pat),
476 FIELD64(HOST_IA32_EFER, host_ia32_efer),
477 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
478 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
479 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
480 FIELD(EXCEPTION_BITMAP, exception_bitmap),
481 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
482 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
483 FIELD(CR3_TARGET_COUNT, cr3_target_count),
484 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
485 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
486 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
487 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
488 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
489 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
490 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
491 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
492 FIELD(TPR_THRESHOLD, tpr_threshold),
493 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
494 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
495 FIELD(VM_EXIT_REASON, vm_exit_reason),
496 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
497 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
498 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
499 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
500 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
501 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
502 FIELD(GUEST_ES_LIMIT, guest_es_limit),
503 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
504 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
505 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
506 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
507 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
508 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
509 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
510 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
511 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
512 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
513 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
514 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
515 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
516 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
517 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
518 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
519 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
520 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
521 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
522 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
523 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
524 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
525 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
526 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
527 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
528 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
529 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
530 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
531 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
532 FIELD(EXIT_QUALIFICATION, exit_qualification),
533 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
534 FIELD(GUEST_CR0, guest_cr0),
535 FIELD(GUEST_CR3, guest_cr3),
536 FIELD(GUEST_CR4, guest_cr4),
537 FIELD(GUEST_ES_BASE, guest_es_base),
538 FIELD(GUEST_CS_BASE, guest_cs_base),
539 FIELD(GUEST_SS_BASE, guest_ss_base),
540 FIELD(GUEST_DS_BASE, guest_ds_base),
541 FIELD(GUEST_FS_BASE, guest_fs_base),
542 FIELD(GUEST_GS_BASE, guest_gs_base),
543 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
544 FIELD(GUEST_TR_BASE, guest_tr_base),
545 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
546 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
547 FIELD(GUEST_DR7, guest_dr7),
548 FIELD(GUEST_RSP, guest_rsp),
549 FIELD(GUEST_RIP, guest_rip),
550 FIELD(GUEST_RFLAGS, guest_rflags),
551 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
552 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
553 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
554 FIELD(HOST_CR0, host_cr0),
555 FIELD(HOST_CR3, host_cr3),
556 FIELD(HOST_CR4, host_cr4),
557 FIELD(HOST_FS_BASE, host_fs_base),
558 FIELD(HOST_GS_BASE, host_gs_base),
559 FIELD(HOST_TR_BASE, host_tr_base),
560 FIELD(HOST_GDTR_BASE, host_gdtr_base),
561 FIELD(HOST_IDTR_BASE, host_idtr_base),
562 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
563 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
564 FIELD(HOST_RSP, host_rsp),
565 FIELD(HOST_RIP, host_rip),
566};
567static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
568
569static inline short vmcs_field_to_offset(unsigned long field)
570{
571 if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
572 return -1;
573 return vmcs_field_to_offset_table[field];
574}
575
576static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
577{
578 return to_vmx(vcpu)->nested.current_vmcs12;
579}
580
581static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
582{
583 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
584 if (is_error_page(page)) {
585 kvm_release_page_clean(page);
586 return NULL;
587 }
588 return page;
589}
590
/* Release 'page' through kvm_release_page_dirty(). */
static void nested_release_page(struct page *page)
{
	kvm_release_page_dirty(page);
}
595
/* Release 'page' through kvm_release_page_clean(). */
static void nested_release_page_clean(struct page *page)
{
	kvm_release_page_clean(page);
}
600
195static u64 construct_eptp(unsigned long root_hpa); 601static u64 construct_eptp(unsigned long root_hpa);
196static void kvm_cpu_vmxon(u64 addr); 602static void kvm_cpu_vmxon(u64 addr);
197static void kvm_cpu_vmxoff(void); 603static void kvm_cpu_vmxoff(void);
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
200 606
201static DEFINE_PER_CPU(struct vmcs *, vmxarea); 607static DEFINE_PER_CPU(struct vmcs *, vmxarea);
202static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 608static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
203static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 609/*
610 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
611 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
612 */
613static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
204static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 614static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
205 615
206static unsigned long *vmx_io_bitmap_a; 616static unsigned long *vmx_io_bitmap_a;
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void)
442 return flexpriority_enabled; 852 return flexpriority_enabled;
443} 853}
444 854
855static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
856{
857 return vmcs12->cpu_based_vm_exec_control & bit;
858}
859
860static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
861{
862 return (vmcs12->cpu_based_vm_exec_control &
863 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
864 (vmcs12->secondary_vm_exec_control & bit);
865}
866
867static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
868 struct kvm_vcpu *vcpu)
869{
870 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
871}
872
873static inline bool is_exception(u32 intr_info)
874{
875 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
876 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
877}
878
879static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
880static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
881 struct vmcs12 *vmcs12,
882 u32 reason, unsigned long qualification);
883
445static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 884static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
446{ 885{
447 int i; 886 int i;
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
501 vmcs, phys_addr); 940 vmcs, phys_addr);
502} 941}
503 942
943static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
944{
945 vmcs_clear(loaded_vmcs->vmcs);
946 loaded_vmcs->cpu = -1;
947 loaded_vmcs->launched = 0;
948}
949
504static void vmcs_load(struct vmcs *vmcs) 950static void vmcs_load(struct vmcs *vmcs)
505{ 951{
506 u64 phys_addr = __pa(vmcs); 952 u64 phys_addr = __pa(vmcs);
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
510 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 956 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
511 : "cc", "memory"); 957 : "cc", "memory");
512 if (error) 958 if (error)
513 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 959 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
514 vmcs, phys_addr); 960 vmcs, phys_addr);
515} 961}
516 962
517static void __vcpu_clear(void *arg) 963static void __loaded_vmcs_clear(void *arg)
518{ 964{
519 struct vcpu_vmx *vmx = arg; 965 struct loaded_vmcs *loaded_vmcs = arg;
520 int cpu = raw_smp_processor_id(); 966 int cpu = raw_smp_processor_id();
521 967
522 if (vmx->vcpu.cpu == cpu) 968 if (loaded_vmcs->cpu != cpu)
523 vmcs_clear(vmx->vmcs); 969 return; /* vcpu migration can race with cpu offline */
524 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 970 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
525 per_cpu(current_vmcs, cpu) = NULL; 971 per_cpu(current_vmcs, cpu) = NULL;
526 list_del(&vmx->local_vcpus_link); 972 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
527 vmx->vcpu.cpu = -1; 973 loaded_vmcs_init(loaded_vmcs);
528 vmx->launched = 0;
529} 974}
530 975
531static void vcpu_clear(struct vcpu_vmx *vmx) 976static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
532{ 977{
533 if (vmx->vcpu.cpu == -1) 978 if (loaded_vmcs->cpu != -1)
534 return; 979 smp_call_function_single(
535 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 980 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
536} 981}
537 982
538static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 983static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
585 } 1030 }
586} 1031}
587 1032
588static unsigned long vmcs_readl(unsigned long field) 1033static __always_inline unsigned long vmcs_readl(unsigned long field)
589{ 1034{
590 unsigned long value = 0; 1035 unsigned long value;
591 1036
592 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 1037 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
593 : "+a"(value) : "d"(field) : "cc"); 1038 : "=a"(value) : "d"(field) : "cc");
594 return value; 1039 return value;
595} 1040}
596 1041
597static u16 vmcs_read16(unsigned long field) 1042static __always_inline u16 vmcs_read16(unsigned long field)
598{ 1043{
599 return vmcs_readl(field); 1044 return vmcs_readl(field);
600} 1045}
601 1046
602static u32 vmcs_read32(unsigned long field) 1047static __always_inline u32 vmcs_read32(unsigned long field)
603{ 1048{
604 return vmcs_readl(field); 1049 return vmcs_readl(field);
605} 1050}
606 1051
607static u64 vmcs_read64(unsigned long field) 1052static __always_inline u64 vmcs_read64(unsigned long field)
608{ 1053{
609#ifdef CONFIG_X86_64 1054#ifdef CONFIG_X86_64
610 return vmcs_readl(field); 1055 return vmcs_readl(field);
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
731 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1176 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
732 if (vcpu->fpu_active) 1177 if (vcpu->fpu_active)
733 eb &= ~(1u << NM_VECTOR); 1178 eb &= ~(1u << NM_VECTOR);
1179
1180 /* When we are running a nested L2 guest and L1 specified for it a
1181 * certain exception bitmap, we must trap the same exceptions and pass
1182 * them to L1. When running L2, we will only handle the exceptions
1183 * specified above if L1 did not want them.
1184 */
1185 if (is_guest_mode(vcpu))
1186 eb |= get_vmcs12(vcpu)->exception_bitmap;
1187
734 vmcs_write32(EXCEPTION_BITMAP, eb); 1188 vmcs_write32(EXCEPTION_BITMAP, eb);
735} 1189}
736 1190
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
971 1425
972 if (!vmm_exclusive) 1426 if (!vmm_exclusive)
973 kvm_cpu_vmxon(phys_addr); 1427 kvm_cpu_vmxon(phys_addr);
974 else if (vcpu->cpu != cpu) 1428 else if (vmx->loaded_vmcs->cpu != cpu)
975 vcpu_clear(vmx); 1429 loaded_vmcs_clear(vmx->loaded_vmcs);
976 1430
977 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 1431 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
978 per_cpu(current_vmcs, cpu) = vmx->vmcs; 1432 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
979 vmcs_load(vmx->vmcs); 1433 vmcs_load(vmx->loaded_vmcs->vmcs);
980 } 1434 }
981 1435
982 if (vcpu->cpu != cpu) { 1436 if (vmx->loaded_vmcs->cpu != cpu) {
983 struct desc_ptr *gdt = &__get_cpu_var(host_gdt); 1437 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
984 unsigned long sysenter_esp; 1438 unsigned long sysenter_esp;
985 1439
986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1440 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
987 local_irq_disable(); 1441 local_irq_disable();
988 list_add(&vmx->local_vcpus_link, 1442 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
989 &per_cpu(vcpus_on_cpu, cpu)); 1443 &per_cpu(loaded_vmcss_on_cpu, cpu));
990 local_irq_enable(); 1444 local_irq_enable();
991 1445
992 /* 1446 /*
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
998 1452
999 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1453 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1000 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1454 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1455 vmx->loaded_vmcs->cpu = cpu;
1001 } 1456 }
1002} 1457}
1003 1458
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1005{ 1460{
1006 __vmx_load_host_state(to_vmx(vcpu)); 1461 __vmx_load_host_state(to_vmx(vcpu));
1007 if (!vmm_exclusive) { 1462 if (!vmm_exclusive) {
1008 __vcpu_clear(to_vmx(vcpu)); 1463 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1464 vcpu->cpu = -1;
1009 kvm_cpu_vmxoff(); 1465 kvm_cpu_vmxoff();
1010 } 1466 }
1011} 1467}
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1023 vmcs_writel(GUEST_CR0, cr0); 1479 vmcs_writel(GUEST_CR0, cr0);
1024 update_exception_bitmap(vcpu); 1480 update_exception_bitmap(vcpu);
1025 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 1481 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1482 if (is_guest_mode(vcpu))
1483 vcpu->arch.cr0_guest_owned_bits &=
1484 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1026 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1485 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1027} 1486}
1028 1487
1029static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1488static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1030 1489
1490/*
1491 * Return the cr0 value that a nested guest would read. This is a combination
1492 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1493 * its hypervisor (cr0_read_shadow).
1494 */
1495static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1496{
1497 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1498 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1499}
1500static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1501{
1502 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1503 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1504}
1505
1031static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 1506static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1032{ 1507{
1508 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1509 * set this *before* calling this function.
1510 */
1033 vmx_decache_cr0_guest_bits(vcpu); 1511 vmx_decache_cr0_guest_bits(vcpu);
1034 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 1512 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1035 update_exception_bitmap(vcpu); 1513 update_exception_bitmap(vcpu);
1036 vcpu->arch.cr0_guest_owned_bits = 0; 1514 vcpu->arch.cr0_guest_owned_bits = 0;
1037 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1515 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1038 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 1516 if (is_guest_mode(vcpu)) {
1517 /*
1518 * L1's specified read shadow might not contain the TS bit,
1519 * so now that we turned on shadowing of this bit, we need to
1520 * set this bit of the shadow. Like in nested_vmx_run we need
1521 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1522 * up-to-date here because we just decached cr0.TS (and we'll
1523 * only update vmcs12->guest_cr0 on nested exit).
1524 */
1525 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1526 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1527 (vcpu->arch.cr0 & X86_CR0_TS);
1528 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1529 } else
1530 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1039} 1531}
1040 1532
1041static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1533static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1119 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1611 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1120} 1612}
1121 1613
1614/*
1615 * KVM wants to inject page-faults which it got to the guest. This function
1616 * checks whether in a nested guest, we need to inject them to L1 or L2.
1617 * This function assumes it is called with the exit reason in vmcs02 being
1618 * a #PF exception (this is the only case in which KVM injects a #PF when L2
1619 * is running).
1620 */
1621static int nested_pf_handled(struct kvm_vcpu *vcpu)
1622{
1623 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1624
1625 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1626 if (!(vmcs12->exception_bitmap & PF_VECTOR))
1627 return 0;
1628
1629 nested_vmx_vmexit(vcpu);
1630 return 1;
1631}
1632
1122static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1633static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1123 bool has_error_code, u32 error_code, 1634 bool has_error_code, u32 error_code,
1124 bool reinject) 1635 bool reinject)
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1126 struct vcpu_vmx *vmx = to_vmx(vcpu); 1637 struct vcpu_vmx *vmx = to_vmx(vcpu);
1127 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1638 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1128 1639
1640 if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1641 nested_pf_handled(vcpu))
1642 return;
1643
1129 if (has_error_code) { 1644 if (has_error_code) {
1130 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 1645 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1131 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1646 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1763static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1249{ 1764{
1250 vmcs_write64(TSC_OFFSET, offset); 1765 vmcs_write64(TSC_OFFSET, offset);
1766 if (is_guest_mode(vcpu))
1767 /*
1768 * We're here if L1 chose not to trap the TSC MSR. Since
1769 * prepare_vmcs12() does not copy tsc_offset, we need to also
1770 * set the vmcs12 field here.
1771 */
1772 get_vmcs12(vcpu)->tsc_offset = offset -
1773 to_vmx(vcpu)->nested.vmcs01_tsc_offset;
1251} 1774}
1252 1775
1253static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1776static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1254{ 1777{
1255 u64 offset = vmcs_read64(TSC_OFFSET); 1778 u64 offset = vmcs_read64(TSC_OFFSET);
1256 vmcs_write64(TSC_OFFSET, offset + adjustment); 1779 vmcs_write64(TSC_OFFSET, offset + adjustment);
1780 if (is_guest_mode(vcpu)) {
1781 /* Even when running L2, the adjustment needs to apply to L1 */
1782 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
1783 }
1257} 1784}
1258 1785
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 1786static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1261 return target_tsc - native_read_tsc(); 1788 return target_tsc - native_read_tsc();
1262} 1789}
1263 1790
1791static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
1792{
1793 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
1794 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
1795}
1796
1797/*
1798 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1799 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1800 * all guests if the "nested" module option is off, and can also be disabled
1801 * for a single guest by disabling its VMX cpuid bit.
1802 */
1803static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1804{
1805 return nested && guest_cpuid_has_vmx(vcpu);
1806}
1807
1808/*
1809 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
1810 * returned for the various VMX controls MSRs when nested VMX is enabled.
1811 * The same values should also be used to verify that vmcs12 control fields are
1812 * valid during nested entry from L1 to L2.
1813 * Each of these control msrs has a low and high 32-bit half: A low bit is on
1814 * if the corresponding bit in the (32-bit) control field *must* be on, and a
1815 * bit in the high half is on if the corresponding bit in the control field
1816 * may be on. See also vmx_control_verify().
1817 * TODO: allow these variables to be modified (downgraded) by module options
1818 * or other means.
1819 */
1820static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
1821static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
1822static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
1823static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
1824static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
1825static __init void nested_vmx_setup_ctls_msrs(void)
1826{
1827 /*
1828 * Note that as a general rule, the high half of the MSRs (bits in
1829 * the control fields which may be 1) should be initialized by the
1830 * intersection of the underlying hardware's MSR (i.e., features which
1831 * can be supported) and the list of features we want to expose -
1832 * because they are known to be properly supported in our code.
1833 * Also, usually, the low half of the MSRs (bits which must be 1) can
1834 * be set to 0, meaning that L1 may turn off any of these bits. The
1835 * reason is that if one of these bits is necessary, it will appear
1836 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
1837 * fields of vmcs01 and vmcs02, will turn these bits off - and
1838 * nested_vmx_exit_handled() will not pass related exits to L1.
1839 * These rules have exceptions below.
1840 */
1841
1842 /* pin-based controls */
1843 /*
1844 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
1845 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
1846 */
1847 nested_vmx_pinbased_ctls_low = 0x16 ;
1848 nested_vmx_pinbased_ctls_high = 0x16 |
1849 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
1850 PIN_BASED_VIRTUAL_NMIS;
1851
1852 /* exit controls */
1853 nested_vmx_exit_ctls_low = 0;
1854 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
1855#ifdef CONFIG_X86_64
1856 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
1857#else
1858 nested_vmx_exit_ctls_high = 0;
1859#endif
1860
1861 /* entry controls */
1862 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
1863 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
1864 nested_vmx_entry_ctls_low = 0;
1865 nested_vmx_entry_ctls_high &=
1866 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
1867
1868 /* cpu-based controls */
1869 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
1870 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
1871 nested_vmx_procbased_ctls_low = 0;
1872 nested_vmx_procbased_ctls_high &=
1873 CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
1874 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
1875 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
1876 CPU_BASED_CR3_STORE_EXITING |
1877#ifdef CONFIG_X86_64
1878 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
1879#endif
1880 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1881 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1882 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1883 /*
1884 * We can allow some features even when not supported by the
1885 * hardware. For example, L1 can specify an MSR bitmap - and we
1886 * can use it to avoid exits to L1 - even when L0 runs L2
1887 * without MSR bitmaps.
1888 */
1889 nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
1890
1891 /* secondary cpu-based controls */
1892 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
1893 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
1894 nested_vmx_secondary_ctls_low = 0;
1895 nested_vmx_secondary_ctls_high &=
1896 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1897}
1898
/*
 * Check a 32-bit control field against the two halves of its control MSR:
 * every bit that is 0 in 'high' must be 0 in 'control', and every bit that
 * is 1 in 'low' must be 1 in 'control'. Returns true when both hold.
 */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	/* Drop any bit that is not allowed to be 1 ... */
	u32 allowed_only = control & high;
	/* ... and force in every bit that is required to be 1. */
	u32 with_required = allowed_only | low;

	/* A valid control value is unchanged by both adjustments. */
	return with_required == control;
}
1906
/*
 * Combine the two 32-bit halves of a VMX control MSR into its 64-bit value:
 * 'low' occupies bits 31:0 and 'high' occupies bits 63:32.
 */
static inline u64 vmx_control_msr(u32 low, u32 high)
{
	u64 msr = high;

	msr <<= 32;
	msr |= low;
	return msr;
}
1911
1912/*
1913 * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
1914 * also let it use VMX-specific MSRs.
1915 * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
1916 * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
1917 * like all other MSRs).
1918 */
1919static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1920{
1921 if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
1922 msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
1923 /*
1924 * According to the spec, processors which do not support VMX
1925 * should throw a #GP(0) when VMX capability MSRs are read.
1926 */
1927 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
1928 return 1;
1929 }
1930
1931 switch (msr_index) {
1932 case MSR_IA32_FEATURE_CONTROL:
1933 *pdata = 0;
1934 break;
1935 case MSR_IA32_VMX_BASIC:
1936 /*
1937 * This MSR reports some information about VMX support. We
1938 * should return information about the VMX we emulate for the
1939 * guest, and the VMCS structure we give it - not about the
1940 * VMX support of the underlying hardware.
1941 */
1942 *pdata = VMCS12_REVISION |
1943 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
1944 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
1945 break;
1946 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1947 case MSR_IA32_VMX_PINBASED_CTLS:
1948 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
1949 nested_vmx_pinbased_ctls_high);
1950 break;
1951 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1952 case MSR_IA32_VMX_PROCBASED_CTLS:
1953 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
1954 nested_vmx_procbased_ctls_high);
1955 break;
1956 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1957 case MSR_IA32_VMX_EXIT_CTLS:
1958 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
1959 nested_vmx_exit_ctls_high);
1960 break;
1961 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1962 case MSR_IA32_VMX_ENTRY_CTLS:
1963 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
1964 nested_vmx_entry_ctls_high);
1965 break;
1966 case MSR_IA32_VMX_MISC:
1967 *pdata = 0;
1968 break;
1969 /*
1970 * These MSRs specify bits which the guest must keep fixed (on or off)
1971 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
1972 * We picked the standard core2 setting.
1973 */
1974#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
1975#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
1976 case MSR_IA32_VMX_CR0_FIXED0:
1977 *pdata = VMXON_CR0_ALWAYSON;
1978 break;
1979 case MSR_IA32_VMX_CR0_FIXED1:
1980 *pdata = -1ULL;
1981 break;
1982 case MSR_IA32_VMX_CR4_FIXED0:
1983 *pdata = VMXON_CR4_ALWAYSON;
1984 break;
1985 case MSR_IA32_VMX_CR4_FIXED1:
1986 *pdata = -1ULL;
1987 break;
1988 case MSR_IA32_VMX_VMCS_ENUM:
1989 *pdata = 0x1f;
1990 break;
1991 case MSR_IA32_VMX_PROCBASED_CTLS2:
1992 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
1993 nested_vmx_secondary_ctls_high);
1994 break;
1995 case MSR_IA32_VMX_EPT_VPID_CAP:
1996 /* Currently, no nested ept or nested vpid */
1997 *pdata = 0;
1998 break;
1999 default:
2000 return 0;
2001 }
2002
2003 return 1;
2004}
2005
2006static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2007{
2008 if (!nested_vmx_allowed(vcpu))
2009 return 0;
2010
2011 if (msr_index == MSR_IA32_FEATURE_CONTROL)
2012 /* TODO: the right thing. */
2013 return 1;
2014 /*
2015 * No need to treat VMX capability MSRs specially: If we don't handle
2016 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
2017 */
2018 return 0;
2019}
2020
1264/* 2021/*
1265 * Reads an msr value (of 'msr_index') into 'pdata'. 2022 * Reads an msr value (of 'msr_index') into 'pdata'.
1266 * Returns 0 on success, non-0 otherwise. 2023 * Returns 0 on success, non-0 otherwise.
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1309 /* Otherwise falls through */ 2066 /* Otherwise falls through */
1310 default: 2067 default:
1311 vmx_load_host_state(to_vmx(vcpu)); 2068 vmx_load_host_state(to_vmx(vcpu));
2069 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2070 return 0;
1312 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2071 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1313 if (msr) { 2072 if (msr) {
1314 vmx_load_host_state(to_vmx(vcpu)); 2073 vmx_load_host_state(to_vmx(vcpu));
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1380 return 1; 2139 return 1;
1381 /* Otherwise falls through */ 2140 /* Otherwise falls through */
1382 default: 2141 default:
2142 if (vmx_set_vmx_msr(vcpu, msr_index, data))
2143 break;
1383 msr = find_msr_entry(vmx, msr_index); 2144 msr = find_msr_entry(vmx, msr_index);
1384 if (msr) { 2145 if (msr) {
1385 vmx_load_host_state(vmx); 2146 vmx_load_host_state(vmx);
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage)
1469 if (read_cr4() & X86_CR4_VMXE) 2230 if (read_cr4() & X86_CR4_VMXE)
1470 return -EBUSY; 2231 return -EBUSY;
1471 2232
1472 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 2233 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
1473 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2234 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1474 2235
1475 test_bits = FEATURE_CONTROL_LOCKED; 2236 test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage)
1493 return 0; 2254 return 0;
1494} 2255}
1495 2256
1496static void vmclear_local_vcpus(void) 2257static void vmclear_local_loaded_vmcss(void)
1497{ 2258{
1498 int cpu = raw_smp_processor_id(); 2259 int cpu = raw_smp_processor_id();
1499 struct vcpu_vmx *vmx, *n; 2260 struct loaded_vmcs *v, *n;
1500 2261
1501 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), 2262 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
1502 local_vcpus_link) 2263 loaded_vmcss_on_cpu_link)
1503 __vcpu_clear(vmx); 2264 __loaded_vmcs_clear(v);
1504} 2265}
1505 2266
1506 2267
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
1515static void hardware_disable(void *garbage) 2276static void hardware_disable(void *garbage)
1516{ 2277{
1517 if (vmm_exclusive) { 2278 if (vmm_exclusive) {
1518 vmclear_local_vcpus(); 2279 vmclear_local_loaded_vmcss();
1519 kvm_cpu_vmxoff(); 2280 kvm_cpu_vmxoff();
1520 } 2281 }
1521 write_cr4(read_cr4() & ~X86_CR4_VMXE); 2282 write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
1696 free_pages((unsigned long)vmcs, vmcs_config.order); 2457 free_pages((unsigned long)vmcs, vmcs_config.order);
1697} 2458}
1698 2459
2460/*
2461 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2462 */
2463static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2464{
2465 if (!loaded_vmcs->vmcs)
2466 return;
2467 loaded_vmcs_clear(loaded_vmcs);
2468 free_vmcs(loaded_vmcs->vmcs);
2469 loaded_vmcs->vmcs = NULL;
2470}
2471
1699static void free_kvm_area(void) 2472static void free_kvm_area(void)
1700{ 2473{
1701 int cpu; 2474 int cpu;
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void)
1756 if (!cpu_has_vmx_ple()) 2529 if (!cpu_has_vmx_ple())
1757 ple_gap = 0; 2530 ple_gap = 0;
1758 2531
2532 if (nested)
2533 nested_vmx_setup_ctls_msrs();
2534
1759 return alloc_kvm_area(); 2535 return alloc_kvm_area();
1760} 2536}
1761 2537
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2041 (unsigned long *)&vcpu->arch.regs_dirty); 2817 (unsigned long *)&vcpu->arch.regs_dirty);
2042} 2818}
2043 2819
2044static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 2820static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
2045 2821
2046static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 2822static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2047 unsigned long cr0, 2823 unsigned long cr0,
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2139 vmcs_writel(GUEST_CR3, guest_cr3); 2915 vmcs_writel(GUEST_CR3, guest_cr3);
2140} 2916}
2141 2917
2142static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2918static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2143{ 2919{
2144 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 2920 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
2145 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 2921 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
2146 2922
2923 if (cr4 & X86_CR4_VMXE) {
2924 /*
2925 * To use VMXON (and later other VMX instructions), a guest
2926 * must first be able to turn on cr4.VMXE (see handle_vmon()).
2927 * So basically the check on whether to allow nested VMX
2928 * is here.
2929 */
2930 if (!nested_vmx_allowed(vcpu))
2931 return 1;
2932 } else if (to_vmx(vcpu)->nested.vmxon)
2933 return 1;
2934
2147 vcpu->arch.cr4 = cr4; 2935 vcpu->arch.cr4 = cr4;
2148 if (enable_ept) { 2936 if (enable_ept) {
2149 if (!is_paging(vcpu)) { 2937 if (!is_paging(vcpu)) {
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2156 2944
2157 vmcs_writel(CR4_READ_SHADOW, cr4); 2945 vmcs_writel(CR4_READ_SHADOW, cr4);
2158 vmcs_writel(GUEST_CR4, hw_cr4); 2946 vmcs_writel(GUEST_CR4, hw_cr4);
2947 return 0;
2159} 2948}
2160 2949
2161static void vmx_get_segment(struct kvm_vcpu *vcpu, 2950static void vmx_get_segment(struct kvm_vcpu *vcpu,
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2721} 3510}
2722 3511
2723/* 3512/*
3513 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3514 * will not change in the lifetime of the guest.
3515 * Note that host-state that does change is set elsewhere. E.g., host-state
3516 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3517 */
3518static void vmx_set_constant_host_state(void)
3519{
3520 u32 low32, high32;
3521 unsigned long tmpl;
3522 struct desc_ptr dt;
3523
3524 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
3525 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
3526 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
3527
3528 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3529 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3530 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3531 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3532 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3533
3534 native_store_idt(&dt);
3535 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3536
3537 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
3538 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3539
3540 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3541 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3542 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3543 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
3544
3545 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3546 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3547 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3548 }
3549}
3550
3551static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3552{
3553 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3554 if (enable_ept)
3555 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3556 if (is_guest_mode(&vmx->vcpu))
3557 vmx->vcpu.arch.cr4_guest_owned_bits &=
3558 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3559 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3560}
3561
3562static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3563{
3564 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3565 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
3566 exec_control &= ~CPU_BASED_TPR_SHADOW;
3567#ifdef CONFIG_X86_64
3568 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3569 CPU_BASED_CR8_LOAD_EXITING;
3570#endif
3571 }
3572 if (!enable_ept)
3573 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3574 CPU_BASED_CR3_LOAD_EXITING |
3575 CPU_BASED_INVLPG_EXITING;
3576 return exec_control;
3577}
3578
3579static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3580{
3581 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3582 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3583 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3584 if (vmx->vpid == 0)
3585 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3586 if (!enable_ept) {
3587 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3588 enable_unrestricted_guest = 0;
3589 }
3590 if (!enable_unrestricted_guest)
3591 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3592 if (!ple_gap)
3593 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3594 return exec_control;
3595}
3596
3597static void ept_set_mmio_spte_mask(void)
3598{
3599 /*
3600 * EPT Misconfigurations can be generated if the value of bits 2:0
3601 * of an EPT paging-structure entry is 110b (write/execute).
3602 * Also, magic bits (0xffull << 49) is set to quickly identify mmio
3603 * spte.
3604 */
3605 kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
3606}
3607
3608/*
2724 * Sets up the vmcs for emulated real mode. 3609 * Sets up the vmcs for emulated real mode.
2725 */ 3610 */
2726static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 3611static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2727{ 3612{
2728 u32 host_sysenter_cs, msr_low, msr_high; 3613#ifdef CONFIG_X86_64
2729 u32 junk;
2730 u64 host_pat;
2731 unsigned long a; 3614 unsigned long a;
2732 struct desc_ptr dt; 3615#endif
2733 int i; 3616 int i;
2734 unsigned long kvm_vmx_return;
2735 u32 exec_control;
2736 3617
2737 /* I/O */ 3618 /* I/O */
2738 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 3619 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2747 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 3628 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2748 vmcs_config.pin_based_exec_ctrl); 3629 vmcs_config.pin_based_exec_ctrl);
2749 3630
2750 exec_control = vmcs_config.cpu_based_exec_ctrl; 3631 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
2751 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2752 exec_control &= ~CPU_BASED_TPR_SHADOW;
2753#ifdef CONFIG_X86_64
2754 exec_control |= CPU_BASED_CR8_STORE_EXITING |
2755 CPU_BASED_CR8_LOAD_EXITING;
2756#endif
2757 }
2758 if (!enable_ept)
2759 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2760 CPU_BASED_CR3_LOAD_EXITING |
2761 CPU_BASED_INVLPG_EXITING;
2762 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2763 3632
2764 if (cpu_has_secondary_exec_ctrls()) { 3633 if (cpu_has_secondary_exec_ctrls()) {
2765 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3634 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
2766 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 3635 vmx_secondary_exec_control(vmx));
2767 exec_control &=
2768 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2769 if (vmx->vpid == 0)
2770 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2771 if (!enable_ept) {
2772 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2773 enable_unrestricted_guest = 0;
2774 }
2775 if (!enable_unrestricted_guest)
2776 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2777 if (!ple_gap)
2778 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2779 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2780 } 3636 }
2781 3637
2782 if (ple_gap) { 3638 if (ple_gap) {
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2784 vmcs_write32(PLE_WINDOW, ple_window); 3640 vmcs_write32(PLE_WINDOW, ple_window);
2785 } 3641 }
2786 3642
2787 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 3643 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2788 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 3644 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2789 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 3645 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2790 3646
2791 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2792 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2793 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2794
2795 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
2796 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2797 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2798 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 3647 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
2799 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 3648 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
2800 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3649 vmx_set_constant_host_state();
2801#ifdef CONFIG_X86_64 3650#ifdef CONFIG_X86_64
2802 rdmsrl(MSR_FS_BASE, a); 3651 rdmsrl(MSR_FS_BASE, a);
2803 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 3652 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2808 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 3657 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2809#endif 3658#endif
2810 3659
2811 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2812
2813 native_store_idt(&dt);
2814 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2815
2816 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2817 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2818 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 3660 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2819 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3661 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2820 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 3662 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2821 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3663 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2822 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 3664 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2823 3665
2824 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2825 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2826 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2827 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
2828 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2829 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2830
2831 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2832 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2833 host_pat = msr_low | ((u64) msr_high << 32);
2834 vmcs_write64(HOST_IA32_PAT, host_pat);
2835 }
2836 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 3666 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3667 u32 msr_low, msr_high;
3668 u64 host_pat;
2837 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); 3669 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2838 host_pat = msr_low | ((u64) msr_high << 32); 3670 host_pat = msr_low | ((u64) msr_high << 32);
2839 /* Write the default value follow host pat */ 3671 /* Write the default value follow host pat */
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2863 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 3695 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2864 3696
2865 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3697 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2866 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 3698 set_cr4_guest_host_mask(vmx);
2867 if (enable_ept)
2868 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2869 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2870 3699
2871 kvm_write_tsc(&vmx->vcpu, 0); 3700 kvm_write_tsc(&vmx->vcpu, 0);
2872 3701
@@ -2990,9 +3819,25 @@ out:
2990 return ret; 3819 return ret;
2991} 3820}
2992 3821
3822/*
3823 * In nested virtualization, check if L1 asked to exit on external interrupts.
3824 * For most existing hypervisors, this will always return true.
3825 */
3826static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
3827{
3828 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
3829 PIN_BASED_EXT_INTR_MASK;
3830}
3831
2993static void enable_irq_window(struct kvm_vcpu *vcpu) 3832static void enable_irq_window(struct kvm_vcpu *vcpu)
2994{ 3833{
2995 u32 cpu_based_vm_exec_control; 3834 u32 cpu_based_vm_exec_control;
3835 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
3836 /* We can get here when nested_run_pending caused
3837 * vmx_interrupt_allowed() to return false. In this case, do
3838 * nothing - the interrupt will be injected later.
3839 */
3840 return;
2996 3841
2997 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3842 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2998 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 3843 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
3049{ 3894{
3050 struct vcpu_vmx *vmx = to_vmx(vcpu); 3895 struct vcpu_vmx *vmx = to_vmx(vcpu);
3051 3896
3897 if (is_guest_mode(vcpu))
3898 return;
3899
3052 if (!cpu_has_virtual_nmis()) { 3900 if (!cpu_has_virtual_nmis()) {
3053 /* 3901 /*
3054 * Tracking the NMI-blocked state in software is built upon 3902 * Tracking the NMI-blocked state in software is built upon
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3115 3963
3116static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 3964static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
3117{ 3965{
3966 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
3967 struct vmcs12 *vmcs12;
3968 if (to_vmx(vcpu)->nested.nested_run_pending)
3969 return 0;
3970 nested_vmx_vmexit(vcpu);
3971 vmcs12 = get_vmcs12(vcpu);
3972 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
3973 vmcs12->vm_exit_intr_info = 0;
3974 /* fall through to normal code, but now in L1, not L2 */
3975 }
3976
3118 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 3977 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
3119 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3978 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3120 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 3979 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3356 hypercall[2] = 0xc1; 4215 hypercall[2] = 0xc1;
3357} 4216}
3358 4217
4218/* called to set cr0 as approriate for a mov-to-cr0 exit. */
4219static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4220{
4221 if (to_vmx(vcpu)->nested.vmxon &&
4222 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4223 return 1;
4224
4225 if (is_guest_mode(vcpu)) {
4226 /*
4227 * We get here when L2 changed cr0 in a way that did not change
4228 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4229 * but did change L0 shadowed bits. This can currently happen
4230 * with the TS bit: L0 may want to leave TS on (for lazy fpu
4231 * loading) while pretending to allow the guest to change it.
4232 */
4233 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
4234 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
4235 return 1;
4236 vmcs_writel(CR0_READ_SHADOW, val);
4237 return 0;
4238 } else
4239 return kvm_set_cr0(vcpu, val);
4240}
4241
4242static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4243{
4244 if (is_guest_mode(vcpu)) {
4245 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
4246 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
4247 return 1;
4248 vmcs_writel(CR4_READ_SHADOW, val);
4249 return 0;
4250 } else
4251 return kvm_set_cr4(vcpu, val);
4252}
4253
4254/* called to set cr0 as approriate for clts instruction exit. */
4255static void handle_clts(struct kvm_vcpu *vcpu)
4256{
4257 if (is_guest_mode(vcpu)) {
4258 /*
4259 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
4260 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
4261 * just pretend it's off (also in arch.cr0 for fpu_activate).
4262 */
4263 vmcs_writel(CR0_READ_SHADOW,
4264 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
4265 vcpu->arch.cr0 &= ~X86_CR0_TS;
4266 } else
4267 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4268}
4269
3359static int handle_cr(struct kvm_vcpu *vcpu) 4270static int handle_cr(struct kvm_vcpu *vcpu)
3360{ 4271{
3361 unsigned long exit_qualification, val; 4272 unsigned long exit_qualification, val;
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3372 trace_kvm_cr_write(cr, val); 4283 trace_kvm_cr_write(cr, val);
3373 switch (cr) { 4284 switch (cr) {
3374 case 0: 4285 case 0:
3375 err = kvm_set_cr0(vcpu, val); 4286 err = handle_set_cr0(vcpu, val);
3376 kvm_complete_insn_gp(vcpu, err); 4287 kvm_complete_insn_gp(vcpu, err);
3377 return 1; 4288 return 1;
3378 case 3: 4289 case 3:
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3380 kvm_complete_insn_gp(vcpu, err); 4291 kvm_complete_insn_gp(vcpu, err);
3381 return 1; 4292 return 1;
3382 case 4: 4293 case 4:
3383 err = kvm_set_cr4(vcpu, val); 4294 err = handle_set_cr4(vcpu, val);
3384 kvm_complete_insn_gp(vcpu, err); 4295 kvm_complete_insn_gp(vcpu, err);
3385 return 1; 4296 return 1;
3386 case 8: { 4297 case 8: {
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3398 }; 4309 };
3399 break; 4310 break;
3400 case 2: /* clts */ 4311 case 2: /* clts */
3401 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4312 handle_clts(vcpu);
3402 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 4313 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
3403 skip_emulated_instruction(vcpu); 4314 skip_emulated_instruction(vcpu);
3404 vmx_fpu_activate(vcpu); 4315 vmx_fpu_activate(vcpu);
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
3574 return 1; 4485 return 1;
3575} 4486}
3576 4487
3577static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3578{
3579 kvm_queue_exception(vcpu, UD_VECTOR);
3580 return 1;
3581}
3582
3583static int handle_invd(struct kvm_vcpu *vcpu) 4488static int handle_invd(struct kvm_vcpu *vcpu)
3584{ 4489{
3585 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4490 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3777static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 4682static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3778{ 4683{
3779 u64 sptes[4]; 4684 u64 sptes[4];
3780 int nr_sptes, i; 4685 int nr_sptes, i, ret;
3781 gpa_t gpa; 4686 gpa_t gpa;
3782 4687
3783 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4688 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3784 4689
4690 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
4691 if (likely(ret == 1))
4692 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
4693 EMULATE_DONE;
4694 if (unlikely(!ret))
4695 return 1;
4696
4697 /* It is the real ept misconfig */
3785 printk(KERN_ERR "EPT: Misconfiguration.\n"); 4698 printk(KERN_ERR "EPT: Misconfiguration.\n");
3786 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 4699 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3787 4700
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
3866} 4779}
3867 4780
3868/* 4781/*
4782 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
4783 * We could reuse a single VMCS for all the L2 guests, but we also want the
4784 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
4785 * allows keeping them loaded on the processor, and in the future will allow
4786 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
4787 * every entry if they never change.
4788 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
4789 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
4790 *
4791 * The following functions allocate and free a vmcs02 in this pool.
4792 */
4793
4794/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
4795static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
4796{
4797 struct vmcs02_list *item;
4798 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4799 if (item->vmptr == vmx->nested.current_vmptr) {
4800 list_move(&item->list, &vmx->nested.vmcs02_pool);
4801 return &item->vmcs02;
4802 }
4803
4804 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
4805 /* Recycle the least recently used VMCS. */
4806 item = list_entry(vmx->nested.vmcs02_pool.prev,
4807 struct vmcs02_list, list);
4808 item->vmptr = vmx->nested.current_vmptr;
4809 list_move(&item->list, &vmx->nested.vmcs02_pool);
4810 return &item->vmcs02;
4811 }
4812
4813 /* Create a new VMCS */
4814 item = (struct vmcs02_list *)
4815 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
4816 if (!item)
4817 return NULL;
4818 item->vmcs02.vmcs = alloc_vmcs();
4819 if (!item->vmcs02.vmcs) {
4820 kfree(item);
4821 return NULL;
4822 }
4823 loaded_vmcs_init(&item->vmcs02);
4824 item->vmptr = vmx->nested.current_vmptr;
4825 list_add(&(item->list), &(vmx->nested.vmcs02_pool));
4826 vmx->nested.vmcs02_num++;
4827 return &item->vmcs02;
4828}
4829
4830/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
4831static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
4832{
4833 struct vmcs02_list *item;
4834 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4835 if (item->vmptr == vmptr) {
4836 free_loaded_vmcs(&item->vmcs02);
4837 list_del(&item->list);
4838 kfree(item);
4839 vmx->nested.vmcs02_num--;
4840 return;
4841 }
4842}
4843
4844/*
4845 * Free all VMCSs saved for this vcpu, except the one pointed by
4846 * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
4847 * currently used, if running L2), and vmcs01 when running L2.
4848 */
4849static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
4850{
4851 struct vmcs02_list *item, *n;
4852 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
4853 if (vmx->loaded_vmcs != &item->vmcs02)
4854 free_loaded_vmcs(&item->vmcs02);
4855 list_del(&item->list);
4856 kfree(item);
4857 }
4858 vmx->nested.vmcs02_num = 0;
4859
4860 if (vmx->loaded_vmcs != &vmx->vmcs01)
4861 free_loaded_vmcs(&vmx->vmcs01);
4862}
4863
4864/*
4865 * Emulate the VMXON instruction.
4866 * Currently, we just remember that VMX is active, and do not save or even
4867 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4868 * do not currently need to store anything in that guest-allocated memory
4869 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4870 * argument is different from the VMXON pointer (which the spec says they do).
4871 */
4872static int handle_vmon(struct kvm_vcpu *vcpu)
4873{
4874 struct kvm_segment cs;
4875 struct vcpu_vmx *vmx = to_vmx(vcpu);
4876
4877 /* The Intel VMX Instruction Reference lists a bunch of bits that
4878 * are prerequisite to running VMXON, most notably cr4.VMXE must be
4879 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
4880 * Otherwise, we should fail with #UD. We test these now:
4881 */
4882 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
4883 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
4884 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4885 kvm_queue_exception(vcpu, UD_VECTOR);
4886 return 1;
4887 }
4888
4889 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4890 if (is_long_mode(vcpu) && !cs.l) {
4891 kvm_queue_exception(vcpu, UD_VECTOR);
4892 return 1;
4893 }
4894
4895 if (vmx_get_cpl(vcpu)) {
4896 kvm_inject_gp(vcpu, 0);
4897 return 1;
4898 }
4899
4900 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
4901 vmx->nested.vmcs02_num = 0;
4902
4903 vmx->nested.vmxon = true;
4904
4905 skip_emulated_instruction(vcpu);
4906 return 1;
4907}
4908
4909/*
4910 * Intel's VMX Instruction Reference specifies a common set of prerequisites
4911 * for running VMX instructions (except VMXON, whose prerequisites are
4912 * slightly different). It also specifies what exception to inject otherwise.
4913 */
4914static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
4915{
4916 struct kvm_segment cs;
4917 struct vcpu_vmx *vmx = to_vmx(vcpu);
4918
4919 if (!vmx->nested.vmxon) {
4920 kvm_queue_exception(vcpu, UD_VECTOR);
4921 return 0;
4922 }
4923
4924 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4925 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
4926 (is_long_mode(vcpu) && !cs.l)) {
4927 kvm_queue_exception(vcpu, UD_VECTOR);
4928 return 0;
4929 }
4930
4931 if (vmx_get_cpl(vcpu)) {
4932 kvm_inject_gp(vcpu, 0);
4933 return 0;
4934 }
4935
4936 return 1;
4937}
4938
4939/*
4940 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
4941 * just stops using VMX.
4942 */
4943static void free_nested(struct vcpu_vmx *vmx)
4944{
4945 if (!vmx->nested.vmxon)
4946 return;
4947 vmx->nested.vmxon = false;
4948 if (vmx->nested.current_vmptr != -1ull) {
4949 kunmap(vmx->nested.current_vmcs12_page);
4950 nested_release_page(vmx->nested.current_vmcs12_page);
4951 vmx->nested.current_vmptr = -1ull;
4952 vmx->nested.current_vmcs12 = NULL;
4953 }
4954 /* Unpin physical memory we referred to in current vmcs02 */
4955 if (vmx->nested.apic_access_page) {
4956 nested_release_page(vmx->nested.apic_access_page);
4957 vmx->nested.apic_access_page = 0;
4958 }
4959
4960 nested_free_all_saved_vmcss(vmx);
4961}
4962
/*
 * Emulate the VMXOFF instruction: release all nested state and step past
 * the instruction. If the permission check fails it has already queued the
 * exception, so nothing more is done. Always returns 1.
 */
static int handle_vmoff(struct kvm_vcpu *vcpu)
{
	if (nested_vmx_check_permission(vcpu)) {
		free_nested(to_vmx(vcpu));
		skip_emulated_instruction(vcpu);
	}
	return 1;
}
4972
4973/*
4974 * Decode the memory-address operand of a vmx instruction, as recorded on an
4975 * exit caused by such an instruction (run by a guest hypervisor).
4976 * On success, returns 0. When the operand is invalid, returns 1 and throws
4977 * #UD or #GP.
4978 */
4979static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
4980 unsigned long exit_qualification,
4981 u32 vmx_instruction_info, gva_t *ret)
4982{
4983 /*
4984 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4985 * Execution", on an exit, vmx_instruction_info holds most of the
4986 * addressing components of the operand. Only the displacement part
4987 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4988 * For how an actual address is calculated from all these components,
4989 * refer to Vol. 1, "Operand Addressing".
4990 */
4991 int scaling = vmx_instruction_info & 3;
4992 int addr_size = (vmx_instruction_info >> 7) & 7;
4993 bool is_reg = vmx_instruction_info & (1u << 10);
4994 int seg_reg = (vmx_instruction_info >> 15) & 7;
4995 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4996 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4997 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4998 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4999
5000 if (is_reg) {
5001 kvm_queue_exception(vcpu, UD_VECTOR);
5002 return 1;
5003 }
5004
5005 /* Addr = segment_base + offset */
5006 /* offset = base + [index * scale] + displacement */
5007 *ret = vmx_get_segment_base(vcpu, seg_reg);
5008 if (base_is_valid)
5009 *ret += kvm_register_read(vcpu, base_reg);
5010 if (index_is_valid)
5011 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
5012 *ret += exit_qualification; /* holds the displacement */
5013
5014 if (addr_size == 1) /* 32 bit */
5015 *ret &= 0xffffffff;
5016
5017 /*
5018 * TODO: throw #GP (and return 1) in various cases that the VM*
5019 * instructions require it - e.g., offset beyond segment limit,
5020 * unusable or unreadable/unwritable segment, non-canonical 64-bit
5021 * address, and so on. Currently these are not checked.
5022 */
5023 return 0;
5024}
5025
5026/*
5027 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5028 * set the success or error code of an emulated VMX instruction, as specified
5029 * by Vol 2B, VMX Instruction Reference, "Conventions".
5030 */
5031static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5032{
5033 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5034 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5035 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5036}
5037
5038static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5039{
5040 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5041 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5042 X86_EFLAGS_SF | X86_EFLAGS_OF))
5043 | X86_EFLAGS_CF);
5044}
5045
5046static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5047 u32 vm_instruction_error)
5048{
5049 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5050 /*
5051 * failValid writes the error number to the current VMCS, which
5052 * can't be done there isn't a current VMCS.
5053 */
5054 nested_vmx_failInvalid(vcpu);
5055 return;
5056 }
5057 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5058 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5059 X86_EFLAGS_SF | X86_EFLAGS_OF))
5060 | X86_EFLAGS_ZF);
5061 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5062}
5063
5064/* Emulate the VMCLEAR instruction */
5065static int handle_vmclear(struct kvm_vcpu *vcpu)
5066{
5067 struct vcpu_vmx *vmx = to_vmx(vcpu);
5068 gva_t gva;
5069 gpa_t vmptr;
5070 struct vmcs12 *vmcs12;
5071 struct page *page;
5072 struct x86_exception e;
5073
5074 if (!nested_vmx_check_permission(vcpu))
5075 return 1;
5076
5077 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5078 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5079 return 1;
5080
5081 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5082 sizeof(vmptr), &e)) {
5083 kvm_inject_page_fault(vcpu, &e);
5084 return 1;
5085 }
5086
5087 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5088 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5089 skip_emulated_instruction(vcpu);
5090 return 1;
5091 }
5092
5093 if (vmptr == vmx->nested.current_vmptr) {
5094 kunmap(vmx->nested.current_vmcs12_page);
5095 nested_release_page(vmx->nested.current_vmcs12_page);
5096 vmx->nested.current_vmptr = -1ull;
5097 vmx->nested.current_vmcs12 = NULL;
5098 }
5099
5100 page = nested_get_page(vcpu, vmptr);
5101 if (page == NULL) {
5102 /*
5103 * For accurate processor emulation, VMCLEAR beyond available
5104 * physical memory should do nothing at all. However, it is
5105 * possible that a nested vmx bug, not a guest hypervisor bug,
5106 * resulted in this case, so let's shut down before doing any
5107 * more damage:
5108 */
5109 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5110 return 1;
5111 }
5112 vmcs12 = kmap(page);
5113 vmcs12->launch_state = 0;
5114 kunmap(page);
5115 nested_release_page(page);
5116
5117 nested_free_vmcs02(vmx, vmptr);
5118
5119 skip_emulated_instruction(vcpu);
5120 nested_vmx_succeed(vcpu);
5121 return 1;
5122}
5123
5124static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
5125
5126/* Emulate the VMLAUNCH instruction */
5127static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5128{
5129 return nested_vmx_run(vcpu, true);
5130}
5131
5132/* Emulate the VMRESUME instruction */
5133static int handle_vmresume(struct kvm_vcpu *vcpu)
5134{
5135
5136 return nested_vmx_run(vcpu, false);
5137}
5138
/*
 * Width classes of a VMCS field. The numeric values equal what
 * vmcs_field_type() extracts from bits 14:13 of a field encoding.
 */
enum vmcs_field_type {
	VMCS_FIELD_TYPE_U16 = 0,
	VMCS_FIELD_TYPE_U64 = 1,
	VMCS_FIELD_TYPE_U32 = 2,
	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
};

/*
 * Return the width class (enum vmcs_field_type) of the VMCS field with
 * encoding @field: bits 14:13 of the encoding, except that an encoding with
 * bit 0 set is always reported as 32 bits wide.
 */
static inline int vmcs_field_type(unsigned long field)
{
	/* the *_HIGH fields are all 32 bit */
	return (field & 0x1) ? VMCS_FIELD_TYPE_U32
			     : (int)((field >> 13) & 0x3);
}
5152
/*
 * Return nonzero if the VMCS field with encoding @field is read-only, i.e.
 * if bits 11:10 of the encoding equal 1.
 */
static inline int vmcs_field_readonly(unsigned long field)
{
	unsigned long type_bits = (field >> 10) & 0x3;

	return type_bits == 1;
}
5157
5158/*
5159 * Read a vmcs12 field. Since these can have varying lengths and we return
5160 * one type, we chose the biggest type (u64) and zero-extend the return value
5161 * to that size. Note that the caller, handle_vmread, might need to use only
5162 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
5163 * 64-bit fields are to be returned).
5164 */
5165static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5166 unsigned long field, u64 *ret)
5167{
5168 short offset = vmcs_field_to_offset(field);
5169 char *p;
5170
5171 if (offset < 0)
5172 return 0;
5173
5174 p = ((char *)(get_vmcs12(vcpu))) + offset;
5175
5176 switch (vmcs_field_type(field)) {
5177 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5178 *ret = *((natural_width *)p);
5179 return 1;
5180 case VMCS_FIELD_TYPE_U16:
5181 *ret = *((u16 *)p);
5182 return 1;
5183 case VMCS_FIELD_TYPE_U32:
5184 *ret = *((u32 *)p);
5185 return 1;
5186 case VMCS_FIELD_TYPE_U64:
5187 *ret = *((u64 *)p);
5188 return 1;
5189 default:
5190 return 0; /* can never happen. */
5191 }
5192}
5193
5194/*
5195 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
5196 * used before) all generate the same failure when it is missing.
5197 */
5198static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
5199{
5200 struct vcpu_vmx *vmx = to_vmx(vcpu);
5201 if (vmx->nested.current_vmptr == -1ull) {
5202 nested_vmx_failInvalid(vcpu);
5203 skip_emulated_instruction(vcpu);
5204 return 0;
5205 }
5206 return 1;
5207}
5208
5209static int handle_vmread(struct kvm_vcpu *vcpu)
5210{
5211 unsigned long field;
5212 u64 field_value;
5213 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5214 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5215 gva_t gva = 0;
5216
5217 if (!nested_vmx_check_permission(vcpu) ||
5218 !nested_vmx_check_vmcs12(vcpu))
5219 return 1;
5220
5221 /* Decode instruction info and find the field to read */
5222 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5223 /* Read the field, zero-extended to a u64 field_value */
5224 if (!vmcs12_read_any(vcpu, field, &field_value)) {
5225 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5226 skip_emulated_instruction(vcpu);
5227 return 1;
5228 }
5229 /*
5230 * Now copy part of this value to register or memory, as requested.
5231 * Note that the number of bits actually copied is 32 or 64 depending
5232 * on the guest's mode (32 or 64 bit), not on the given field's length.
5233 */
5234 if (vmx_instruction_info & (1u << 10)) {
5235 kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
5236 field_value);
5237 } else {
5238 if (get_vmx_mem_address(vcpu, exit_qualification,
5239 vmx_instruction_info, &gva))
5240 return 1;
5241 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
5242 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
5243 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
5244 }
5245
5246 nested_vmx_succeed(vcpu);
5247 skip_emulated_instruction(vcpu);
5248 return 1;
5249}
5250
5251
5252static int handle_vmwrite(struct kvm_vcpu *vcpu)
5253{
5254 unsigned long field;
5255 gva_t gva;
5256 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5257 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5258 char *p;
5259 short offset;
5260 /* The value to write might be 32 or 64 bits, depending on L1's long
5261 * mode, and eventually we need to write that into a field of several
5262 * possible lengths. The code below first zero-extends the value to 64
5263 * bit (field_value), and then copies only the approriate number of
5264 * bits into the vmcs12 field.
5265 */
5266 u64 field_value = 0;
5267 struct x86_exception e;
5268
5269 if (!nested_vmx_check_permission(vcpu) ||
5270 !nested_vmx_check_vmcs12(vcpu))
5271 return 1;
5272
5273 if (vmx_instruction_info & (1u << 10))
5274 field_value = kvm_register_read(vcpu,
5275 (((vmx_instruction_info) >> 3) & 0xf));
5276 else {
5277 if (get_vmx_mem_address(vcpu, exit_qualification,
5278 vmx_instruction_info, &gva))
5279 return 1;
5280 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
5281 &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
5282 kvm_inject_page_fault(vcpu, &e);
5283 return 1;
5284 }
5285 }
5286
5287
5288 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5289 if (vmcs_field_readonly(field)) {
5290 nested_vmx_failValid(vcpu,
5291 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5292 skip_emulated_instruction(vcpu);
5293 return 1;
5294 }
5295
5296 offset = vmcs_field_to_offset(field);
5297 if (offset < 0) {
5298 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5299 skip_emulated_instruction(vcpu);
5300 return 1;
5301 }
5302 p = ((char *) get_vmcs12(vcpu)) + offset;
5303
5304 switch (vmcs_field_type(field)) {
5305 case VMCS_FIELD_TYPE_U16:
5306 *(u16 *)p = field_value;
5307 break;
5308 case VMCS_FIELD_TYPE_U32:
5309 *(u32 *)p = field_value;
5310 break;
5311 case VMCS_FIELD_TYPE_U64:
5312 *(u64 *)p = field_value;
5313 break;
5314 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5315 *(natural_width *)p = field_value;
5316 break;
5317 default:
5318 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5319 skip_emulated_instruction(vcpu);
5320 return 1;
5321 }
5322
5323 nested_vmx_succeed(vcpu);
5324 skip_emulated_instruction(vcpu);
5325 return 1;
5326}
5327
5328/* Emulate the VMPTRLD instruction */
5329static int handle_vmptrld(struct kvm_vcpu *vcpu)
5330{
5331 struct vcpu_vmx *vmx = to_vmx(vcpu);
5332 gva_t gva;
5333 gpa_t vmptr;
5334 struct x86_exception e;
5335
5336 if (!nested_vmx_check_permission(vcpu))
5337 return 1;
5338
5339 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5340 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5341 return 1;
5342
5343 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5344 sizeof(vmptr), &e)) {
5345 kvm_inject_page_fault(vcpu, &e);
5346 return 1;
5347 }
5348
5349 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5350 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5351 skip_emulated_instruction(vcpu);
5352 return 1;
5353 }
5354
5355 if (vmx->nested.current_vmptr != vmptr) {
5356 struct vmcs12 *new_vmcs12;
5357 struct page *page;
5358 page = nested_get_page(vcpu, vmptr);
5359 if (page == NULL) {
5360 nested_vmx_failInvalid(vcpu);
5361 skip_emulated_instruction(vcpu);
5362 return 1;
5363 }
5364 new_vmcs12 = kmap(page);
5365 if (new_vmcs12->revision_id != VMCS12_REVISION) {
5366 kunmap(page);
5367 nested_release_page_clean(page);
5368 nested_vmx_failValid(vcpu,
5369 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5370 skip_emulated_instruction(vcpu);
5371 return 1;
5372 }
5373 if (vmx->nested.current_vmptr != -1ull) {
5374 kunmap(vmx->nested.current_vmcs12_page);
5375 nested_release_page(vmx->nested.current_vmcs12_page);
5376 }
5377
5378 vmx->nested.current_vmptr = vmptr;
5379 vmx->nested.current_vmcs12 = new_vmcs12;
5380 vmx->nested.current_vmcs12_page = page;
5381 }
5382
5383 nested_vmx_succeed(vcpu);
5384 skip_emulated_instruction(vcpu);
5385 return 1;
5386}
5387
5388/* Emulate the VMPTRST instruction */
5389static int handle_vmptrst(struct kvm_vcpu *vcpu)
5390{
5391 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5392 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5393 gva_t vmcs_gva;
5394 struct x86_exception e;
5395
5396 if (!nested_vmx_check_permission(vcpu))
5397 return 1;
5398
5399 if (get_vmx_mem_address(vcpu, exit_qualification,
5400 vmx_instruction_info, &vmcs_gva))
5401 return 1;
5402 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
5403 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
5404 (void *)&to_vmx(vcpu)->nested.current_vmptr,
5405 sizeof(u64), &e)) {
5406 kvm_inject_page_fault(vcpu, &e);
5407 return 1;
5408 }
5409 nested_vmx_succeed(vcpu);
5410 skip_emulated_instruction(vcpu);
5411 return 1;
5412}
5413
5414/*
3869 * The exit handlers return 1 if the exit was handled fully and guest execution 5415 * The exit handlers return 1 if the exit was handled fully and guest execution
3870 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5416 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3871 * to be done to userspace and return 0. 5417 * to be done to userspace and return 0.
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3886 [EXIT_REASON_INVD] = handle_invd, 5432 [EXIT_REASON_INVD] = handle_invd,
3887 [EXIT_REASON_INVLPG] = handle_invlpg, 5433 [EXIT_REASON_INVLPG] = handle_invlpg,
3888 [EXIT_REASON_VMCALL] = handle_vmcall, 5434 [EXIT_REASON_VMCALL] = handle_vmcall,
3889 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 5435 [EXIT_REASON_VMCLEAR] = handle_vmclear,
3890 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, 5436 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
3891 [EXIT_REASON_VMPTRLD] = handle_vmx_insn, 5437 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
3892 [EXIT_REASON_VMPTRST] = handle_vmx_insn, 5438 [EXIT_REASON_VMPTRST] = handle_vmptrst,
3893 [EXIT_REASON_VMREAD] = handle_vmx_insn, 5439 [EXIT_REASON_VMREAD] = handle_vmread,
3894 [EXIT_REASON_VMRESUME] = handle_vmx_insn, 5440 [EXIT_REASON_VMRESUME] = handle_vmresume,
3895 [EXIT_REASON_VMWRITE] = handle_vmx_insn, 5441 [EXIT_REASON_VMWRITE] = handle_vmwrite,
3896 [EXIT_REASON_VMOFF] = handle_vmx_insn, 5442 [EXIT_REASON_VMOFF] = handle_vmoff,
3897 [EXIT_REASON_VMON] = handle_vmx_insn, 5443 [EXIT_REASON_VMON] = handle_vmon,
3898 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5444 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3899 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5445 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3900 [EXIT_REASON_WBINVD] = handle_wbinvd, 5446 [EXIT_REASON_WBINVD] = handle_wbinvd,
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3911static const int kvm_vmx_max_exit_handlers = 5457static const int kvm_vmx_max_exit_handlers =
3912 ARRAY_SIZE(kvm_vmx_exit_handlers); 5458 ARRAY_SIZE(kvm_vmx_exit_handlers);
3913 5459
5460/*
5461 * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
5462 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5463 * disinterest in the current event (read or write a specific MSR) by using an
5464 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5465 */
5466static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5467 struct vmcs12 *vmcs12, u32 exit_reason)
5468{
5469 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5470 gpa_t bitmap;
5471
5472 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
5473 return 1;
5474
5475 /*
5476 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5477 * for the four combinations of read/write and low/high MSR numbers.
5478 * First we need to figure out which of the four to use:
5479 */
5480 bitmap = vmcs12->msr_bitmap;
5481 if (exit_reason == EXIT_REASON_MSR_WRITE)
5482 bitmap += 2048;
5483 if (msr_index >= 0xc0000000) {
5484 msr_index -= 0xc0000000;
5485 bitmap += 1024;
5486 }
5487
5488 /* Then read the msr_index'th bit from this bitmap: */
5489 if (msr_index < 1024*8) {
5490 unsigned char b;
5491 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
5492 return 1 & (b >> (msr_index & 7));
5493 } else
5494 return 1; /* let L1 handle the wrong parameter */
5495}
5496
5497/*
5498 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5499 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5500 * intercept (via guest_host_mask etc.) the current event.
5501 */
5502static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5503 struct vmcs12 *vmcs12)
5504{
5505 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5506 int cr = exit_qualification & 15;
5507 int reg = (exit_qualification >> 8) & 15;
5508 unsigned long val = kvm_register_read(vcpu, reg);
5509
5510 switch ((exit_qualification >> 4) & 3) {
5511 case 0: /* mov to cr */
5512 switch (cr) {
5513 case 0:
5514 if (vmcs12->cr0_guest_host_mask &
5515 (val ^ vmcs12->cr0_read_shadow))
5516 return 1;
5517 break;
5518 case 3:
5519 if ((vmcs12->cr3_target_count >= 1 &&
5520 vmcs12->cr3_target_value0 == val) ||
5521 (vmcs12->cr3_target_count >= 2 &&
5522 vmcs12->cr3_target_value1 == val) ||
5523 (vmcs12->cr3_target_count >= 3 &&
5524 vmcs12->cr3_target_value2 == val) ||
5525 (vmcs12->cr3_target_count >= 4 &&
5526 vmcs12->cr3_target_value3 == val))
5527 return 0;
5528 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5529 return 1;
5530 break;
5531 case 4:
5532 if (vmcs12->cr4_guest_host_mask &
5533 (vmcs12->cr4_read_shadow ^ val))
5534 return 1;
5535 break;
5536 case 8:
5537 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5538 return 1;
5539 break;
5540 }
5541 break;
5542 case 2: /* clts */
5543 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5544 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5545 return 1;
5546 break;
5547 case 1: /* mov from cr */
5548 switch (cr) {
5549 case 3:
5550 if (vmcs12->cpu_based_vm_exec_control &
5551 CPU_BASED_CR3_STORE_EXITING)
5552 return 1;
5553 break;
5554 case 8:
5555 if (vmcs12->cpu_based_vm_exec_control &
5556 CPU_BASED_CR8_STORE_EXITING)
5557 return 1;
5558 break;
5559 }
5560 break;
5561 case 3: /* lmsw */
5562 /*
5563 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5564 * cr0. Other attempted changes are ignored, with no exit.
5565 */
5566 if (vmcs12->cr0_guest_host_mask & 0xe &
5567 (val ^ vmcs12->cr0_read_shadow))
5568 return 1;
5569 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5570 !(vmcs12->cr0_read_shadow & 0x1) &&
5571 (val & 0x1))
5572 return 1;
5573 break;
5574 }
5575 return 0;
5576}
5577
5578/*
5579 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5580 * should handle it ourselves in L0 (and then continue L2). Only call this
5581 * when in is_guest_mode (L2).
5582 */
5583static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
5584{
5585 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
5586 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5587 struct vcpu_vmx *vmx = to_vmx(vcpu);
5588 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5589
5590 if (vmx->nested.nested_run_pending)
5591 return 0;
5592
5593 if (unlikely(vmx->fail)) {
5594 printk(KERN_INFO "%s failed vm entry %x\n",
5595 __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
5596 return 1;
5597 }
5598
5599 switch (exit_reason) {
5600 case EXIT_REASON_EXCEPTION_NMI:
5601 if (!is_exception(intr_info))
5602 return 0;
5603 else if (is_page_fault(intr_info))
5604 return enable_ept;
5605 return vmcs12->exception_bitmap &
5606 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5607 case EXIT_REASON_EXTERNAL_INTERRUPT:
5608 return 0;
5609 case EXIT_REASON_TRIPLE_FAULT:
5610 return 1;
5611 case EXIT_REASON_PENDING_INTERRUPT:
5612 case EXIT_REASON_NMI_WINDOW:
5613 /*
5614 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
5615 * (aka Interrupt Window Exiting) only when L1 turned it on,
5616 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
5617 * Same for NMI Window Exiting.
5618 */
5619 return 1;
5620 case EXIT_REASON_TASK_SWITCH:
5621 return 1;
5622 case EXIT_REASON_CPUID:
5623 return 1;
5624 case EXIT_REASON_HLT:
5625 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5626 case EXIT_REASON_INVD:
5627 return 1;
5628 case EXIT_REASON_INVLPG:
5629 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5630 case EXIT_REASON_RDPMC:
5631 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5632 case EXIT_REASON_RDTSC:
5633 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5634 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5635 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5636 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
5637 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
5638 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5639 /*
5640 * VMX instructions trap unconditionally. This allows L1 to
5641 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5642 */
5643 return 1;
5644 case EXIT_REASON_CR_ACCESS:
5645 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5646 case EXIT_REASON_DR_ACCESS:
5647 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5648 case EXIT_REASON_IO_INSTRUCTION:
5649 /* TODO: support IO bitmaps */
5650 return 1;
5651 case EXIT_REASON_MSR_READ:
5652 case EXIT_REASON_MSR_WRITE:
5653 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5654 case EXIT_REASON_INVALID_STATE:
5655 return 1;
5656 case EXIT_REASON_MWAIT_INSTRUCTION:
5657 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5658 case EXIT_REASON_MONITOR_INSTRUCTION:
5659 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5660 case EXIT_REASON_PAUSE_INSTRUCTION:
5661 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5662 nested_cpu_has2(vmcs12,
5663 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5664 case EXIT_REASON_MCE_DURING_VMENTRY:
5665 return 0;
5666 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5667 return 1;
5668 case EXIT_REASON_APIC_ACCESS:
5669 return nested_cpu_has2(vmcs12,
5670 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
5671 case EXIT_REASON_EPT_VIOLATION:
5672 case EXIT_REASON_EPT_MISCONFIG:
5673 return 0;
5674 case EXIT_REASON_WBINVD:
5675 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5676 case EXIT_REASON_XSETBV:
5677 return 1;
5678 default:
5679 return 1;
5680 }
5681}
5682
3914static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 5683static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3915{ 5684{
3916 *info1 = vmcs_readl(EXIT_QUALIFICATION); 5685 *info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3933 if (vmx->emulation_required && emulate_invalid_guest_state) 5702 if (vmx->emulation_required && emulate_invalid_guest_state)
3934 return handle_invalid_guest_state(vcpu); 5703 return handle_invalid_guest_state(vcpu);
3935 5704
5705 /*
5706 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
5707 * we did not inject a still-pending event to L1 now because of
5708 * nested_run_pending, we need to re-enable this bit.
5709 */
5710 if (vmx->nested.nested_run_pending)
5711 kvm_make_request(KVM_REQ_EVENT, vcpu);
5712
5713 if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
5714 exit_reason == EXIT_REASON_VMRESUME))
5715 vmx->nested.nested_run_pending = 1;
5716 else
5717 vmx->nested.nested_run_pending = 0;
5718
5719 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
5720 nested_vmx_vmexit(vcpu);
5721 return 1;
5722 }
5723
3936 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 5724 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3937 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 5725 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3938 vcpu->run->fail_entry.hardware_entry_failure_reason 5726 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3955 "(0x%x) and exit reason is 0x%x\n", 5743 "(0x%x) and exit reason is 0x%x\n",
3956 __func__, vectoring_info, exit_reason); 5744 __func__, vectoring_info, exit_reason);
3957 5745
3958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 5746 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5747 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
5748 get_vmcs12(vcpu), vcpu)))) {
3959 if (vmx_interrupt_allowed(vcpu)) { 5749 if (vmx_interrupt_allowed(vcpu)) {
3960 vmx->soft_vnmi_blocked = 0; 5750 vmx->soft_vnmi_blocked = 0;
3961 } else if (vmx->vnmi_blocked_time > 1000000000LL && 5751 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4118 5908
4119static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 5909static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4120{ 5910{
5911 if (is_guest_mode(&vmx->vcpu))
5912 return;
4121 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, 5913 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
4122 VM_EXIT_INSTRUCTION_LEN, 5914 VM_EXIT_INSTRUCTION_LEN,
4123 IDT_VECTORING_ERROR_CODE); 5915 IDT_VECTORING_ERROR_CODE);
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4125 5917
4126static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 5918static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
4127{ 5919{
5920 if (is_guest_mode(vcpu))
5921 return;
4128 __vmx_complete_interrupts(to_vmx(vcpu), 5922 __vmx_complete_interrupts(to_vmx(vcpu),
4129 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 5923 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
4130 VM_ENTRY_INSTRUCTION_LEN, 5924 VM_ENTRY_INSTRUCTION_LEN,
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4145{ 5939{
4146 struct vcpu_vmx *vmx = to_vmx(vcpu); 5940 struct vcpu_vmx *vmx = to_vmx(vcpu);
4147 5941
5942 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
5943 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5944 if (vmcs12->idt_vectoring_info_field &
5945 VECTORING_INFO_VALID_MASK) {
5946 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5947 vmcs12->idt_vectoring_info_field);
5948 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5949 vmcs12->vm_exit_instruction_len);
5950 if (vmcs12->idt_vectoring_info_field &
5951 VECTORING_INFO_DELIVER_CODE_MASK)
5952 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
5953 vmcs12->idt_vectoring_error_code);
5954 }
5955 }
5956
4148 /* Record the guest's net vcpu time for enforced NMI injections. */ 5957 /* Record the guest's net vcpu time for enforced NMI injections. */
4149 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 5958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
4150 vmx->entry_time = ktime_get(); 5959 vmx->entry_time = ktime_get();
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4167 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5976 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
4168 vmx_set_interrupt_shadow(vcpu, 0); 5977 vmx_set_interrupt_shadow(vcpu, 0);
4169 5978
5979 vmx->__launched = vmx->loaded_vmcs->launched;
4170 asm( 5980 asm(
4171 /* Store host registers */ 5981 /* Store host registers */
4172 "push %%"R"dx; push %%"R"bp;" 5982 "push %%"R"dx; push %%"R"bp;"
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4237 "pop %%"R"bp; pop %%"R"dx \n\t" 6047 "pop %%"R"bp; pop %%"R"dx \n\t"
4238 "setbe %c[fail](%0) \n\t" 6048 "setbe %c[fail](%0) \n\t"
4239 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 6049 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4240 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 6050 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
4241 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 6051 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
4242 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 6052 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
4243 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 6053 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4276 6086
4277 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6087 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4278 6088
6089 if (is_guest_mode(vcpu)) {
6090 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6091 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6092 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6093 vmcs12->idt_vectoring_error_code =
6094 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6095 vmcs12->vm_exit_instruction_len =
6096 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6097 }
6098 }
6099
4279 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 6100 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4280 vmx->launched = 1; 6101 vmx->loaded_vmcs->launched = 1;
4281 6102
4282 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6103 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4283 6104
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4289#undef R 6110#undef R
4290#undef Q 6111#undef Q
4291 6112
4292static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
4293{
4294 struct vcpu_vmx *vmx = to_vmx(vcpu);
4295
4296 if (vmx->vmcs) {
4297 vcpu_clear(vmx);
4298 free_vmcs(vmx->vmcs);
4299 vmx->vmcs = NULL;
4300 }
4301}
4302
4303static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6113static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4304{ 6114{
4305 struct vcpu_vmx *vmx = to_vmx(vcpu); 6115 struct vcpu_vmx *vmx = to_vmx(vcpu);
4306 6116
4307 free_vpid(vmx); 6117 free_vpid(vmx);
4308 vmx_free_vmcs(vcpu); 6118 free_nested(vmx);
6119 free_loaded_vmcs(vmx->loaded_vmcs);
4309 kfree(vmx->guest_msrs); 6120 kfree(vmx->guest_msrs);
4310 kvm_vcpu_uninit(vcpu); 6121 kvm_vcpu_uninit(vcpu);
4311 kmem_cache_free(kvm_vcpu_cache, vmx); 6122 kmem_cache_free(kvm_vcpu_cache, vmx);
4312} 6123}
4313 6124
4314static inline void vmcs_init(struct vmcs *vmcs)
4315{
4316 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4317
4318 if (!vmm_exclusive)
4319 kvm_cpu_vmxon(phys_addr);
4320
4321 vmcs_clear(vmcs);
4322
4323 if (!vmm_exclusive)
4324 kvm_cpu_vmxoff();
4325}
4326
4327static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 6125static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4328{ 6126{
4329 int err; 6127 int err;
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4345 goto uninit_vcpu; 6143 goto uninit_vcpu;
4346 } 6144 }
4347 6145
4348 vmx->vmcs = alloc_vmcs(); 6146 vmx->loaded_vmcs = &vmx->vmcs01;
4349 if (!vmx->vmcs) 6147 vmx->loaded_vmcs->vmcs = alloc_vmcs();
6148 if (!vmx->loaded_vmcs->vmcs)
4350 goto free_msrs; 6149 goto free_msrs;
4351 6150 if (!vmm_exclusive)
4352 vmcs_init(vmx->vmcs); 6151 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
6152 loaded_vmcs_init(vmx->loaded_vmcs);
6153 if (!vmm_exclusive)
6154 kvm_cpu_vmxoff();
4353 6155
4354 cpu = get_cpu(); 6156 cpu = get_cpu();
4355 vmx_vcpu_load(&vmx->vcpu, cpu); 6157 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4375 goto free_vmcs; 6177 goto free_vmcs;
4376 } 6178 }
4377 6179
6180 vmx->nested.current_vmptr = -1ull;
6181 vmx->nested.current_vmcs12 = NULL;
6182
4378 return &vmx->vcpu; 6183 return &vmx->vcpu;
4379 6184
4380free_vmcs: 6185free_vmcs:
4381 free_vmcs(vmx->vmcs); 6186 free_vmcs(vmx->loaded_vmcs->vmcs);
4382free_msrs: 6187free_msrs:
4383 kfree(vmx->guest_msrs); 6188 kfree(vmx->guest_msrs);
4384uninit_vcpu: 6189uninit_vcpu:
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4512 6317
4513static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6318static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4514{ 6319{
6320 if (func == 1 && nested)
6321 entry->ecx |= bit(X86_FEATURE_VMX);
6322}
6323
6324/*
6325 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
6326 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
6327 * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2
6328 * guest in a way that will both be appropriate to L1's requests, and our
6329 * needs. In addition to modifying the active vmcs (which is vmcs02), this
6330 * function also has additional necessary side-effects, like setting various
6331 * vcpu->arch fields.
6332 */
6333static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6334{
6335 struct vcpu_vmx *vmx = to_vmx(vcpu);
6336 u32 exec_control;
6337
6338 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
6339 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
6340 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
6341 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
6342 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
6343 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
6344 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
6345 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
6346 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
6347 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
6348 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
6349 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
6350 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
6351 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
6352 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
6353 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
6354 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
6355 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
6356 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
6357 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
6358 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
6359 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
6360 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
6361 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
6362 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
6363 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
6364 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
6365 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
6366 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
6367 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
6368 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
6369 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
6370 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
6371 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
6372 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
6373 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
6374
6375 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
6376 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6377 vmcs12->vm_entry_intr_info_field);
6378 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6379 vmcs12->vm_entry_exception_error_code);
6380 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6381 vmcs12->vm_entry_instruction_len);
6382 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
6383 vmcs12->guest_interruptibility_info);
6384 vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
6385 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
6386 vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
6387 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
6388 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
6389 vmcs12->guest_pending_dbg_exceptions);
6390 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
6391 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
6392
6393 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6394
6395 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
6396 (vmcs_config.pin_based_exec_ctrl |
6397 vmcs12->pin_based_vm_exec_control));
6398
6399 /*
6400 * Whether page-faults are trapped is determined by a combination of
6401 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
6402 * If enable_ept, L0 doesn't care about page faults and we should
6403 * set all of these to L1's desires. However, if !enable_ept, L0 does
6404 * care about (at least some) page faults, and because it is not easy
6405 * (if at all possible?) to merge L0 and L1's desires, we simply ask
6406 * to exit on each and every L2 page fault. This is done by setting
6407 * MASK=MATCH=0 and (see below) EB.PF=1.
6408 * Note that below we don't need special code to set EB.PF beyond the
6409 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
6410 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
6411 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
6412 *
6413 * A problem with this approach (when !enable_ept) is that L1 may be
6414 * injected with more page faults than it asked for. This could have
6415 * caused problems, but in practice existing hypervisors don't care.
6416 * To fix this, we will need to emulate the PFEC checking (on the L1
6417 * page tables), using walk_addr(), when injecting PFs to L1.
6418 */
6419 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
6420 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
6421 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
6422 enable_ept ? vmcs12->page_fault_error_code_match : 0);
6423
6424 if (cpu_has_secondary_exec_ctrls()) {
6425 u32 exec_control = vmx_secondary_exec_control(vmx);
6426 if (!vmx->rdtscp_enabled)
6427 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6428 /* Take the following fields only from vmcs12 */
6429 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6430 if (nested_cpu_has(vmcs12,
6431 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
6432 exec_control |= vmcs12->secondary_vm_exec_control;
6433
6434 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
6435 /*
6436 * Translate L1 physical address to host physical
6437 * address for vmcs02. Keep the page pinned, so this
6438 * physical address remains valid. We keep a reference
6439 * to it so we can release it later.
6440 */
6441 if (vmx->nested.apic_access_page) /* shouldn't happen */
6442 nested_release_page(vmx->nested.apic_access_page);
6443 vmx->nested.apic_access_page =
6444 nested_get_page(vcpu, vmcs12->apic_access_addr);
6445 /*
6446 * If translation failed, no matter: This feature asks
6447 * to exit when accessing the given address, and if it
6448 * can never be accessed, this feature won't do
6449 * anything anyway.
6450 */
6451 if (!vmx->nested.apic_access_page)
6452 exec_control &=
6453 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6454 else
6455 vmcs_write64(APIC_ACCESS_ADDR,
6456 page_to_phys(vmx->nested.apic_access_page));
6457 }
6458
6459 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6460 }
6461
6462
6463 /*
6464 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
6465 * Some constant fields are set here by vmx_set_constant_host_state().
6466 * Other fields are different per CPU, and will be set later when
6467 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
6468 */
6469 vmx_set_constant_host_state();
6470
6471 /*
6472 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
6473 * entry, but only if the current (host) sp changed from the value
6474 * we wrote last (vmx->host_rsp). This cache is no longer relevant
6475 * if we switch vmcs, and rather than hold a separate cache per vmcs,
6476 * here we just force the write to happen on entry.
6477 */
6478 vmx->host_rsp = 0;
6479
6480 exec_control = vmx_exec_control(vmx); /* L0's desires */
6481 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
6482 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
6483 exec_control &= ~CPU_BASED_TPR_SHADOW;
6484 exec_control |= vmcs12->cpu_based_vm_exec_control;
6485 /*
6486 * Merging of IO and MSR bitmaps not currently supported.
6487 * Rather, exit every time.
6488 */
6489 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
6490 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
6491 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
6492
6493 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
6494
6495 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
6496 * bitwise-or of what L1 wants to trap for L2, and what we want to
6497 * trap. Note that CR0.TS also needs updating - we do this later.
6498 */
6499 update_exception_bitmap(vcpu);
6500 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
6501 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6502
6503 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
6504 vmcs_write32(VM_EXIT_CONTROLS,
6505 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
6506 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
6507 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
6508
6509 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
6510 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
6511 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6512 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6513
6514
6515 set_cr4_guest_host_mask(vmx);
6516
6517 vmcs_write64(TSC_OFFSET,
6518 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
6519
6520 if (enable_vpid) {
6521 /*
6522 * Trivially support vpid by letting L2s share their parent
6523 * L1's vpid. TODO: move to a more elaborate solution, giving
6524 * each L2 its own vpid and exposing the vpid feature to L1.
6525 */
6526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6527 vmx_flush_tlb(vcpu);
6528 }
6529
6530 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
6531 vcpu->arch.efer = vmcs12->guest_ia32_efer;
6532 if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
6533 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6534 else
6535 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6536 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
6537 vmx_set_efer(vcpu, vcpu->arch.efer);
6538
6539 /*
6540 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
6541 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
6542 * The CR0_READ_SHADOW is what L2 should have expected to read given
6543 * the specifications by L1; It's not enough to take
6544 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
6545 * have more bits than L1 expected.
6546 */
6547 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
6548 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
6549
6550 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
6551 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
6552
6553 /* shadow page tables on either EPT or shadow page tables */
6554 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
6555 kvm_mmu_reset_context(vcpu);
6556
6557 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
6558 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
6559}
6560
6561/*
6562 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
6563 * for running an L2 nested guest.
6564 */
6565static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
6566{
6567 struct vmcs12 *vmcs12;
6568 struct vcpu_vmx *vmx = to_vmx(vcpu);
6569 int cpu;
6570 struct loaded_vmcs *vmcs02;
6571
6572 if (!nested_vmx_check_permission(vcpu) ||
6573 !nested_vmx_check_vmcs12(vcpu))
6574 return 1;
6575
6576 skip_emulated_instruction(vcpu);
6577 vmcs12 = get_vmcs12(vcpu);
6578
6579 /*
6580 * The nested entry process starts with enforcing various prerequisites
6581 * on vmcs12 as required by the Intel SDM, and act appropriately when
6582 * they fail: As the SDM explains, some conditions should cause the
6583 * instruction to fail, while others will cause the instruction to seem
6584 * to succeed, but return an EXIT_REASON_INVALID_STATE.
6585 * To speed up the normal (success) code path, we should avoid checking
6586 * for misconfigurations which will anyway be caught by the processor
6587 * when using the merged vmcs02.
6588 */
6589 if (vmcs12->launch_state == launch) {
6590 nested_vmx_failValid(vcpu,
6591 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
6592 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
6593 return 1;
6594 }
6595
6596 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
6597 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
6598 /*TODO: Also verify bits beyond physical address width are 0*/
6599 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6600 return 1;
6601 }
6602
6603 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
6604 !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
6605 /*TODO: Also verify bits beyond physical address width are 0*/
6606 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6607 return 1;
6608 }
6609
6610 if (vmcs12->vm_entry_msr_load_count > 0 ||
6611 vmcs12->vm_exit_msr_load_count > 0 ||
6612 vmcs12->vm_exit_msr_store_count > 0) {
6613 if (printk_ratelimit())
6614 printk(KERN_WARNING
6615 "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
6616 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6617 return 1;
6618 }
6619
6620 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6621 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
6622 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6623 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
6624 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6625 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
6626 !vmx_control_verify(vmcs12->vm_exit_controls,
6627 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
6628 !vmx_control_verify(vmcs12->vm_entry_controls,
6629 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
6630 {
6631 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6632 return 1;
6633 }
6634
6635 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6636 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6637 nested_vmx_failValid(vcpu,
6638 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
6639 return 1;
6640 }
6641
6642 if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6643 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6644 nested_vmx_entry_failure(vcpu, vmcs12,
6645 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
6646 return 1;
6647 }
6648 if (vmcs12->vmcs_link_pointer != -1ull) {
6649 nested_vmx_entry_failure(vcpu, vmcs12,
6650 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
6651 return 1;
6652 }
6653
6654 /*
6655 * We're finally done with prerequisite checking, and can start with
6656 * the nested entry.
6657 */
6658
6659 vmcs02 = nested_get_current_vmcs02(vmx);
6660 if (!vmcs02)
6661 return -ENOMEM;
6662
6663 enter_guest_mode(vcpu);
6664
6665 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
6666
6667 cpu = get_cpu();
6668 vmx->loaded_vmcs = vmcs02;
6669 vmx_vcpu_put(vcpu);
6670 vmx_vcpu_load(vcpu, cpu);
6671 vcpu->cpu = cpu;
6672 put_cpu();
6673
6674 vmcs12->launch_state = 1;
6675
6676 prepare_vmcs02(vcpu, vmcs12);
6677
6678 /*
6679 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
6680 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
6681 * returned as far as L1 is concerned. It will only return (and set
6682 * the success flag) when L2 exits (see nested_vmx_vmexit()).
6683 */
6684 return 1;
6685}
6686
6687/*
6688 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
6689 * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
6690 * This function returns the new value we should put in vmcs12.guest_cr0.
6691 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
6692 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
6693 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
6694 * didn't trap the bit, because if L1 did, so would L0).
6695 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
6696 * been modified by L2, and L1 knows it. So just leave the old value of
6697 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
6698 * isn't relevant, because if L0 traps this bit it can set it to anything.
6699 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
6700 * changed these bits, and therefore they need to be updated, but L0
6701 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
6702 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
6703 */
6704static inline unsigned long
6705vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6706{
6707 return
6708 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
6709 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
6710 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
6711 vcpu->arch.cr0_guest_owned_bits));
6712}
6713
6714static inline unsigned long
6715vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6716{
6717 return
6718 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
6719 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
6720 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
6721 vcpu->arch.cr4_guest_owned_bits));
6722}
6723
6724/*
6725 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
6726 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
6727 * and this function updates it to reflect the changes to the guest state while
6728 * L2 was running (and perhaps made some exits which were handled directly by L0
6729 * without going back to L1), and to reflect the exit reason.
6730 * Note that we do not have to copy here all VMCS fields, just those that
6731 * could have changed by the L2 guest or the exit - i.e., the guest-state and
6732 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
6733 * which already writes to vmcs12 directly.
6734 */
6735void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6736{
6737 /* update guest state fields: */
6738 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
6739 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
6740
6741 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
6742 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6743 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
6744 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
6745
6746 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
6747 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
6748 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
6749 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
6750 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
6751 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
6752 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
6753 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
6754 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
6755 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
6756 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
6757 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
6758 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
6759 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
6760 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
6761 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
6762 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
6763 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
6764 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
6765 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
6766 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
6767 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
6768 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
6769 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
6770 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
6771 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
6772 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
6773 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
6774 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
6775 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
6776 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
6777 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
6778 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
6779 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
6780 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
6781 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
6782
6783 vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
6784 vmcs12->guest_interruptibility_info =
6785 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
6786 vmcs12->guest_pending_dbg_exceptions =
6787 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
6788
6789 /* TODO: These cannot have changed unless we have MSR bitmaps and
6790 * the relevant bit asks not to trap the change */
6791 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
6792 if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
6793 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
6794 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
6795 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
6796 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
6797
6798 /* update exit information fields: */
6799
6800 vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
6801 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6802
6803 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6804 vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6805 vmcs12->idt_vectoring_info_field =
6806 vmcs_read32(IDT_VECTORING_INFO_FIELD);
6807 vmcs12->idt_vectoring_error_code =
6808 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6809 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6810 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6811
6812 /* clear vm-entry fields which are to be cleared on exit */
6813 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6814 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
6815}
6816
6817/*
6818 * Part of what we need to do when the nested L2 guest exits and we want to
6819 * run its L1 parent, is to reset L1's guest state to the host state specified
6820 * in vmcs12.
6821 * This function is to be called not only on normal nested exit, but also on
6822 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
6823 * Failures During or After Loading Guest State").
6824 * This function should be called when the active VMCS is L1's (vmcs01).
6825 */
6826void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6827{
6828 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
6829 vcpu->arch.efer = vmcs12->host_ia32_efer;
6830 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
6831 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6832 else
6833 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6834 vmx_set_efer(vcpu, vcpu->arch.efer);
6835
6836 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
6837 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
6838 /*
6839 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
6840 * actually changed, because it depends on the current state of
6841 * fpu_active (which may have changed).
6842 * Note that vmx_set_cr0 refers to efer set above.
6843 */
6844 kvm_set_cr0(vcpu, vmcs12->host_cr0);
6845 /*
6846 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
6847 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
6848 * but we also need to update cr0_guest_host_mask and exception_bitmap.
6849 */
6850 update_exception_bitmap(vcpu);
6851 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
6852 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6853
6854 /*
6855 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
6856 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
6857 */
6858 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
6859 kvm_set_cr4(vcpu, vmcs12->host_cr4);
6860
6861 /* shadow page tables on either EPT or shadow page tables */
6862 kvm_set_cr3(vcpu, vmcs12->host_cr3);
6863 kvm_mmu_reset_context(vcpu);
6864
6865 if (enable_vpid) {
6866 /*
6867 * Trivially support vpid by letting L2s share their parent
6868 * L1's vpid. TODO: move to a more elaborate solution, giving
6869 * each L2 its own vpid and exposing the vpid feature to L1.
6870 */
6871 vmx_flush_tlb(vcpu);
6872 }
6873
6874
6875 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
6876 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
6877 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
6878 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
6879 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
6880 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
6881 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
6882 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
6883 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
6884 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
6885 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
6886 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
6887 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
6888 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
6889 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
6890
6891 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
6892 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
6893 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6894 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
6895 vmcs12->host_ia32_perf_global_ctrl);
6896}
6897
6898/*
6899 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
6900 * and modify vmcs12 to make it see what it would expect to see there if
6901 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
6902 */
6903static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
6904{
6905 struct vcpu_vmx *vmx = to_vmx(vcpu);
6906 int cpu;
6907 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6908
6909 leave_guest_mode(vcpu);
6910 prepare_vmcs12(vcpu, vmcs12);
6911
6912 cpu = get_cpu();
6913 vmx->loaded_vmcs = &vmx->vmcs01;
6914 vmx_vcpu_put(vcpu);
6915 vmx_vcpu_load(vcpu, cpu);
6916 vcpu->cpu = cpu;
6917 put_cpu();
6918
6919 /* if no vmcs02 cache requested, remove the one we used */
6920 if (VMCS02_POOL_SIZE == 0)
6921 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
6922
6923 load_vmcs12_host_state(vcpu, vmcs12);
6924
6925 /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
6926 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6927
6928 /* This is needed for same reason as it was needed in prepare_vmcs02 */
6929 vmx->host_rsp = 0;
6930
6931 /* Unpin physical memory we referred to in vmcs02 */
6932 if (vmx->nested.apic_access_page) {
6933 nested_release_page(vmx->nested.apic_access_page);
6934 vmx->nested.apic_access_page = 0;
6935 }
6936
6937 /*
6938 * Exiting from L2 to L1, we're now back to L1 which thinks it just
6939 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
6940 * success or failure flag accordingly.
6941 */
6942 if (unlikely(vmx->fail)) {
6943 vmx->fail = 0;
6944 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
6945 } else
6946 nested_vmx_succeed(vcpu);
6947}
6948
6949/*
6950 * L1's failure to enter L2 is a subset of a normal exit, as explained in
6951 * 23.7 "VM-entry failures during or after loading guest state" (this also
6952 * lists the acceptable exit-reason and exit-qualification parameters).
6953 * It should only be called before L2 actually succeeded to run, and when
6954 * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
6955 */
6956static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
6957 struct vmcs12 *vmcs12,
6958 u32 reason, unsigned long qualification)
6959{
6960 load_vmcs12_host_state(vcpu, vmcs12);
6961 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
6962 vmcs12->exit_qualification = qualification;
6963 nested_vmx_succeed(vcpu);
4515} 6964}
4516 6965
4517static int vmx_check_intercept(struct kvm_vcpu *vcpu, 6966static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void)
4670 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7119 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
4671 7120
4672 if (enable_ept) { 7121 if (enable_ept) {
4673 bypass_guest_pf = 0;
4674 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7122 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4675 VMX_EPT_EXECUTABLE_MASK); 7123 VMX_EPT_EXECUTABLE_MASK);
7124 ept_set_mmio_spte_mask();
4676 kvm_enable_tdp(); 7125 kvm_enable_tdp();
4677 } else 7126 } else
4678 kvm_disable_tdp(); 7127 kvm_disable_tdp();
4679 7128
4680 if (bypass_guest_pf)
4681 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4682
4683 return 0; 7129 return 0;
4684 7130
4685out3: 7131out3: