Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--  arch/x86/kvm/vmx.c | 2784
 1 file changed, 2615 insertions(+), 169 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea421..e65a158dee64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@ | |||
43 | #include "trace.h" | 43 | #include "trace.h" |
44 | 44 | ||
45 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 45 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
46 | #define __ex_clear(x, reg) \ | ||
47 | ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) | ||
46 | 48 | ||
47 | MODULE_AUTHOR("Qumranet"); | 49 | MODULE_AUTHOR("Qumranet"); |
48 | MODULE_LICENSE("GPL"); | 50 | MODULE_LICENSE("GPL"); |
49 | 51 | ||
50 | static int __read_mostly bypass_guest_pf = 1; | ||
51 | module_param(bypass_guest_pf, bool, S_IRUGO); | ||
52 | |||
53 | static int __read_mostly enable_vpid = 1; | 52 | static int __read_mostly enable_vpid = 1; |
54 | module_param_named(vpid, enable_vpid, bool, 0444); | 53 | module_param_named(vpid, enable_vpid, bool, 0444); |
55 | 54 | ||
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO); | |||
72 | static int __read_mostly yield_on_hlt = 1; | 71 | static int __read_mostly yield_on_hlt = 1; |
73 | module_param(yield_on_hlt, bool, S_IRUGO); | 72 | module_param(yield_on_hlt, bool, S_IRUGO); |
74 | 73 | ||
74 | /* | ||
75 | * If nested=1, nested virtualization is supported, i.e., guests may use | ||
76 | * VMX and be a hypervisor for its own guests. If nested=0, guests may not | ||
77 | * use VMX instructions. | ||
78 | */ | ||
79 | static int __read_mostly nested = 0; | ||
80 | module_param(nested, bool, S_IRUGO); | ||
81 | |||
75 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 82 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
76 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 83 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
77 | #define KVM_GUEST_CR0_MASK \ | 84 | #define KVM_GUEST_CR0_MASK \ |
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | |||
109 | module_param(ple_window, int, S_IRUGO); | 116 | module_param(ple_window, int, S_IRUGO); |
110 | 117 | ||
111 | #define NR_AUTOLOAD_MSRS 1 | 118 | #define NR_AUTOLOAD_MSRS 1 |
119 | #define VMCS02_POOL_SIZE 1 | ||
112 | 120 | ||
113 | struct vmcs { | 121 | struct vmcs { |
114 | u32 revision_id; | 122 | u32 revision_id; |
@@ -116,17 +124,237 @@ struct vmcs { | |||
116 | char data[0]; | 124 | char data[0]; |
117 | }; | 125 | }; |
118 | 126 | ||
127 | /* | ||
128 | * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also | ||
129 | * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs | ||
130 | * loaded on this CPU (so we can clear them if the CPU goes down). | ||
131 | */ | ||
132 | struct loaded_vmcs { | ||
133 | struct vmcs *vmcs; | ||
134 | int cpu; | ||
135 | int launched; | ||
136 | struct list_head loaded_vmcss_on_cpu_link; | ||
137 | }; | ||
138 | |||
119 | struct shared_msr_entry { | 139 | struct shared_msr_entry { |
120 | unsigned index; | 140 | unsigned index; |
121 | u64 data; | 141 | u64 data; |
122 | u64 mask; | 142 | u64 mask; |
123 | }; | 143 | }; |
124 | 144 | ||
145 | /* | ||
146 | * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a | ||
147 | * single nested guest (L2), hence the name vmcs12. Any VMX implementation has | ||
148 | * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is | ||
149 | * stored in guest memory specified by VMPTRLD, but is opaque to the guest, | ||
150 | * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. | ||
151 | * More than one of these structures may exist, if L1 runs multiple L2 guests. | ||
152 | * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the | ||
153 | * underlying hardware which will be used to run L2. | ||
154 | * This structure is packed to ensure that its layout is identical across | ||
155 | * machines (necessary for live migration). | ||
156 | * If there are changes in this struct, VMCS12_REVISION must be changed. | ||
157 | */ | ||
158 | typedef u64 natural_width; | ||
159 | struct __packed vmcs12 { | ||
160 | /* According to the Intel spec, a VMCS region must start with the | ||
161 | * following two fields. Then follow implementation-specific data. | ||
162 | */ | ||
163 | u32 revision_id; | ||
164 | u32 abort; | ||
165 | |||
166 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
167 | u32 padding[7]; /* room for future expansion */ | ||
168 | |||
169 | u64 io_bitmap_a; | ||
170 | u64 io_bitmap_b; | ||
171 | u64 msr_bitmap; | ||
172 | u64 vm_exit_msr_store_addr; | ||
173 | u64 vm_exit_msr_load_addr; | ||
174 | u64 vm_entry_msr_load_addr; | ||
175 | u64 tsc_offset; | ||
176 | u64 virtual_apic_page_addr; | ||
177 | u64 apic_access_addr; | ||
178 | u64 ept_pointer; | ||
179 | u64 guest_physical_address; | ||
180 | u64 vmcs_link_pointer; | ||
181 | u64 guest_ia32_debugctl; | ||
182 | u64 guest_ia32_pat; | ||
183 | u64 guest_ia32_efer; | ||
184 | u64 guest_ia32_perf_global_ctrl; | ||
185 | u64 guest_pdptr0; | ||
186 | u64 guest_pdptr1; | ||
187 | u64 guest_pdptr2; | ||
188 | u64 guest_pdptr3; | ||
189 | u64 host_ia32_pat; | ||
190 | u64 host_ia32_efer; | ||
191 | u64 host_ia32_perf_global_ctrl; | ||
192 | u64 padding64[8]; /* room for future expansion */ | ||
193 | /* | ||
194 | * To allow migration of L1 (complete with its L2 guests) between | ||
195 | * machines of different natural widths (32 or 64 bit), we cannot have | ||
196 | * unsigned long fields with no explict size. We use u64 (aliased | ||
197 | * natural_width) instead. Luckily, x86 is little-endian. | ||
198 | */ | ||
199 | natural_width cr0_guest_host_mask; | ||
200 | natural_width cr4_guest_host_mask; | ||
201 | natural_width cr0_read_shadow; | ||
202 | natural_width cr4_read_shadow; | ||
203 | natural_width cr3_target_value0; | ||
204 | natural_width cr3_target_value1; | ||
205 | natural_width cr3_target_value2; | ||
206 | natural_width cr3_target_value3; | ||
207 | natural_width exit_qualification; | ||
208 | natural_width guest_linear_address; | ||
209 | natural_width guest_cr0; | ||
210 | natural_width guest_cr3; | ||
211 | natural_width guest_cr4; | ||
212 | natural_width guest_es_base; | ||
213 | natural_width guest_cs_base; | ||
214 | natural_width guest_ss_base; | ||
215 | natural_width guest_ds_base; | ||
216 | natural_width guest_fs_base; | ||
217 | natural_width guest_gs_base; | ||
218 | natural_width guest_ldtr_base; | ||
219 | natural_width guest_tr_base; | ||
220 | natural_width guest_gdtr_base; | ||
221 | natural_width guest_idtr_base; | ||
222 | natural_width guest_dr7; | ||
223 | natural_width guest_rsp; | ||
224 | natural_width guest_rip; | ||
225 | natural_width guest_rflags; | ||
226 | natural_width guest_pending_dbg_exceptions; | ||
227 | natural_width guest_sysenter_esp; | ||
228 | natural_width guest_sysenter_eip; | ||
229 | natural_width host_cr0; | ||
230 | natural_width host_cr3; | ||
231 | natural_width host_cr4; | ||
232 | natural_width host_fs_base; | ||
233 | natural_width host_gs_base; | ||
234 | natural_width host_tr_base; | ||
235 | natural_width host_gdtr_base; | ||
236 | natural_width host_idtr_base; | ||
237 | natural_width host_ia32_sysenter_esp; | ||
238 | natural_width host_ia32_sysenter_eip; | ||
239 | natural_width host_rsp; | ||
240 | natural_width host_rip; | ||
241 | natural_width paddingl[8]; /* room for future expansion */ | ||
242 | u32 pin_based_vm_exec_control; | ||
243 | u32 cpu_based_vm_exec_control; | ||
244 | u32 exception_bitmap; | ||
245 | u32 page_fault_error_code_mask; | ||
246 | u32 page_fault_error_code_match; | ||
247 | u32 cr3_target_count; | ||
248 | u32 vm_exit_controls; | ||
249 | u32 vm_exit_msr_store_count; | ||
250 | u32 vm_exit_msr_load_count; | ||
251 | u32 vm_entry_controls; | ||
252 | u32 vm_entry_msr_load_count; | ||
253 | u32 vm_entry_intr_info_field; | ||
254 | u32 vm_entry_exception_error_code; | ||
255 | u32 vm_entry_instruction_len; | ||
256 | u32 tpr_threshold; | ||
257 | u32 secondary_vm_exec_control; | ||
258 | u32 vm_instruction_error; | ||
259 | u32 vm_exit_reason; | ||
260 | u32 vm_exit_intr_info; | ||
261 | u32 vm_exit_intr_error_code; | ||
262 | u32 idt_vectoring_info_field; | ||
263 | u32 idt_vectoring_error_code; | ||
264 | u32 vm_exit_instruction_len; | ||
265 | u32 vmx_instruction_info; | ||
266 | u32 guest_es_limit; | ||
267 | u32 guest_cs_limit; | ||
268 | u32 guest_ss_limit; | ||
269 | u32 guest_ds_limit; | ||
270 | u32 guest_fs_limit; | ||
271 | u32 guest_gs_limit; | ||
272 | u32 guest_ldtr_limit; | ||
273 | u32 guest_tr_limit; | ||
274 | u32 guest_gdtr_limit; | ||
275 | u32 guest_idtr_limit; | ||
276 | u32 guest_es_ar_bytes; | ||
277 | u32 guest_cs_ar_bytes; | ||
278 | u32 guest_ss_ar_bytes; | ||
279 | u32 guest_ds_ar_bytes; | ||
280 | u32 guest_fs_ar_bytes; | ||
281 | u32 guest_gs_ar_bytes; | ||
282 | u32 guest_ldtr_ar_bytes; | ||
283 | u32 guest_tr_ar_bytes; | ||
284 | u32 guest_interruptibility_info; | ||
285 | u32 guest_activity_state; | ||
286 | u32 guest_sysenter_cs; | ||
287 | u32 host_ia32_sysenter_cs; | ||
288 | u32 padding32[8]; /* room for future expansion */ | ||
289 | u16 virtual_processor_id; | ||
290 | u16 guest_es_selector; | ||
291 | u16 guest_cs_selector; | ||
292 | u16 guest_ss_selector; | ||
293 | u16 guest_ds_selector; | ||
294 | u16 guest_fs_selector; | ||
295 | u16 guest_gs_selector; | ||
296 | u16 guest_ldtr_selector; | ||
297 | u16 guest_tr_selector; | ||
298 | u16 host_es_selector; | ||
299 | u16 host_cs_selector; | ||
300 | u16 host_ss_selector; | ||
301 | u16 host_ds_selector; | ||
302 | u16 host_fs_selector; | ||
303 | u16 host_gs_selector; | ||
304 | u16 host_tr_selector; | ||
305 | }; | ||
306 | |||
307 | /* | ||
308 | * VMCS12_REVISION is an arbitrary id that should be changed if the content or | ||
309 | * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and | ||
310 | * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. | ||
311 | */ | ||
312 | #define VMCS12_REVISION 0x11e57ed0 | ||
313 | |||
314 | /* | ||
315 | * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region | ||
316 | * and any VMCS region. Although only sizeof(struct vmcs12) are used by the | ||
317 | * current implementation, 4K are reserved to avoid future complications. | ||
318 | */ | ||
319 | #define VMCS12_SIZE 0x1000 | ||
320 | |||
321 | /* Used to remember the last vmcs02 used for some recently used vmcs12s */ | ||
322 | struct vmcs02_list { | ||
323 | struct list_head list; | ||
324 | gpa_t vmptr; | ||
325 | struct loaded_vmcs vmcs02; | ||
326 | }; | ||
327 | |||
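Each vmcs02_list node ties a guest-physical vmcs12 address (vmptr) to the hardware VMCS that was built from it, so a vmcs12 that is VMPTRLDed again can reuse its old vmcs02. A minimal lookup sketch over such a pool; the helper name and the recycling policy are illustrative, not part of this hunk:

static struct loaded_vmcs *vmcs02_pool_lookup(struct list_head *pool,
					      gpa_t vmptr)
{
	struct vmcs02_list *item;

	/* Walk the recently used vmcs02s; reuse one built for this vmcs12. */
	list_for_each_entry(item, pool, list)
		if (item->vmptr == vmptr)
			return &item->vmcs02;
	return NULL;	/* caller allocates or recycles, bounded by VMCS02_POOL_SIZE */
}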
328 | /* | ||
329 | * The nested_vmx structure is part of vcpu_vmx, and holds information we need | ||
330 | * for correct emulation of VMX (i.e., nested VMX) on this vcpu. | ||
331 | */ | ||
332 | struct nested_vmx { | ||
333 | /* Has the level1 guest done vmxon? */ | ||
334 | bool vmxon; | ||
335 | |||
336 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ | ||
337 | gpa_t current_vmptr; | ||
338 | /* The host-usable pointer to the above */ | ||
339 | struct page *current_vmcs12_page; | ||
340 | struct vmcs12 *current_vmcs12; | ||
341 | |||
342 | /* vmcs02_list cache of VMCSs recently used to run L2 guests */ | ||
343 | struct list_head vmcs02_pool; | ||
344 | int vmcs02_num; | ||
345 | u64 vmcs01_tsc_offset; | ||
346 | /* L2 must run next, and mustn't decide to exit to L1. */ | ||
347 | bool nested_run_pending; | ||
348 | /* | ||
349 | * Guest pages referred to in vmcs02 with host-physical pointers, so | ||
350 | * we must keep them pinned while L2 runs. | ||
351 | */ | ||
352 | struct page *apic_access_page; | ||
353 | }; | ||
354 | |||
125 | struct vcpu_vmx { | 355 | struct vcpu_vmx { |
126 | struct kvm_vcpu vcpu; | 356 | struct kvm_vcpu vcpu; |
127 | struct list_head local_vcpus_link; | ||
128 | unsigned long host_rsp; | 357 | unsigned long host_rsp; |
129 | int launched; | ||
130 | u8 fail; | 358 | u8 fail; |
131 | u8 cpl; | 359 | u8 cpl; |
132 | bool nmi_known_unmasked; | 360 | bool nmi_known_unmasked; |
@@ -140,7 +368,14 @@ struct vcpu_vmx { | |||
140 | u64 msr_host_kernel_gs_base; | 368 | u64 msr_host_kernel_gs_base; |
141 | u64 msr_guest_kernel_gs_base; | 369 | u64 msr_guest_kernel_gs_base; |
142 | #endif | 370 | #endif |
143 | struct vmcs *vmcs; | 371 | /* |
372 | * loaded_vmcs points to the VMCS currently used in this vcpu. For a | ||
373 | * non-nested (L1) guest, it always points to vmcs01. For a nested | ||
374 | * guest (L2), it points to a different VMCS. | ||
375 | */ | ||
376 | struct loaded_vmcs vmcs01; | ||
377 | struct loaded_vmcs *loaded_vmcs; | ||
378 | bool __launched; /* temporary, used in vmx_vcpu_run */ | ||
144 | struct msr_autoload { | 379 | struct msr_autoload { |
145 | unsigned nr; | 380 | unsigned nr; |
146 | struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; | 381 | struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; |
@@ -176,6 +411,9 @@ struct vcpu_vmx { | |||
176 | u32 exit_reason; | 411 | u32 exit_reason; |
177 | 412 | ||
178 | bool rdtscp_enabled; | 413 | bool rdtscp_enabled; |
414 | |||
415 | /* Support for a guest hypervisor (nested VMX) */ | ||
416 | struct nested_vmx nested; | ||
179 | }; | 417 | }; |
180 | 418 | ||
181 | enum segment_cache_field { | 419 | enum segment_cache_field { |
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | |||
192 | return container_of(vcpu, struct vcpu_vmx, vcpu); | 430 | return container_of(vcpu, struct vcpu_vmx, vcpu); |
193 | } | 431 | } |
194 | 432 | ||
433 | #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) | ||
434 | #define FIELD(number, name) [number] = VMCS12_OFFSET(name) | ||
435 | #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ | ||
436 | [number##_HIGH] = VMCS12_OFFSET(name)+4 | ||
437 | |||
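FIELD64 adds two table entries per 64-bit field, because VMX defines a separate _HIGH encoding for the upper 32 bits of every 64-bit field; since vmcs12 stores everything little-endian (see the struct vmcs12 comment above), the high half simply lives 4 bytes into the same member. Illustrative expansion, using TSC_OFFSET as the example:

	/* FIELD64(TSC_OFFSET, tsc_offset) expands to roughly: */
	[TSC_OFFSET]      = offsetof(struct vmcs12, tsc_offset),
	[TSC_OFFSET_HIGH] = offsetof(struct vmcs12, tsc_offset) + 4,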
438 | static unsigned short vmcs_field_to_offset_table[] = { | ||
439 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | ||
440 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | ||
441 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | ||
442 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), | ||
443 | FIELD(GUEST_DS_SELECTOR, guest_ds_selector), | ||
444 | FIELD(GUEST_FS_SELECTOR, guest_fs_selector), | ||
445 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), | ||
446 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), | ||
447 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), | ||
448 | FIELD(HOST_ES_SELECTOR, host_es_selector), | ||
449 | FIELD(HOST_CS_SELECTOR, host_cs_selector), | ||
450 | FIELD(HOST_SS_SELECTOR, host_ss_selector), | ||
451 | FIELD(HOST_DS_SELECTOR, host_ds_selector), | ||
452 | FIELD(HOST_FS_SELECTOR, host_fs_selector), | ||
453 | FIELD(HOST_GS_SELECTOR, host_gs_selector), | ||
454 | FIELD(HOST_TR_SELECTOR, host_tr_selector), | ||
455 | FIELD64(IO_BITMAP_A, io_bitmap_a), | ||
456 | FIELD64(IO_BITMAP_B, io_bitmap_b), | ||
457 | FIELD64(MSR_BITMAP, msr_bitmap), | ||
458 | FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), | ||
459 | FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), | ||
460 | FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), | ||
461 | FIELD64(TSC_OFFSET, tsc_offset), | ||
462 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), | ||
463 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), | ||
464 | FIELD64(EPT_POINTER, ept_pointer), | ||
465 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), | ||
466 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), | ||
467 | FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), | ||
468 | FIELD64(GUEST_IA32_PAT, guest_ia32_pat), | ||
469 | FIELD64(GUEST_IA32_EFER, guest_ia32_efer), | ||
470 | FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), | ||
471 | FIELD64(GUEST_PDPTR0, guest_pdptr0), | ||
472 | FIELD64(GUEST_PDPTR1, guest_pdptr1), | ||
473 | FIELD64(GUEST_PDPTR2, guest_pdptr2), | ||
474 | FIELD64(GUEST_PDPTR3, guest_pdptr3), | ||
475 | FIELD64(HOST_IA32_PAT, host_ia32_pat), | ||
476 | FIELD64(HOST_IA32_EFER, host_ia32_efer), | ||
477 | FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), | ||
478 | FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), | ||
479 | FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), | ||
480 | FIELD(EXCEPTION_BITMAP, exception_bitmap), | ||
481 | FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), | ||
482 | FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), | ||
483 | FIELD(CR3_TARGET_COUNT, cr3_target_count), | ||
484 | FIELD(VM_EXIT_CONTROLS, vm_exit_controls), | ||
485 | FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), | ||
486 | FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), | ||
487 | FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), | ||
488 | FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), | ||
489 | FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), | ||
490 | FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), | ||
491 | FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), | ||
492 | FIELD(TPR_THRESHOLD, tpr_threshold), | ||
493 | FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), | ||
494 | FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), | ||
495 | FIELD(VM_EXIT_REASON, vm_exit_reason), | ||
496 | FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), | ||
497 | FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), | ||
498 | FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), | ||
499 | FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), | ||
500 | FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), | ||
501 | FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), | ||
502 | FIELD(GUEST_ES_LIMIT, guest_es_limit), | ||
503 | FIELD(GUEST_CS_LIMIT, guest_cs_limit), | ||
504 | FIELD(GUEST_SS_LIMIT, guest_ss_limit), | ||
505 | FIELD(GUEST_DS_LIMIT, guest_ds_limit), | ||
506 | FIELD(GUEST_FS_LIMIT, guest_fs_limit), | ||
507 | FIELD(GUEST_GS_LIMIT, guest_gs_limit), | ||
508 | FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), | ||
509 | FIELD(GUEST_TR_LIMIT, guest_tr_limit), | ||
510 | FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), | ||
511 | FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), | ||
512 | FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), | ||
513 | FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), | ||
514 | FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), | ||
515 | FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), | ||
516 | FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), | ||
517 | FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), | ||
518 | FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), | ||
519 | FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), | ||
520 | FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), | ||
521 | FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), | ||
522 | FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), | ||
523 | FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), | ||
524 | FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), | ||
525 | FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), | ||
526 | FIELD(CR0_READ_SHADOW, cr0_read_shadow), | ||
527 | FIELD(CR4_READ_SHADOW, cr4_read_shadow), | ||
528 | FIELD(CR3_TARGET_VALUE0, cr3_target_value0), | ||
529 | FIELD(CR3_TARGET_VALUE1, cr3_target_value1), | ||
530 | FIELD(CR3_TARGET_VALUE2, cr3_target_value2), | ||
531 | FIELD(CR3_TARGET_VALUE3, cr3_target_value3), | ||
532 | FIELD(EXIT_QUALIFICATION, exit_qualification), | ||
533 | FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), | ||
534 | FIELD(GUEST_CR0, guest_cr0), | ||
535 | FIELD(GUEST_CR3, guest_cr3), | ||
536 | FIELD(GUEST_CR4, guest_cr4), | ||
537 | FIELD(GUEST_ES_BASE, guest_es_base), | ||
538 | FIELD(GUEST_CS_BASE, guest_cs_base), | ||
539 | FIELD(GUEST_SS_BASE, guest_ss_base), | ||
540 | FIELD(GUEST_DS_BASE, guest_ds_base), | ||
541 | FIELD(GUEST_FS_BASE, guest_fs_base), | ||
542 | FIELD(GUEST_GS_BASE, guest_gs_base), | ||
543 | FIELD(GUEST_LDTR_BASE, guest_ldtr_base), | ||
544 | FIELD(GUEST_TR_BASE, guest_tr_base), | ||
545 | FIELD(GUEST_GDTR_BASE, guest_gdtr_base), | ||
546 | FIELD(GUEST_IDTR_BASE, guest_idtr_base), | ||
547 | FIELD(GUEST_DR7, guest_dr7), | ||
548 | FIELD(GUEST_RSP, guest_rsp), | ||
549 | FIELD(GUEST_RIP, guest_rip), | ||
550 | FIELD(GUEST_RFLAGS, guest_rflags), | ||
551 | FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), | ||
552 | FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), | ||
553 | FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), | ||
554 | FIELD(HOST_CR0, host_cr0), | ||
555 | FIELD(HOST_CR3, host_cr3), | ||
556 | FIELD(HOST_CR4, host_cr4), | ||
557 | FIELD(HOST_FS_BASE, host_fs_base), | ||
558 | FIELD(HOST_GS_BASE, host_gs_base), | ||
559 | FIELD(HOST_TR_BASE, host_tr_base), | ||
560 | FIELD(HOST_GDTR_BASE, host_gdtr_base), | ||
561 | FIELD(HOST_IDTR_BASE, host_idtr_base), | ||
562 | FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), | ||
563 | FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), | ||
564 | FIELD(HOST_RSP, host_rsp), | ||
565 | FIELD(HOST_RIP, host_rip), | ||
566 | }; | ||
567 | static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); | ||
568 | |||
569 | static inline short vmcs_field_to_offset(unsigned long field) | ||
570 | { | ||
571 | if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) | ||
572 | return -1; | ||
573 | return vmcs_field_to_offset_table[field]; | ||
574 | } | ||
575 | |||
576 | static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) | ||
577 | { | ||
578 | return to_vmx(vcpu)->nested.current_vmcs12; | ||
579 | } | ||
580 | |||
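Together, vmcs_field_to_offset() and get_vmcs12() are what lets L0 emulate VMREAD/VMWRITE on a vmcs12 as plain memory accesses. A simplified sketch of the read side (illustrative only: the real handler, added later in the series, also dispatches on the field's width and raises a VM-instruction error on bad fields):

static u64 vmcs12_read_sketch(struct kvm_vcpu *vcpu, unsigned long field)
{
	short offset = vmcs_field_to_offset(field);
	char *p;

	if (offset < 0)
		return 0;	/* real code signals VMfailValid instead */
	p = (char *)get_vmcs12(vcpu) + offset;
	return *(u64 *)p;	/* field-width handling omitted for brevity */
}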
581 | static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) | ||
582 | { | ||
583 | struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); | ||
584 | if (is_error_page(page)) { | ||
585 | kvm_release_page_clean(page); | ||
586 | return NULL; | ||
587 | } | ||
588 | return page; | ||
589 | } | ||
590 | |||
591 | static void nested_release_page(struct page *page) | ||
592 | { | ||
593 | kvm_release_page_dirty(page); | ||
594 | } | ||
595 | |||
596 | static void nested_release_page_clean(struct page *page) | ||
597 | { | ||
598 | kvm_release_page_clean(page); | ||
599 | } | ||
600 | |||
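These wrappers turn the guest-physical addresses kept in vmcs12 and nested_vmx (current_vmptr, apic_access_addr, ...) into pinned, host-usable pages. Roughly how the VMPTRLD emulation later in the series uses them; a simplified sketch, with error reporting abbreviated:

static int map_current_vmcs12(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct page *page = nested_get_page(vcpu, vmptr);
	struct vmcs12 *new_vmcs12;

	if (page == NULL)
		return -EINVAL;
	new_vmcs12 = kmap(page);
	if (new_vmcs12->revision_id != VMCS12_REVISION) {
		kunmap(page);
		nested_release_page_clean(page);
		return -EINVAL;	/* real code raises a VM-instruction error */
	}
	/* Keep the page mapped and pinned while it is the current vmcs12. */
	vmx->nested.current_vmptr = vmptr;
	vmx->nested.current_vmcs12 = new_vmcs12;
	vmx->nested.current_vmcs12_page = page;
	return 0;
}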
195 | static u64 construct_eptp(unsigned long root_hpa); | 601 | static u64 construct_eptp(unsigned long root_hpa); |
196 | static void kvm_cpu_vmxon(u64 addr); | 602 | static void kvm_cpu_vmxon(u64 addr); |
197 | static void kvm_cpu_vmxoff(void); | 603 | static void kvm_cpu_vmxoff(void); |
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | |||
200 | 606 | ||
201 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 607 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
202 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 608 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
203 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); | 609 | /* |
610 | * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed | ||
611 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. | ||
612 | */ | ||
613 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); | ||
204 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); | 614 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); |
205 | 615 | ||
206 | static unsigned long *vmx_io_bitmap_a; | 616 | static unsigned long *vmx_io_bitmap_a; |
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void) | |||
442 | return flexpriority_enabled; | 852 | return flexpriority_enabled; |
443 | } | 853 | } |
444 | 854 | ||
855 | static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) | ||
856 | { | ||
857 | return vmcs12->cpu_based_vm_exec_control & bit; | ||
858 | } | ||
859 | |||
860 | static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) | ||
861 | { | ||
862 | return (vmcs12->cpu_based_vm_exec_control & | ||
863 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
864 | (vmcs12->secondary_vm_exec_control & bit); | ||
865 | } | ||
866 | |||
867 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, | ||
868 | struct kvm_vcpu *vcpu) | ||
869 | { | ||
870 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; | ||
871 | } | ||
872 | |||
873 | static inline bool is_exception(u32 intr_info) | ||
874 | { | ||
875 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
876 | == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); | ||
877 | } | ||
878 | |||
879 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); | ||
880 | static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, | ||
881 | struct vmcs12 *vmcs12, | ||
882 | u32 reason, unsigned long qualification); | ||
883 | |||
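nested_cpu_has() and nested_cpu_has2() are the standard way the exit-handling code asks "did L1 enable this control for L2?"; the 2-variant additionally checks that L1 activated the secondary controls at all. Two illustrative uses (hypothetical helpers, not taken from this hunk):

static bool l1_wants_hlt_exits(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_HLT_EXITING);
}

static bool l1_virtualizes_apic_accesses(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has2(get_vmcs12(vcpu),
			       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}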
445 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 884 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
446 | { | 885 | { |
447 | int i; | 886 | int i; |
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
501 | vmcs, phys_addr); | 940 | vmcs, phys_addr); |
502 | } | 941 | } |
503 | 942 | ||
943 | static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) | ||
944 | { | ||
945 | vmcs_clear(loaded_vmcs->vmcs); | ||
946 | loaded_vmcs->cpu = -1; | ||
947 | loaded_vmcs->launched = 0; | ||
948 | } | ||
949 | |||
504 | static void vmcs_load(struct vmcs *vmcs) | 950 | static void vmcs_load(struct vmcs *vmcs) |
505 | { | 951 | { |
506 | u64 phys_addr = __pa(vmcs); | 952 | u64 phys_addr = __pa(vmcs); |
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs) | |||
510 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) | 956 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
511 | : "cc", "memory"); | 957 | : "cc", "memory"); |
512 | if (error) | 958 | if (error) |
513 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | 959 | printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", |
514 | vmcs, phys_addr); | 960 | vmcs, phys_addr); |
515 | } | 961 | } |
516 | 962 | ||
517 | static void __vcpu_clear(void *arg) | 963 | static void __loaded_vmcs_clear(void *arg) |
518 | { | 964 | { |
519 | struct vcpu_vmx *vmx = arg; | 965 | struct loaded_vmcs *loaded_vmcs = arg; |
520 | int cpu = raw_smp_processor_id(); | 966 | int cpu = raw_smp_processor_id(); |
521 | 967 | ||
522 | if (vmx->vcpu.cpu == cpu) | 968 | if (loaded_vmcs->cpu != cpu) |
523 | vmcs_clear(vmx->vmcs); | 969 | return; /* vcpu migration can race with cpu offline */ |
524 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 970 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) |
525 | per_cpu(current_vmcs, cpu) = NULL; | 971 | per_cpu(current_vmcs, cpu) = NULL; |
526 | list_del(&vmx->local_vcpus_link); | 972 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); |
527 | vmx->vcpu.cpu = -1; | 973 | loaded_vmcs_init(loaded_vmcs); |
528 | vmx->launched = 0; | ||
529 | } | 974 | } |
530 | 975 | ||
531 | static void vcpu_clear(struct vcpu_vmx *vmx) | 976 | static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) |
532 | { | 977 | { |
533 | if (vmx->vcpu.cpu == -1) | 978 | if (loaded_vmcs->cpu != -1) |
534 | return; | 979 | smp_call_function_single( |
535 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); | 980 | loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); |
536 | } | 981 | } |
537 | 982 | ||
538 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) | 983 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) |
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | |||
585 | } | 1030 | } |
586 | } | 1031 | } |
587 | 1032 | ||
588 | static unsigned long vmcs_readl(unsigned long field) | 1033 | static __always_inline unsigned long vmcs_readl(unsigned long field) |
589 | { | 1034 | { |
590 | unsigned long value = 0; | 1035 | unsigned long value; |
591 | 1036 | ||
592 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) | 1037 | asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") |
593 | : "+a"(value) : "d"(field) : "cc"); | 1038 | : "=a"(value) : "d"(field) : "cc"); |
594 | return value; | 1039 | return value; |
595 | } | 1040 | } |
596 | 1041 | ||
597 | static u16 vmcs_read16(unsigned long field) | 1042 | static __always_inline u16 vmcs_read16(unsigned long field) |
598 | { | 1043 | { |
599 | return vmcs_readl(field); | 1044 | return vmcs_readl(field); |
600 | } | 1045 | } |
601 | 1046 | ||
602 | static u32 vmcs_read32(unsigned long field) | 1047 | static __always_inline u32 vmcs_read32(unsigned long field) |
603 | { | 1048 | { |
604 | return vmcs_readl(field); | 1049 | return vmcs_readl(field); |
605 | } | 1050 | } |
606 | 1051 | ||
607 | static u64 vmcs_read64(unsigned long field) | 1052 | static __always_inline u64 vmcs_read64(unsigned long field) |
608 | { | 1053 | { |
609 | #ifdef CONFIG_X86_64 | 1054 | #ifdef CONFIG_X86_64 |
610 | return vmcs_readl(field); | 1055 | return vmcs_readl(field); |
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
731 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | 1176 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ |
732 | if (vcpu->fpu_active) | 1177 | if (vcpu->fpu_active) |
733 | eb &= ~(1u << NM_VECTOR); | 1178 | eb &= ~(1u << NM_VECTOR); |
1179 | |||
1180 | /* When we are running a nested L2 guest and L1 specified for it a | ||
1181 | * certain exception bitmap, we must trap the same exceptions and pass | ||
1182 | * them to L1. When running L2, we will only handle the exceptions | ||
1183 | * specified above if L1 did not want them. | ||
1184 | */ | ||
1185 | if (is_guest_mode(vcpu)) | ||
1186 | eb |= get_vmcs12(vcpu)->exception_bitmap; | ||
1187 | |||
734 | vmcs_write32(EXCEPTION_BITMAP, eb); | 1188 | vmcs_write32(EXCEPTION_BITMAP, eb); |
735 | } | 1189 | } |
736 | 1190 | ||
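In other words, the exception bitmap programmed into hardware while L2 runs is the union of what L0 itself needs and what L1 asked for in vmcs12; only afterwards does the exit path decide who actually handles a trapped exception. A small sketch of the resulting check (hypothetical helper, restating the logic above):

static bool hardware_traps_vector(struct kvm_vcpu *vcpu, u32 eb_l0, int vector)
{
	u32 eb = eb_l0;	/* exceptions L0 wants for its own reasons */

	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;	/* plus L1's */
	return eb & (1u << vector);
}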
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
971 | 1425 | ||
972 | if (!vmm_exclusive) | 1426 | if (!vmm_exclusive) |
973 | kvm_cpu_vmxon(phys_addr); | 1427 | kvm_cpu_vmxon(phys_addr); |
974 | else if (vcpu->cpu != cpu) | 1428 | else if (vmx->loaded_vmcs->cpu != cpu) |
975 | vcpu_clear(vmx); | 1429 | loaded_vmcs_clear(vmx->loaded_vmcs); |
976 | 1430 | ||
977 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | 1431 | if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { |
978 | per_cpu(current_vmcs, cpu) = vmx->vmcs; | 1432 | per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; |
979 | vmcs_load(vmx->vmcs); | 1433 | vmcs_load(vmx->loaded_vmcs->vmcs); |
980 | } | 1434 | } |
981 | 1435 | ||
982 | if (vcpu->cpu != cpu) { | 1436 | if (vmx->loaded_vmcs->cpu != cpu) { |
983 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); | 1437 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
984 | unsigned long sysenter_esp; | 1438 | unsigned long sysenter_esp; |
985 | 1439 | ||
986 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 1440 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
987 | local_irq_disable(); | 1441 | local_irq_disable(); |
988 | list_add(&vmx->local_vcpus_link, | 1442 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, |
989 | &per_cpu(vcpus_on_cpu, cpu)); | 1443 | &per_cpu(loaded_vmcss_on_cpu, cpu)); |
990 | local_irq_enable(); | 1444 | local_irq_enable(); |
991 | 1445 | ||
992 | /* | 1446 | /* |
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
998 | 1452 | ||
999 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | 1453 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); |
1000 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | 1454 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ |
1455 | vmx->loaded_vmcs->cpu = cpu; | ||
1001 | } | 1456 | } |
1002 | } | 1457 | } |
1003 | 1458 | ||
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | |||
1005 | { | 1460 | { |
1006 | __vmx_load_host_state(to_vmx(vcpu)); | 1461 | __vmx_load_host_state(to_vmx(vcpu)); |
1007 | if (!vmm_exclusive) { | 1462 | if (!vmm_exclusive) { |
1008 | __vcpu_clear(to_vmx(vcpu)); | 1463 | __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); |
1464 | vcpu->cpu = -1; | ||
1009 | kvm_cpu_vmxoff(); | 1465 | kvm_cpu_vmxoff(); |
1010 | } | 1466 | } |
1011 | } | 1467 | } |
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | |||
1023 | vmcs_writel(GUEST_CR0, cr0); | 1479 | vmcs_writel(GUEST_CR0, cr0); |
1024 | update_exception_bitmap(vcpu); | 1480 | update_exception_bitmap(vcpu); |
1025 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | 1481 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; |
1482 | if (is_guest_mode(vcpu)) | ||
1483 | vcpu->arch.cr0_guest_owned_bits &= | ||
1484 | ~get_vmcs12(vcpu)->cr0_guest_host_mask; | ||
1026 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 1485 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
1027 | } | 1486 | } |
1028 | 1487 | ||
1029 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); | 1488 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); |
1030 | 1489 | ||
1490 | /* | ||
1491 | * Return the cr0 value that a nested guest would read. This is a combination | ||
1492 | * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by | ||
1493 | * its hypervisor (cr0_read_shadow). | ||
1494 | */ | ||
1495 | static inline unsigned long nested_read_cr0(struct vmcs12 *fields) | ||
1496 | { | ||
1497 | return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | | ||
1498 | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); | ||
1499 | } | ||
1500 | static inline unsigned long nested_read_cr4(struct vmcs12 *fields) | ||
1501 | { | ||
1502 | return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | | ||
1503 | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); | ||
1504 | } | ||
1505 | |||
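For every bit set in cr0_guest_host_mask, L1 claimed ownership and must be shown the value it last wrote to the read shadow; for the remaining bits it sees the live guest_cr0. A worked example as a snippet, with purely illustrative values:

	struct vmcs12 example = {
		.cr0_guest_host_mask = X86_CR0_TS,		/* L1 owns only TS */
		.guest_cr0           = X86_CR0_PG | X86_CR0_PE,	/* TS clear in hardware */
		.cr0_read_shadow     = X86_CR0_TS,		/* L1 believes TS is set */
	};
	/*
	 * nested_read_cr0(&example) == X86_CR0_PG | X86_CR0_PE | X86_CR0_TS:
	 * the owned TS bit comes from the shadow, everything else from guest_cr0.
	 */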
1031 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | 1506 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) |
1032 | { | 1507 | { |
1508 | /* Note that there is no vcpu->fpu_active = 0 here. The caller must | ||
1509 | * set this *before* calling this function. | ||
1510 | */ | ||
1033 | vmx_decache_cr0_guest_bits(vcpu); | 1511 | vmx_decache_cr0_guest_bits(vcpu); |
1034 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); | 1512 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); |
1035 | update_exception_bitmap(vcpu); | 1513 | update_exception_bitmap(vcpu); |
1036 | vcpu->arch.cr0_guest_owned_bits = 0; | 1514 | vcpu->arch.cr0_guest_owned_bits = 0; |
1037 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 1515 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
1038 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 1516 | if (is_guest_mode(vcpu)) { |
1517 | /* | ||
1518 | * L1's specified read shadow might not contain the TS bit, | ||
1519 | * so now that we turned on shadowing of this bit, we need to | ||
1520 | * set this bit of the shadow. Like in nested_vmx_run we need | ||
1521 | * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet | ||
1522 | * up-to-date here because we just decached cr0.TS (and we'll | ||
1523 | * only update vmcs12->guest_cr0 on nested exit). | ||
1524 | */ | ||
1525 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
1526 | vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | | ||
1527 | (vcpu->arch.cr0 & X86_CR0_TS); | ||
1528 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
1529 | } else | ||
1530 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | ||
1039 | } | 1531 | } |
1040 | 1532 | ||
1041 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | 1533 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) |
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | |||
1119 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | 1611 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); |
1120 | } | 1612 | } |
1121 | 1613 | ||
1614 | /* | ||
1615 | * KVM wants to inject page-faults which it got to the guest. This function | ||
1616 | * checks whether in a nested guest, we need to inject them to L1 or L2. | ||
1617 | * This function assumes it is called with the exit reason in vmcs02 being | ||
1618 | * a #PF exception (this is the only case in which KVM injects a #PF when L2 | ||
1619 | * is running). | ||
1620 | */ | ||
1621 | static int nested_pf_handled(struct kvm_vcpu *vcpu) | ||
1622 | { | ||
1623 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
1624 | |||
1625 | /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ | ||
1626 | if (!(vmcs12->exception_bitmap & PF_VECTOR)) | ||
1627 | return 0; | ||
1628 | |||
1629 | nested_vmx_vmexit(vcpu); | ||
1630 | return 1; | ||
1631 | } | ||
1632 | |||
1122 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 1633 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
1123 | bool has_error_code, u32 error_code, | 1634 | bool has_error_code, u32 error_code, |
1124 | bool reinject) | 1635 | bool reinject) |
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1126 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1637 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1127 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | 1638 | u32 intr_info = nr | INTR_INFO_VALID_MASK; |
1128 | 1639 | ||
1640 | if (nr == PF_VECTOR && is_guest_mode(vcpu) && | ||
1641 | nested_pf_handled(vcpu)) | ||
1642 | return; | ||
1643 | |||
1129 | if (has_error_code) { | 1644 | if (has_error_code) { |
1130 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | 1645 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); |
1131 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | 1646 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) | |||
1248 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | 1763 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
1249 | { | 1764 | { |
1250 | vmcs_write64(TSC_OFFSET, offset); | 1765 | vmcs_write64(TSC_OFFSET, offset); |
1766 | if (is_guest_mode(vcpu)) | ||
1767 | /* | ||
1768 | * We're here if L1 chose not to trap the TSC MSR. Since | ||
1769 | * prepare_vmcs12() does not copy tsc_offset, we need to also | ||
1770 | * set the vmcs12 field here. | ||
1771 | */ | ||
1772 | get_vmcs12(vcpu)->tsc_offset = offset - | ||
1773 | to_vmx(vcpu)->nested.vmcs01_tsc_offset; | ||
1251 | } | 1774 | } |
1252 | 1775 | ||
1253 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | 1776 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) |
1254 | { | 1777 | { |
1255 | u64 offset = vmcs_read64(TSC_OFFSET); | 1778 | u64 offset = vmcs_read64(TSC_OFFSET); |
1256 | vmcs_write64(TSC_OFFSET, offset + adjustment); | 1779 | vmcs_write64(TSC_OFFSET, offset + adjustment); |
1780 | if (is_guest_mode(vcpu)) { | ||
1781 | /* Even when running L2, the adjustment needs to apply to L1 */ | ||
1782 | to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; | ||
1783 | } | ||
1257 | } | 1784 | } |
1258 | 1785 | ||
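While L2 runs, the TSC_OFFSET actually programmed into hardware (vmcs02) is the sum of L0's offset for L1 (vmcs01_tsc_offset) and L1's offset for L2 (vmcs12->tsc_offset); the two hunks above keep that decomposition consistent. A worked example with illustrative numbers:

	/*
	 * Say vmcs01_tsc_offset = 1000 and vmcs12->tsc_offset = 50, so the
	 * hardware offset while L2 runs is 1050.
	 *
	 * - vmx_write_tsc_offset(vcpu, 1070): the new total goes to hardware
	 *   and vmcs12->tsc_offset becomes 1070 - 1000 = 70, so L1 still
	 *   sees a consistent value after the next nested exit.
	 *
	 * - vmx_adjust_tsc_offset(vcpu, +20), from the same starting point:
	 *   hardware becomes 1070 and vmcs01_tsc_offset becomes 1020,
	 *   because the adjustment is L0's, not L1's (1020 + 50 == 1070).
	 */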
1259 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | 1786 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | |||
1261 | return target_tsc - native_read_tsc(); | 1788 | return target_tsc - native_read_tsc(); |
1262 | } | 1789 | } |
1263 | 1790 | ||
1791 | static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) | ||
1792 | { | ||
1793 | struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
1794 | return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); | ||
1795 | } | ||
1796 | |||
1797 | /* | ||
1798 | * nested_vmx_allowed() checks whether a guest should be allowed to use VMX | ||
1799 | * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for | ||
1800 | * all guests if the "nested" module option is off, and can also be disabled | ||
1801 | * for a single guest by disabling its VMX cpuid bit. | ||
1802 | */ | ||
1803 | static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) | ||
1804 | { | ||
1805 | return nested && guest_cpuid_has_vmx(vcpu); | ||
1806 | } | ||
1807 | |||
1808 | /* | ||
1809 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | ||
1810 | * returned for the various VMX controls MSRs when nested VMX is enabled. | ||
1811 | * The same values should also be used to verify that vmcs12 control fields are | ||
1812 | * valid during nested entry from L1 to L2. | ||
1813 | * Each of these control msrs has a low and high 32-bit half: A low bit is on | ||
1814 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | ||
1815 | * bit in the high half is on if the corresponding bit in the control field | ||
1816 | * may be on. See also vmx_control_verify(). | ||
1817 | * TODO: allow these variables to be modified (downgraded) by module options | ||
1818 | * or other means. | ||
1819 | */ | ||
1820 | static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; | ||
1821 | static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; | ||
1822 | static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; | ||
1823 | static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; | ||
1824 | static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; | ||
1825 | static __init void nested_vmx_setup_ctls_msrs(void) | ||
1826 | { | ||
1827 | /* | ||
1828 | * Note that as a general rule, the high half of the MSRs (bits in | ||
1829 | * the control fields which may be 1) should be initialized by the | ||
1830 | * intersection of the underlying hardware's MSR (i.e., features which | ||
1831 | * can be supported) and the list of features we want to expose - | ||
1832 | * because they are known to be properly supported in our code. | ||
1833 | * Also, usually, the low half of the MSRs (bits which must be 1) can | ||
1834 | * be set to 0, meaning that L1 may turn off any of these bits. The | ||
1835 | * reason is that if one of these bits is necessary, it will appear | ||
1836 | * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control | ||
1837 | * fields of vmcs01 and vmcs02, will turn these bits off - and | ||
1838 | * nested_vmx_exit_handled() will not pass related exits to L1. | ||
1839 | * These rules have exceptions below. | ||
1840 | */ | ||
1841 | |||
1842 | /* pin-based controls */ | ||
1843 | /* | ||
1844 | * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is | ||
1845 | * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. | ||
1846 | */ | ||
1847 | nested_vmx_pinbased_ctls_low = 0x16 ; | ||
1848 | nested_vmx_pinbased_ctls_high = 0x16 | | ||
1849 | PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | | ||
1850 | PIN_BASED_VIRTUAL_NMIS; | ||
1851 | |||
1852 | /* exit controls */ | ||
1853 | nested_vmx_exit_ctls_low = 0; | ||
1854 | /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ | ||
1855 | #ifdef CONFIG_X86_64 | ||
1856 | nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
1857 | #else | ||
1858 | nested_vmx_exit_ctls_high = 0; | ||
1859 | #endif | ||
1860 | |||
1861 | /* entry controls */ | ||
1862 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | ||
1863 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); | ||
1864 | nested_vmx_entry_ctls_low = 0; | ||
1865 | nested_vmx_entry_ctls_high &= | ||
1866 | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; | ||
1867 | |||
1868 | /* cpu-based controls */ | ||
1869 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | ||
1870 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); | ||
1871 | nested_vmx_procbased_ctls_low = 0; | ||
1872 | nested_vmx_procbased_ctls_high &= | ||
1873 | CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | | ||
1874 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | ||
1875 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | ||
1876 | CPU_BASED_CR3_STORE_EXITING | | ||
1877 | #ifdef CONFIG_X86_64 | ||
1878 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | ||
1879 | #endif | ||
1880 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | ||
1881 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | | ||
1882 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
1883 | /* | ||
1884 | * We can allow some features even when not supported by the | ||
1885 | * hardware. For example, L1 can specify an MSR bitmap - and we | ||
1886 | * can use it to avoid exits to L1 - even when L0 runs L2 | ||
1887 | * without MSR bitmaps. | ||
1888 | */ | ||
1889 | nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; | ||
1890 | |||
1891 | /* secondary cpu-based controls */ | ||
1892 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | ||
1893 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); | ||
1894 | nested_vmx_secondary_ctls_low = 0; | ||
1895 | nested_vmx_secondary_ctls_high &= | ||
1896 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
1897 | } | ||
1898 | |||
1899 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | ||
1900 | { | ||
1901 | /* | ||
1902 | * Bits 0 in high must be 0, and bits 1 in low must be 1. | ||
1903 | */ | ||
1904 | return ((control & high) | low) == control; | ||
1905 | } | ||
1906 | |||
1907 | static inline u64 vmx_control_msr(u32 low, u32 high) | ||
1908 | { | ||
1909 | return low | ((u64)high << 32); | ||
1910 | } | ||
1911 | |||
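A quick check of vmx_control_verify() against the pin-based values set up above (low = 0x16, high = 0x16 | EXT_INTR | NMI_EXITING | VIRTUAL_NMIS); the numbers are worked examples only:

	/*
	 * vmx_control_verify(0x16 | PIN_BASED_EXT_INTR_MASK, low, high) -> true
	 *	(every 1-bit is allowed-1, every must-be-1 bit is present)
	 * vmx_control_verify(PIN_BASED_EXT_INTR_MASK, low, high)        -> false
	 *	(bits 1, 2 and 4 must be 1 but are 0)
	 * vmx_control_verify(0x16 | (1u << 31), low, high)              -> false
	 *	(bit 31 is not allowed-1 in 'high')
	 */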
1912 | /* | ||
1913 | * If we allow our guest to use VMX instructions (i.e., nested VMX), we should | ||
1914 | * also let it use VMX-specific MSRs. | ||
1915 | * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a | ||
1916 | * VMX-specific MSR, or 0 when we haven't (and the caller should handle it | ||
1917 | * like all other MSRs). | ||
1918 | */ | ||
1919 | static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
1920 | { | ||
1921 | if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && | ||
1922 | msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { | ||
1923 | /* | ||
1924 | * According to the spec, processors which do not support VMX | ||
1925 | * should throw a #GP(0) when VMX capability MSRs are read. | ||
1926 | */ | ||
1927 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | ||
1928 | return 1; | ||
1929 | } | ||
1930 | |||
1931 | switch (msr_index) { | ||
1932 | case MSR_IA32_FEATURE_CONTROL: | ||
1933 | *pdata = 0; | ||
1934 | break; | ||
1935 | case MSR_IA32_VMX_BASIC: | ||
1936 | /* | ||
1937 | * This MSR reports some information about VMX support. We | ||
1938 | * should return information about the VMX we emulate for the | ||
1939 | * guest, and the VMCS structure we give it - not about the | ||
1940 | * VMX support of the underlying hardware. | ||
1941 | */ | ||
1942 | *pdata = VMCS12_REVISION | | ||
1943 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | ||
1944 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | ||
1945 | break; | ||
1946 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
1947 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
1948 | *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, | ||
1949 | nested_vmx_pinbased_ctls_high); | ||
1950 | break; | ||
1951 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
1952 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
1953 | *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, | ||
1954 | nested_vmx_procbased_ctls_high); | ||
1955 | break; | ||
1956 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
1957 | case MSR_IA32_VMX_EXIT_CTLS: | ||
1958 | *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, | ||
1959 | nested_vmx_exit_ctls_high); | ||
1960 | break; | ||
1961 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
1962 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
1963 | *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, | ||
1964 | nested_vmx_entry_ctls_high); | ||
1965 | break; | ||
1966 | case MSR_IA32_VMX_MISC: | ||
1967 | *pdata = 0; | ||
1968 | break; | ||
1969 | /* | ||
1970 | * These MSRs specify bits which the guest must keep fixed (on or off) | ||
1971 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | ||
1972 | * We picked the standard core2 setting. | ||
1973 | */ | ||
1974 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | ||
1975 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | ||
1976 | case MSR_IA32_VMX_CR0_FIXED0: | ||
1977 | *pdata = VMXON_CR0_ALWAYSON; | ||
1978 | break; | ||
1979 | case MSR_IA32_VMX_CR0_FIXED1: | ||
1980 | *pdata = -1ULL; | ||
1981 | break; | ||
1982 | case MSR_IA32_VMX_CR4_FIXED0: | ||
1983 | *pdata = VMXON_CR4_ALWAYSON; | ||
1984 | break; | ||
1985 | case MSR_IA32_VMX_CR4_FIXED1: | ||
1986 | *pdata = -1ULL; | ||
1987 | break; | ||
1988 | case MSR_IA32_VMX_VMCS_ENUM: | ||
1989 | *pdata = 0x1f; | ||
1990 | break; | ||
1991 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
1992 | *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, | ||
1993 | nested_vmx_secondary_ctls_high); | ||
1994 | break; | ||
1995 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
1996 | /* Currently, no nested ept or nested vpid */ | ||
1997 | *pdata = 0; | ||
1998 | break; | ||
1999 | default: | ||
2000 | return 0; | ||
2001 | } | ||
2002 | |||
2003 | return 1; | ||
2004 | } | ||
2005 | |||
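Seen from L1, each of these MSRs is a 64-bit value whose low half lists must-be-1 bits and whose high half lists may-be-1 bits, exactly as vmx_control_msr() packs them. A guest-side sketch of the usual adjustment a nested hypervisor performs before writing a control field, which is precisely what vmx_control_verify() later accepts:

static u32 adjust_controls_for_l1(u64 ctls_msr, u32 wanted)
{
	u32 must_be_1 = (u32)ctls_msr;		/* low half  */
	u32 may_be_1  = (u32)(ctls_msr >> 32);	/* high half */

	return (wanted & may_be_1) | must_be_1;
}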
2006 | static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
2007 | { | ||
2008 | if (!nested_vmx_allowed(vcpu)) | ||
2009 | return 0; | ||
2010 | |||
2011 | if (msr_index == MSR_IA32_FEATURE_CONTROL) | ||
2012 | /* TODO: the right thing. */ | ||
2013 | return 1; | ||
2014 | /* | ||
2015 | * No need to treat VMX capability MSRs specially: If we don't handle | ||
2016 | * them, handle_wrmsr will #GP(0), which is correct (they are readonly) | ||
2017 | */ | ||
2018 | return 0; | ||
2019 | } | ||
2020 | |||
1264 | /* | 2021 | /* |
1265 | * Reads an msr value (of 'msr_index') into 'pdata'. | 2022 | * Reads an msr value (of 'msr_index') into 'pdata'. |
1266 | * Returns 0 on success, non-0 otherwise. | 2023 | * Returns 0 on success, non-0 otherwise. |
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
1309 | /* Otherwise falls through */ | 2066 | /* Otherwise falls through */ |
1310 | default: | 2067 | default: |
1311 | vmx_load_host_state(to_vmx(vcpu)); | 2068 | vmx_load_host_state(to_vmx(vcpu)); |
2069 | if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) | ||
2070 | return 0; | ||
1312 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | 2071 | msr = find_msr_entry(to_vmx(vcpu), msr_index); |
1313 | if (msr) { | 2072 | if (msr) { |
1314 | vmx_load_host_state(to_vmx(vcpu)); | 2073 | vmx_load_host_state(to_vmx(vcpu)); |
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1380 | return 1; | 2139 | return 1; |
1381 | /* Otherwise falls through */ | 2140 | /* Otherwise falls through */ |
1382 | default: | 2141 | default: |
2142 | if (vmx_set_vmx_msr(vcpu, msr_index, data)) | ||
2143 | break; | ||
1383 | msr = find_msr_entry(vmx, msr_index); | 2144 | msr = find_msr_entry(vmx, msr_index); |
1384 | if (msr) { | 2145 | if (msr) { |
1385 | vmx_load_host_state(vmx); | 2146 | vmx_load_host_state(vmx); |
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage) | |||
1469 | if (read_cr4() & X86_CR4_VMXE) | 2230 | if (read_cr4() & X86_CR4_VMXE) |
1470 | return -EBUSY; | 2231 | return -EBUSY; |
1471 | 2232 | ||
1472 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); | 2233 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); |
1473 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 2234 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
1474 | 2235 | ||
1475 | test_bits = FEATURE_CONTROL_LOCKED; | 2236 | test_bits = FEATURE_CONTROL_LOCKED; |
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage) | |||
1493 | return 0; | 2254 | return 0; |
1494 | } | 2255 | } |
1495 | 2256 | ||
1496 | static void vmclear_local_vcpus(void) | 2257 | static void vmclear_local_loaded_vmcss(void) |
1497 | { | 2258 | { |
1498 | int cpu = raw_smp_processor_id(); | 2259 | int cpu = raw_smp_processor_id(); |
1499 | struct vcpu_vmx *vmx, *n; | 2260 | struct loaded_vmcs *v, *n; |
1500 | 2261 | ||
1501 | list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), | 2262 | list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), |
1502 | local_vcpus_link) | 2263 | loaded_vmcss_on_cpu_link) |
1503 | __vcpu_clear(vmx); | 2264 | __loaded_vmcs_clear(v); |
1504 | } | 2265 | } |
1505 | 2266 | ||
1506 | 2267 | ||
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void) | |||
1515 | static void hardware_disable(void *garbage) | 2276 | static void hardware_disable(void *garbage) |
1516 | { | 2277 | { |
1517 | if (vmm_exclusive) { | 2278 | if (vmm_exclusive) { |
1518 | vmclear_local_vcpus(); | 2279 | vmclear_local_loaded_vmcss(); |
1519 | kvm_cpu_vmxoff(); | 2280 | kvm_cpu_vmxoff(); |
1520 | } | 2281 | } |
1521 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | 2282 | write_cr4(read_cr4() & ~X86_CR4_VMXE); |
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs) | |||
1696 | free_pages((unsigned long)vmcs, vmcs_config.order); | 2457 | free_pages((unsigned long)vmcs, vmcs_config.order); |
1697 | } | 2458 | } |
1698 | 2459 | ||
2460 | /* | ||
2461 | * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded | ||
2462 | */ | ||
2463 | static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
2464 | { | ||
2465 | if (!loaded_vmcs->vmcs) | ||
2466 | return; | ||
2467 | loaded_vmcs_clear(loaded_vmcs); | ||
2468 | free_vmcs(loaded_vmcs->vmcs); | ||
2469 | loaded_vmcs->vmcs = NULL; | ||
2470 | } | ||
2471 | |||
1699 | static void free_kvm_area(void) | 2472 | static void free_kvm_area(void) |
1700 | { | 2473 | { |
1701 | int cpu; | 2474 | int cpu; |
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void) | |||
1756 | if (!cpu_has_vmx_ple()) | 2529 | if (!cpu_has_vmx_ple()) |
1757 | ple_gap = 0; | 2530 | ple_gap = 0; |
1758 | 2531 | ||
2532 | if (nested) | ||
2533 | nested_vmx_setup_ctls_msrs(); | ||
2534 | |||
1759 | return alloc_kvm_area(); | 2535 | return alloc_kvm_area(); |
1760 | } | 2536 | } |
1761 | 2537 | ||
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | |||
2041 | (unsigned long *)&vcpu->arch.regs_dirty); | 2817 | (unsigned long *)&vcpu->arch.regs_dirty); |
2042 | } | 2818 | } |
2043 | 2819 | ||
2044 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 2820 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
2045 | 2821 | ||
2046 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | 2822 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, |
2047 | unsigned long cr0, | 2823 | unsigned long cr0, |
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
2139 | vmcs_writel(GUEST_CR3, guest_cr3); | 2915 | vmcs_writel(GUEST_CR3, guest_cr3); |
2140 | } | 2916 | } |
2141 | 2917 | ||
2142 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 2918 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
2143 | { | 2919 | { |
2144 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? | 2920 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? |
2145 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); | 2921 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); |
2146 | 2922 | ||
2923 | if (cr4 & X86_CR4_VMXE) { | ||
2924 | /* | ||
2925 | * To use VMXON (and later other VMX instructions), a guest | ||
2926 | * must first be able to turn on cr4.VMXE (see handle_vmon()). | ||
2927 | * So basically the check on whether to allow nested VMX | ||
2928 | * is here. | ||
2929 | */ | ||
2930 | if (!nested_vmx_allowed(vcpu)) | ||
2931 | return 1; | ||
2932 | } else if (to_vmx(vcpu)->nested.vmxon) | ||
2933 | return 1; | ||
2934 | |||
2147 | vcpu->arch.cr4 = cr4; | 2935 | vcpu->arch.cr4 = cr4; |
2148 | if (enable_ept) { | 2936 | if (enable_ept) { |
2149 | if (!is_paging(vcpu)) { | 2937 | if (!is_paging(vcpu)) { |
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
2156 | 2944 | ||
2157 | vmcs_writel(CR4_READ_SHADOW, cr4); | 2945 | vmcs_writel(CR4_READ_SHADOW, cr4); |
2158 | vmcs_writel(GUEST_CR4, hw_cr4); | 2946 | vmcs_writel(GUEST_CR4, hw_cr4); |
2947 | return 0; | ||
2159 | } | 2948 | } |
2160 | 2949 | ||
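vmx_set_cr4() now returns int so that setting CR4.VMXE without nested VMX, or clearing it while VMXON is in effect, can be rejected. In the common code the callers treat a nonzero return roughly like this (sketch; the exact call sites are outside this diff):

	if (kvm_x86_ops->set_cr4(vcpu, cr4))
		return 1;	/* kvm_set_cr4() fails and #GP(0) is injected */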
2161 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | 2950 | static void vmx_get_segment(struct kvm_vcpu *vcpu, |
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) | |||
2721 | } | 3510 | } |
2722 | 3511 | ||
2723 | /* | 3512 | /* |
3513 | * Set up the vmcs's constant host-state fields, i.e., host-state fields that | ||
3514 | * will not change in the lifetime of the guest. | ||
3515 | * Note that host-state that does change is set elsewhere. E.g., host-state | ||
3516 | * that is set differently for each CPU is set in vmx_vcpu_load(), not here. | ||
3517 | */ | ||
3518 | static void vmx_set_constant_host_state(void) | ||
3519 | { | ||
3520 | u32 low32, high32; | ||
3521 | unsigned long tmpl; | ||
3522 | struct desc_ptr dt; | ||
3523 | |||
3524 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ | ||
3525 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
3526 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
3527 | |||
3528 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
3529 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3530 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3531 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3532 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
3533 | |||
3534 | native_store_idt(&dt); | ||
3535 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
3536 | |||
3537 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); | ||
3538 | vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ | ||
3539 | |||
3540 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); | ||
3541 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); | ||
3542 | rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); | ||
3543 | vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ | ||
3544 | |||
3545 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
3546 | rdmsr(MSR_IA32_CR_PAT, low32, high32); | ||
3547 | vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); | ||
3548 | } | ||
3549 | } | ||
3550 | |||
3551 | static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) | ||
3552 | { | ||
3553 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | ||
3554 | if (enable_ept) | ||
3555 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
3556 | if (is_guest_mode(&vmx->vcpu)) | ||
3557 | vmx->vcpu.arch.cr4_guest_owned_bits &= | ||
3558 | ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; | ||
3559 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
3560 | } | ||
3561 | |||
3562 | static u32 vmx_exec_control(struct vcpu_vmx *vmx) | ||
3563 | { | ||
3564 | u32 exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
3565 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
3566 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
3567 | #ifdef CONFIG_X86_64 | ||
3568 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
3569 | CPU_BASED_CR8_LOAD_EXITING; | ||
3570 | #endif | ||
3571 | } | ||
3572 | if (!enable_ept) | ||
3573 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
3574 | CPU_BASED_CR3_LOAD_EXITING | | ||
3575 | CPU_BASED_INVLPG_EXITING; | ||
3576 | return exec_control; | ||
3577 | } | ||
3578 | |||
3579 | static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | ||
3580 | { | ||
3581 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | ||
3582 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
3583 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
3584 | if (vmx->vpid == 0) | ||
3585 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
3586 | if (!enable_ept) { | ||
3587 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
3588 | enable_unrestricted_guest = 0; | ||
3589 | } | ||
3590 | if (!enable_unrestricted_guest) | ||
3591 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
3592 | if (!ple_gap) | ||
3593 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
3594 | return exec_control; | ||
3595 | } | ||
3596 | |||
3597 | static void ept_set_mmio_spte_mask(void) | ||
3598 | { | ||
3599 | /* | ||
3600 | * EPT Misconfigurations can be generated if the value of bits 2:0 | ||
3601 | * of an EPT paging-structure entry is 110b (write/execute). | ||
3602 | * Also, magic bits (0xffull << 49) are set to quickly identify MMIO | ||
3603 | * SPTEs. | ||
3604 | */ | ||
3605 | kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); | ||
3606 | } | ||
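
As a quick sanity check on the constant above, here is a minimal stand-alone user-space sketch (illustrative only, not part of the patch) that composes the same value passed to kvm_mmu_set_mmio_spte_mask(): the write/execute-only encoding in bits 2:0 plus the marker bits 56:49.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t wx_only = 0x6ull;        /* bits 2:0 = 110b: write/execute, no read */
	uint64_t marker  = 0xffull << 49; /* "magic" bits 56:49 tagging MMIO sptes   */

	/* same value the code above passes to kvm_mmu_set_mmio_spte_mask() */
	printf("mmio spte mask = %#018llx\n",
	       (unsigned long long)(marker | wx_only));
	return 0;
}

This prints 0x01fe000000000006: a shadow PTE carrying these bits both triggers an EPT misconfiguration and is cheap to recognize as an emulated MMIO access.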
3607 | |||
3608 | /* | ||
2724 | * Sets up the vmcs for emulated real mode. | 3609 | * Sets up the vmcs for emulated real mode. |
2725 | */ | 3610 | */ |
2726 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | 3611 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) |
2727 | { | 3612 | { |
2728 | u32 host_sysenter_cs, msr_low, msr_high; | 3613 | #ifdef CONFIG_X86_64 |
2729 | u32 junk; | ||
2730 | u64 host_pat; | ||
2731 | unsigned long a; | 3614 | unsigned long a; |
2732 | struct desc_ptr dt; | 3615 | #endif |
2733 | int i; | 3616 | int i; |
2734 | unsigned long kvm_vmx_return; | ||
2735 | u32 exec_control; | ||
2736 | 3617 | ||
2737 | /* I/O */ | 3618 | /* I/O */ |
2738 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); | 3619 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); |
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2747 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | 3628 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, |
2748 | vmcs_config.pin_based_exec_ctrl); | 3629 | vmcs_config.pin_based_exec_ctrl); |
2749 | 3630 | ||
2750 | exec_control = vmcs_config.cpu_based_exec_ctrl; | 3631 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); |
2751 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
2752 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
2753 | #ifdef CONFIG_X86_64 | ||
2754 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
2755 | CPU_BASED_CR8_LOAD_EXITING; | ||
2756 | #endif | ||
2757 | } | ||
2758 | if (!enable_ept) | ||
2759 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
2760 | CPU_BASED_CR3_LOAD_EXITING | | ||
2761 | CPU_BASED_INVLPG_EXITING; | ||
2762 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
2763 | 3632 | ||
2764 | if (cpu_has_secondary_exec_ctrls()) { | 3633 | if (cpu_has_secondary_exec_ctrls()) { |
2765 | exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | 3634 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, |
2766 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | 3635 | vmx_secondary_exec_control(vmx)); |
2767 | exec_control &= | ||
2768 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
2769 | if (vmx->vpid == 0) | ||
2770 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
2771 | if (!enable_ept) { | ||
2772 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
2773 | enable_unrestricted_guest = 0; | ||
2774 | } | ||
2775 | if (!enable_unrestricted_guest) | ||
2776 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
2777 | if (!ple_gap) | ||
2778 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
2779 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
2780 | } | 3636 | } |
2781 | 3637 | ||
2782 | if (ple_gap) { | 3638 | if (ple_gap) { |
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2784 | vmcs_write32(PLE_WINDOW, ple_window); | 3640 | vmcs_write32(PLE_WINDOW, ple_window); |
2785 | } | 3641 | } |
2786 | 3642 | ||
2787 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | 3643 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); |
2788 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | 3644 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); |
2789 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 3645 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
2790 | 3646 | ||
2791 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ | ||
2792 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
2793 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
2794 | |||
2795 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
2796 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
2797 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
2798 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ | 3647 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ |
2799 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ | 3648 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ |
2800 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 3649 | vmx_set_constant_host_state(); |
2801 | #ifdef CONFIG_X86_64 | 3650 | #ifdef CONFIG_X86_64 |
2802 | rdmsrl(MSR_FS_BASE, a); | 3651 | rdmsrl(MSR_FS_BASE, a); |
2803 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ | 3652 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ |
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2808 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | 3657 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ |
2809 | #endif | 3658 | #endif |
2810 | 3659 | ||
2811 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
2812 | |||
2813 | native_store_idt(&dt); | ||
2814 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
2815 | |||
2816 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | ||
2817 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | ||
2818 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | 3660 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); |
2819 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | 3661 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); |
2820 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); | 3662 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); |
2821 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | 3663 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); |
2822 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); | 3664 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); |
2823 | 3665 | ||
2824 | rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); | ||
2825 | vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); | ||
2826 | rdmsrl(MSR_IA32_SYSENTER_ESP, a); | ||
2827 | vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ | ||
2828 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | ||
2829 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | ||
2830 | |||
2831 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
2832 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | ||
2833 | host_pat = msr_low | ((u64) msr_high << 32); | ||
2834 | vmcs_write64(HOST_IA32_PAT, host_pat); | ||
2835 | } | ||
2836 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 3666 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
3667 | u32 msr_low, msr_high; | ||
3668 | u64 host_pat; | ||
2837 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | 3669 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); |
2838 | host_pat = msr_low | ((u64) msr_high << 32); | 3670 | host_pat = msr_low | ((u64) msr_high << 32); |
2839 | /* Write the default value follow host pat */ | 3671 | /* Write the default value follow host pat */ |
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2863 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | 3695 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); |
2864 | 3696 | ||
2865 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | 3697 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); |
2866 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | 3698 | set_cr4_guest_host_mask(vmx); |
2867 | if (enable_ept) | ||
2868 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
2869 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
2870 | 3699 | ||
2871 | kvm_write_tsc(&vmx->vcpu, 0); | 3700 | kvm_write_tsc(&vmx->vcpu, 0); |
2872 | 3701 | ||
@@ -2990,9 +3819,25 @@ out: | |||
2990 | return ret; | 3819 | return ret; |
2991 | } | 3820 | } |
2992 | 3821 | ||
3822 | /* | ||
3823 | * In nested virtualization, check if L1 asked to exit on external interrupts. | ||
3824 | * For most existing hypervisors, this will return true. | ||
3825 | */ | ||
3826 | static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | ||
3827 | { | ||
3828 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & | ||
3829 | PIN_BASED_EXT_INTR_MASK; | ||
3830 | } | ||
3831 | |||
2993 | static void enable_irq_window(struct kvm_vcpu *vcpu) | 3832 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
2994 | { | 3833 | { |
2995 | u32 cpu_based_vm_exec_control; | 3834 | u32 cpu_based_vm_exec_control; |
3835 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) | ||
3836 | /* We can get here when nested_run_pending caused | ||
3837 | * vmx_interrupt_allowed() to return false. In this case, do | ||
3838 | * nothing - the interrupt will be injected later. | ||
3839 | */ | ||
3840 | return; | ||
2996 | 3841 | ||
2997 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 3842 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2998 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | 3843 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; |
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
3049 | { | 3894 | { |
3050 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3895 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3051 | 3896 | ||
3897 | if (is_guest_mode(vcpu)) | ||
3898 | return; | ||
3899 | |||
3052 | if (!cpu_has_virtual_nmis()) { | 3900 | if (!cpu_has_virtual_nmis()) { |
3053 | /* | 3901 | /* |
3054 | * Tracking the NMI-blocked state in software is built upon | 3902 | * Tracking the NMI-blocked state in software is built upon |
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
3115 | 3963 | ||
3116 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) | 3964 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) |
3117 | { | 3965 | { |
3966 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { | ||
3967 | struct vmcs12 *vmcs12; | ||
3968 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
3969 | return 0; | ||
3970 | nested_vmx_vmexit(vcpu); | ||
3971 | vmcs12 = get_vmcs12(vcpu); | ||
3972 | vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; | ||
3973 | vmcs12->vm_exit_intr_info = 0; | ||
3974 | /* fall through to normal code, but now in L1, not L2 */ | ||
3975 | } | ||
3976 | |||
3118 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | 3977 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && |
3119 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3978 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
3120 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); | 3979 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); |
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3356 | hypercall[2] = 0xc1; | 4215 | hypercall[2] = 0xc1; |
3357 | } | 4216 | } |
3358 | 4217 | ||
4218 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ | ||
4219 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | ||
4220 | { | ||
4221 | if (to_vmx(vcpu)->nested.vmxon && | ||
4222 | ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) | ||
4223 | return 1; | ||
4224 | |||
4225 | if (is_guest_mode(vcpu)) { | ||
4226 | /* | ||
4227 | * We get here when L2 changed cr0 in a way that did not change | ||
4228 | * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), | ||
4229 | * but did change L0 shadowed bits. This can currently happen | ||
4230 | * with the TS bit: L0 may want to leave TS on (for lazy fpu | ||
4231 | * loading) while pretending to allow the guest to change it. | ||
4232 | */ | ||
4233 | if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | | ||
4234 | (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) | ||
4235 | return 1; | ||
4236 | vmcs_writel(CR0_READ_SHADOW, val); | ||
4237 | return 0; | ||
4238 | } else | ||
4239 | return kvm_set_cr0(vcpu, val); | ||
4240 | } | ||
4241 | |||
4242 | static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) | ||
4243 | { | ||
4244 | if (is_guest_mode(vcpu)) { | ||
4245 | if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | | ||
4246 | (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) | ||
4247 | return 1; | ||
4248 | vmcs_writel(CR4_READ_SHADOW, val); | ||
4249 | return 0; | ||
4250 | } else | ||
4251 | return kvm_set_cr4(vcpu, val); | ||
4252 | } | ||
4253 | |||
4254 | /* called to set cr0 as appropriate for a clts instruction exit. */ | ||
4255 | static void handle_clts(struct kvm_vcpu *vcpu) | ||
4256 | { | ||
4257 | if (is_guest_mode(vcpu)) { | ||
4258 | /* | ||
4259 | * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS | ||
4260 | * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, | ||
4261 | * just pretend it's off (also in arch.cr0 for fpu_activate). | ||
4262 | */ | ||
4263 | vmcs_writel(CR0_READ_SHADOW, | ||
4264 | vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); | ||
4265 | vcpu->arch.cr0 &= ~X86_CR0_TS; | ||
4266 | } else | ||
4267 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | ||
4268 | } | ||
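
The masking step shared by handle_set_cr0() and handle_set_cr4() can be read in isolation: bits that L0 lets the guest own come from the value L2 just wrote, every other bit keeps its current value. A stand-alone sketch of that expression; the helper name and the sample mask/values are made up for illustration:

#include <stdio.h>
#include <stdint.h>

/*
 * Guest-owned bits are taken from the new value, all other bits are
 * preserved from the current register (same expression as above).
 */
static uint64_t combine_cr(uint64_t new_val, uint64_t cur, uint64_t guest_owned)
{
	return (new_val & guest_owned) | (cur & ~guest_owned);
}

int main(void)
{
	uint64_t cur_cr0     = 0x8005003b;         /* sample CR0 with TS (bit 3) set */
	uint64_t guest_owned = 0x00000020;         /* assume TS is *not* guest-owned */
	uint64_t guest_wrote = cur_cr0 & ~0x8ull;  /* L2 tries to clear TS           */

	printf("effective cr0 = %#llx\n",
	       (unsigned long long)combine_cr(guest_wrote, cur_cr0, guest_owned));
	return 0;
}

TS stays set in the value handed to kvm_set_cr0(), while CR0_READ_SHADOW is still updated above so that L2 reads back exactly what it wrote.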
4269 | |||
3359 | static int handle_cr(struct kvm_vcpu *vcpu) | 4270 | static int handle_cr(struct kvm_vcpu *vcpu) |
3360 | { | 4271 | { |
3361 | unsigned long exit_qualification, val; | 4272 | unsigned long exit_qualification, val; |
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3372 | trace_kvm_cr_write(cr, val); | 4283 | trace_kvm_cr_write(cr, val); |
3373 | switch (cr) { | 4284 | switch (cr) { |
3374 | case 0: | 4285 | case 0: |
3375 | err = kvm_set_cr0(vcpu, val); | 4286 | err = handle_set_cr0(vcpu, val); |
3376 | kvm_complete_insn_gp(vcpu, err); | 4287 | kvm_complete_insn_gp(vcpu, err); |
3377 | return 1; | 4288 | return 1; |
3378 | case 3: | 4289 | case 3: |
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3380 | kvm_complete_insn_gp(vcpu, err); | 4291 | kvm_complete_insn_gp(vcpu, err); |
3381 | return 1; | 4292 | return 1; |
3382 | case 4: | 4293 | case 4: |
3383 | err = kvm_set_cr4(vcpu, val); | 4294 | err = handle_set_cr4(vcpu, val); |
3384 | kvm_complete_insn_gp(vcpu, err); | 4295 | kvm_complete_insn_gp(vcpu, err); |
3385 | return 1; | 4296 | return 1; |
3386 | case 8: { | 4297 | case 8: { |
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3398 | }; | 4309 | }; |
3399 | break; | 4310 | break; |
3400 | case 2: /* clts */ | 4311 | case 2: /* clts */ |
3401 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | 4312 | handle_clts(vcpu); |
3402 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); | 4313 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); |
3403 | skip_emulated_instruction(vcpu); | 4314 | skip_emulated_instruction(vcpu); |
3404 | vmx_fpu_activate(vcpu); | 4315 | vmx_fpu_activate(vcpu); |
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) | |||
3574 | return 1; | 4485 | return 1; |
3575 | } | 4486 | } |
3576 | 4487 | ||
3577 | static int handle_vmx_insn(struct kvm_vcpu *vcpu) | ||
3578 | { | ||
3579 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
3580 | return 1; | ||
3581 | } | ||
3582 | |||
3583 | static int handle_invd(struct kvm_vcpu *vcpu) | 4488 | static int handle_invd(struct kvm_vcpu *vcpu) |
3584 | { | 4489 | { |
3585 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | 4490 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, | |||
3777 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | 4682 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) |
3778 | { | 4683 | { |
3779 | u64 sptes[4]; | 4684 | u64 sptes[4]; |
3780 | int nr_sptes, i; | 4685 | int nr_sptes, i, ret; |
3781 | gpa_t gpa; | 4686 | gpa_t gpa; |
3782 | 4687 | ||
3783 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 4688 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3784 | 4689 | ||
4690 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); | ||
4691 | if (likely(ret == 1)) | ||
4692 | return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == | ||
4693 | EMULATE_DONE; | ||
4694 | if (unlikely(!ret)) | ||
4695 | return 1; | ||
4696 | |||
4697 | /* It is the real ept misconfig */ | ||
3785 | printk(KERN_ERR "EPT: Misconfiguration.\n"); | 4698 | printk(KERN_ERR "EPT: Misconfiguration.\n"); |
3786 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); | 4699 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); |
3787 | 4700 | ||
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu) | |||
3866 | } | 4779 | } |
3867 | 4780 | ||
3868 | /* | 4781 | /* |
4782 | * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. | ||
4783 | * We could reuse a single VMCS for all the L2 guests, but we also want the | ||
4784 | * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this | ||
4785 | * allows keeping them loaded on the processor, and in the future will allow | ||
4786 | * optimizations where prepare_vmcs02 doesn't need to set all the fields on | ||
4787 | * every entry if they never change. | ||
4788 | * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE | ||
4789 | * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first. | ||
4790 | * | ||
4791 | * The following functions allocate and free a vmcs02 in this pool. | ||
4792 | */ | ||
4793 | |||
4794 | /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ | ||
4795 | static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) | ||
4796 | { | ||
4797 | struct vmcs02_list *item; | ||
4798 | list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) | ||
4799 | if (item->vmptr == vmx->nested.current_vmptr) { | ||
4800 | list_move(&item->list, &vmx->nested.vmcs02_pool); | ||
4801 | return &item->vmcs02; | ||
4802 | } | ||
4803 | |||
4804 | if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { | ||
4805 | /* Recycle the least recently used VMCS. */ | ||
4806 | item = list_entry(vmx->nested.vmcs02_pool.prev, | ||
4807 | struct vmcs02_list, list); | ||
4808 | item->vmptr = vmx->nested.current_vmptr; | ||
4809 | list_move(&item->list, &vmx->nested.vmcs02_pool); | ||
4810 | return &item->vmcs02; | ||
4811 | } | ||
4812 | |||
4813 | /* Create a new VMCS */ | ||
4814 | item = (struct vmcs02_list *) | ||
4815 | kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); | ||
4816 | if (!item) | ||
4817 | return NULL; | ||
4818 | item->vmcs02.vmcs = alloc_vmcs(); | ||
4819 | if (!item->vmcs02.vmcs) { | ||
4820 | kfree(item); | ||
4821 | return NULL; | ||
4822 | } | ||
4823 | loaded_vmcs_init(&item->vmcs02); | ||
4824 | item->vmptr = vmx->nested.current_vmptr; | ||
4825 | list_add(&(item->list), &(vmx->nested.vmcs02_pool)); | ||
4826 | vmx->nested.vmcs02_num++; | ||
4827 | return &item->vmcs02; | ||
4828 | } | ||
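
The pool keeps most-recently-used entries at the head of the list and recycles the tail entry once max(VMCS02_POOL_SIZE, 1) entries exist. A deliberately simplified stand-alone model of just that replacement policy, tracking only the vmptr keys in a small array instead of the kernel's list_head of struct loaded_vmcs (illustrative only):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define POOL_SIZE 4			/* stands in for max(VMCS02_POOL_SIZE, 1) */

static uint64_t pool[POOL_SIZE];	/* vmptr keys, most recently used first */
static int pool_num;

static void pool_get(uint64_t vmptr)
{
	int i;

	for (i = 0; i < pool_num; i++)
		if (pool[i] == vmptr)
			break;			/* hit: reuse this entry */
	if (i == pool_num) {			/* miss */
		if (pool_num < POOL_SIZE)
			i = pool_num++;		/* take a fresh slot */
		else
			i = POOL_SIZE - 1;	/* recycle the LRU slot */
	}
	/* move-to-front, like the list_move() calls above */
	memmove(&pool[1], &pool[0], i * sizeof(pool[0]));
	pool[0] = vmptr;
}

int main(void)
{
	pool_get(0x1000);
	pool_get(0x2000);
	pool_get(0x1000);
	printf("entries=%d, MRU=%#llx\n", pool_num,
	       (unsigned long long)pool[0]);	/* entries=2, MRU=0x1000 */
	return 0;
}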
4829 | |||
4830 | /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ | ||
4831 | static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) | ||
4832 | { | ||
4833 | struct vmcs02_list *item; | ||
4834 | list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) | ||
4835 | if (item->vmptr == vmptr) { | ||
4836 | free_loaded_vmcs(&item->vmcs02); | ||
4837 | list_del(&item->list); | ||
4838 | kfree(item); | ||
4839 | vmx->nested.vmcs02_num--; | ||
4840 | return; | ||
4841 | } | ||
4842 | } | ||
4843 | |||
4844 | /* | ||
4845 | * Free all VMCSs saved for this vcpu, except the one pointed by | ||
4846 | * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one | ||
4847 | * currently used, if running L2), and vmcs01 when running L2. | ||
4848 | */ | ||
4849 | static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) | ||
4850 | { | ||
4851 | struct vmcs02_list *item, *n; | ||
4852 | list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { | ||
4853 | if (vmx->loaded_vmcs != &item->vmcs02) | ||
4854 | free_loaded_vmcs(&item->vmcs02); | ||
4855 | list_del(&item->list); | ||
4856 | kfree(item); | ||
4857 | } | ||
4858 | vmx->nested.vmcs02_num = 0; | ||
4859 | |||
4860 | if (vmx->loaded_vmcs != &vmx->vmcs01) | ||
4861 | free_loaded_vmcs(&vmx->vmcs01); | ||
4862 | } | ||
4863 | |||
4864 | /* | ||
4865 | * Emulate the VMXON instruction. | ||
4866 | * Currently, we just remember that VMX is active, and do not save or even | ||
4867 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | ||
4868 | * do not currently need to store anything in that guest-allocated memory | ||
4869 | * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their | ||
4870 | * argument is different from the VMXON pointer (which the spec says they do). | ||
4871 | */ | ||
4872 | static int handle_vmon(struct kvm_vcpu *vcpu) | ||
4873 | { | ||
4874 | struct kvm_segment cs; | ||
4875 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4876 | |||
4877 | /* The Intel VMX Instruction Reference lists a bunch of bits that | ||
4878 | * are prerequisite to running VMXON, most notably cr4.VMXE must be | ||
4879 | * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). | ||
4880 | * Otherwise, we should fail with #UD. We test these now: | ||
4881 | */ | ||
4882 | if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || | ||
4883 | !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || | ||
4884 | (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { | ||
4885 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4886 | return 1; | ||
4887 | } | ||
4888 | |||
4889 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
4890 | if (is_long_mode(vcpu) && !cs.l) { | ||
4891 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4892 | return 1; | ||
4893 | } | ||
4894 | |||
4895 | if (vmx_get_cpl(vcpu)) { | ||
4896 | kvm_inject_gp(vcpu, 0); | ||
4897 | return 1; | ||
4898 | } | ||
4899 | |||
4900 | INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); | ||
4901 | vmx->nested.vmcs02_num = 0; | ||
4902 | |||
4903 | vmx->nested.vmxon = true; | ||
4904 | |||
4905 | skip_emulated_instruction(vcpu); | ||
4906 | return 1; | ||
4907 | } | ||
4908 | |||
4909 | /* | ||
4910 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | ||
4911 | * for running VMX instructions (except VMXON, whose prerequisites are | ||
4912 | * slightly different). It also specifies what exception to inject otherwise. | ||
4913 | */ | ||
4914 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | ||
4915 | { | ||
4916 | struct kvm_segment cs; | ||
4917 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4918 | |||
4919 | if (!vmx->nested.vmxon) { | ||
4920 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4921 | return 0; | ||
4922 | } | ||
4923 | |||
4924 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
4925 | if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || | ||
4926 | (is_long_mode(vcpu) && !cs.l)) { | ||
4927 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4928 | return 0; | ||
4929 | } | ||
4930 | |||
4931 | if (vmx_get_cpl(vcpu)) { | ||
4932 | kvm_inject_gp(vcpu, 0); | ||
4933 | return 0; | ||
4934 | } | ||
4935 | |||
4936 | return 1; | ||
4937 | } | ||
4938 | |||
4939 | /* | ||
4940 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or | ||
4941 | * just stops using VMX. | ||
4942 | */ | ||
4943 | static void free_nested(struct vcpu_vmx *vmx) | ||
4944 | { | ||
4945 | if (!vmx->nested.vmxon) | ||
4946 | return; | ||
4947 | vmx->nested.vmxon = false; | ||
4948 | if (vmx->nested.current_vmptr != -1ull) { | ||
4949 | kunmap(vmx->nested.current_vmcs12_page); | ||
4950 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
4951 | vmx->nested.current_vmptr = -1ull; | ||
4952 | vmx->nested.current_vmcs12 = NULL; | ||
4953 | } | ||
4954 | /* Unpin physical memory we referred to in current vmcs02 */ | ||
4955 | if (vmx->nested.apic_access_page) { | ||
4956 | nested_release_page(vmx->nested.apic_access_page); | ||
4957 | vmx->nested.apic_access_page = 0; | ||
4958 | } | ||
4959 | |||
4960 | nested_free_all_saved_vmcss(vmx); | ||
4961 | } | ||
4962 | |||
4963 | /* Emulate the VMXOFF instruction */ | ||
4964 | static int handle_vmoff(struct kvm_vcpu *vcpu) | ||
4965 | { | ||
4966 | if (!nested_vmx_check_permission(vcpu)) | ||
4967 | return 1; | ||
4968 | free_nested(to_vmx(vcpu)); | ||
4969 | skip_emulated_instruction(vcpu); | ||
4970 | return 1; | ||
4971 | } | ||
4972 | |||
4973 | /* | ||
4974 | * Decode the memory-address operand of a vmx instruction, as recorded on an | ||
4975 | * exit caused by such an instruction (run by a guest hypervisor). | ||
4976 | * On success, returns 0. When the operand is invalid, returns 1 and throws | ||
4977 | * #UD or #GP. | ||
4978 | */ | ||
4979 | static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | ||
4980 | unsigned long exit_qualification, | ||
4981 | u32 vmx_instruction_info, gva_t *ret) | ||
4982 | { | ||
4983 | /* | ||
4984 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
4985 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
4986 | * addressing components of the operand. Only the displacement part | ||
4987 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
4988 | * For how an actual address is calculated from all these components, | ||
4989 | * refer to Vol. 1, "Operand Addressing". | ||
4990 | */ | ||
4991 | int scaling = vmx_instruction_info & 3; | ||
4992 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
4993 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
4994 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
4995 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
4996 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
4997 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
4998 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
4999 | |||
5000 | if (is_reg) { | ||
5001 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
5002 | return 1; | ||
5003 | } | ||
5004 | |||
5005 | /* Addr = segment_base + offset */ | ||
5006 | /* offset = base + [index * scale] + displacement */ | ||
5007 | *ret = vmx_get_segment_base(vcpu, seg_reg); | ||
5008 | if (base_is_valid) | ||
5009 | *ret += kvm_register_read(vcpu, base_reg); | ||
5010 | if (index_is_valid) | ||
5011 | *ret += kvm_register_read(vcpu, index_reg)<<scaling; | ||
5012 | *ret += exit_qualification; /* holds the displacement */ | ||
5013 | |||
5014 | if (addr_size == 1) /* 32 bit */ | ||
5015 | *ret &= 0xffffffff; | ||
5016 | |||
5017 | /* | ||
5018 | * TODO: throw #GP (and return 1) in various cases that the VM* | ||
5019 | * instructions require it - e.g., offset beyond segment limit, | ||
5020 | * unusable or unreadable/unwritable segment, non-canonical 64-bit | ||
5021 | * address, and so on. Currently these are not checked. | ||
5022 | */ | ||
5023 | return 0; | ||
5024 | } | ||
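
The bit positions used above come straight from the VM-exit instruction-information layout, so they can be pulled into a stand-alone decoder, e.g. for eyeballing a raw vmx_instruction_info value captured in a trace (the sample value below is made up):

#include <stdio.h>
#include <stdint.h>

static void decode_vmx_instruction_info(uint32_t info)
{
	printf("scaling       : %u\n", info & 3);
	printf("address size  : %u (1 = 32 bit, per the check above)\n",
	       (info >> 7) & 7);
	printf("register form : %s\n", (info & (1u << 10)) ? "yes" : "no");
	printf("segment reg   : %u\n", (info >> 15) & 7);
	printf("index reg     : %u, valid: %s\n", (info >> 18) & 0xf,
	       (info & (1u << 22)) ? "no" : "yes");
	printf("base reg      : %u, valid: %s\n", (info >> 23) & 0xf,
	       (info & (1u << 27)) ? "no" : "yes");
}

int main(void)
{
	decode_vmx_instruction_info(0x0c06e681);	/* arbitrary sample value */
	return 0;
}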
5025 | |||
5026 | /* | ||
5027 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
5028 | * set the success or error code of an emulated VMX instruction, as specified | ||
5029 | * by Vol 2B, VMX Instruction Reference, "Conventions". | ||
5030 | */ | ||
5031 | static void nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
5032 | { | ||
5033 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
5034 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5035 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
5036 | } | ||
5037 | |||
5038 | static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
5039 | { | ||
5040 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5041 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
5042 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5043 | | X86_EFLAGS_CF); | ||
5044 | } | ||
5045 | |||
5046 | static void nested_vmx_failValid(struct kvm_vcpu *vcpu, | ||
5047 | u32 vm_instruction_error) | ||
5048 | { | ||
5049 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { | ||
5050 | /* | ||
5051 | * failValid writes the error number to the current VMCS, which | ||
5052 | * can't be done when there isn't a current VMCS. | ||
5053 | */ | ||
5054 | nested_vmx_failInvalid(vcpu); | ||
5055 | return; | ||
5056 | } | ||
5057 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5058 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5059 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5060 | | X86_EFLAGS_ZF); | ||
5061 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
5062 | } | ||
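
The three helpers reduce to fixed RFLAGS transformations: VMsucceed leaves all six arithmetic flags clear, VMfailInvalid leaves only CF set among them, and VMfailValid leaves only ZF set (plus an error number in the current vmcs12). A stand-alone sketch of the net effect on a plain integer, with the usual EFLAGS bit values written out:

#include <stdio.h>
#include <stdint.h>

#define X86_EFLAGS_CF 0x0001
#define X86_EFLAGS_PF 0x0004
#define X86_EFLAGS_AF 0x0010
#define X86_EFLAGS_ZF 0x0040
#define X86_EFLAGS_SF 0x0080
#define X86_EFLAGS_OF 0x0800

#define VMX_RESULT_FLAGS (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
			  X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)

static uint64_t vmsucceed(uint64_t rflags)
{
	return rflags & ~VMX_RESULT_FLAGS;			/* all six clear */
}

static uint64_t vmfail_invalid(uint64_t rflags)
{
	return (rflags & ~VMX_RESULT_FLAGS) | X86_EFLAGS_CF;	/* only CF set */
}

static uint64_t vmfail_valid(uint64_t rflags)
{
	return (rflags & ~VMX_RESULT_FLAGS) | X86_EFLAGS_ZF;	/* only ZF set */
}

int main(void)
{
	uint64_t rflags = 0x893;	/* arbitrary starting value */

	printf("succeed     -> %#llx\n", (unsigned long long)vmsucceed(rflags));
	printf("failInvalid -> %#llx\n", (unsigned long long)vmfail_invalid(rflags));
	printf("failValid   -> %#llx\n", (unsigned long long)vmfail_valid(rflags));
	return 0;
}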
5063 | |||
5064 | /* Emulate the VMCLEAR instruction */ | ||
5065 | static int handle_vmclear(struct kvm_vcpu *vcpu) | ||
5066 | { | ||
5067 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5068 | gva_t gva; | ||
5069 | gpa_t vmptr; | ||
5070 | struct vmcs12 *vmcs12; | ||
5071 | struct page *page; | ||
5072 | struct x86_exception e; | ||
5073 | |||
5074 | if (!nested_vmx_check_permission(vcpu)) | ||
5075 | return 1; | ||
5076 | |||
5077 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5078 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5079 | return 1; | ||
5080 | |||
5081 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5082 | sizeof(vmptr), &e)) { | ||
5083 | kvm_inject_page_fault(vcpu, &e); | ||
5084 | return 1; | ||
5085 | } | ||
5086 | |||
5087 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
5088 | nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
5089 | skip_emulated_instruction(vcpu); | ||
5090 | return 1; | ||
5091 | } | ||
5092 | |||
5093 | if (vmptr == vmx->nested.current_vmptr) { | ||
5094 | kunmap(vmx->nested.current_vmcs12_page); | ||
5095 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
5096 | vmx->nested.current_vmptr = -1ull; | ||
5097 | vmx->nested.current_vmcs12 = NULL; | ||
5098 | } | ||
5099 | |||
5100 | page = nested_get_page(vcpu, vmptr); | ||
5101 | if (page == NULL) { | ||
5102 | /* | ||
5103 | * For accurate processor emulation, VMCLEAR beyond available | ||
5104 | * physical memory should do nothing at all. However, it is | ||
5105 | * possible that a nested vmx bug, not a guest hypervisor bug, | ||
5106 | * resulted in this case, so let's shut down before doing any | ||
5107 | * more damage: | ||
5108 | */ | ||
5109 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
5110 | return 1; | ||
5111 | } | ||
5112 | vmcs12 = kmap(page); | ||
5113 | vmcs12->launch_state = 0; | ||
5114 | kunmap(page); | ||
5115 | nested_release_page(page); | ||
5116 | |||
5117 | nested_free_vmcs02(vmx, vmptr); | ||
5118 | |||
5119 | skip_emulated_instruction(vcpu); | ||
5120 | nested_vmx_succeed(vcpu); | ||
5121 | return 1; | ||
5122 | } | ||
5123 | |||
5124 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); | ||
5125 | |||
5126 | /* Emulate the VMLAUNCH instruction */ | ||
5127 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | ||
5128 | { | ||
5129 | return nested_vmx_run(vcpu, true); | ||
5130 | } | ||
5131 | |||
5132 | /* Emulate the VMRESUME instruction */ | ||
5133 | static int handle_vmresume(struct kvm_vcpu *vcpu) | ||
5134 | { | ||
5135 | |||
5136 | return nested_vmx_run(vcpu, false); | ||
5137 | } | ||
5138 | |||
5139 | enum vmcs_field_type { | ||
5140 | VMCS_FIELD_TYPE_U16 = 0, | ||
5141 | VMCS_FIELD_TYPE_U64 = 1, | ||
5142 | VMCS_FIELD_TYPE_U32 = 2, | ||
5143 | VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 | ||
5144 | }; | ||
5145 | |||
5146 | static inline int vmcs_field_type(unsigned long field) | ||
5147 | { | ||
5148 | if (0x1 & field) /* the *_HIGH fields are all 32 bit */ | ||
5149 | return VMCS_FIELD_TYPE_U32; | ||
5150 | return (field >> 13) & 0x3; | ||
5151 | } | ||
5152 | |||
5153 | static inline int vmcs_field_readonly(unsigned long field) | ||
5154 | { | ||
5155 | return (((field >> 10) & 0x3) == 1); | ||
5156 | } | ||
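
Both helpers above lean on the VMCS field-encoding layout: bit 0 marks the *_HIGH half of a 64-bit field, bits 14:13 give the width, and bits 11:10 give the field type (type 1 being the read-only VM-exit information fields). A stand-alone decoder doing the same bit tests; the three sample encodings are taken from the standard field list and serve only as illustration:

#include <stdio.h>

static const char *width_name[] = { "u16", "u64", "u32", "natural width" };

static void decode_vmcs_field(unsigned long field)
{
	int width    = (field & 0x1) ? 2	/* *_HIGH halves are 32 bit */
				     : (int)((field >> 13) & 0x3);
	int readonly = ((field >> 10) & 0x3) == 1;

	printf("field 0x%04lx: %s%s\n", field, width_name[width],
	       readonly ? ", read-only" : "");
}

int main(void)
{
	decode_vmcs_field(0x0000);	/* VIRTUAL_PROCESSOR_ID            */
	decode_vmcs_field(0x4402);	/* VM_EXIT_REASON (read-only, u32) */
	decode_vmcs_field(0x6800);	/* GUEST_CR0 (natural width)       */
	return 0;
}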
5157 | |||
5158 | /* | ||
5159 | * Read a vmcs12 field. Since these can have varying lengths and we return | ||
5160 | * one type, we chose the biggest type (u64) and zero-extend the return value | ||
5161 | * to that size. Note that the caller, handle_vmread, might need to use only | ||
5162 | * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of | ||
5163 | * 64-bit fields are to be returned). | ||
5164 | */ | ||
5165 | static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, | ||
5166 | unsigned long field, u64 *ret) | ||
5167 | { | ||
5168 | short offset = vmcs_field_to_offset(field); | ||
5169 | char *p; | ||
5170 | |||
5171 | if (offset < 0) | ||
5172 | return 0; | ||
5173 | |||
5174 | p = ((char *)(get_vmcs12(vcpu))) + offset; | ||
5175 | |||
5176 | switch (vmcs_field_type(field)) { | ||
5177 | case VMCS_FIELD_TYPE_NATURAL_WIDTH: | ||
5178 | *ret = *((natural_width *)p); | ||
5179 | return 1; | ||
5180 | case VMCS_FIELD_TYPE_U16: | ||
5181 | *ret = *((u16 *)p); | ||
5182 | return 1; | ||
5183 | case VMCS_FIELD_TYPE_U32: | ||
5184 | *ret = *((u32 *)p); | ||
5185 | return 1; | ||
5186 | case VMCS_FIELD_TYPE_U64: | ||
5187 | *ret = *((u64 *)p); | ||
5188 | return 1; | ||
5189 | default: | ||
5190 | return 0; /* can never happen. */ | ||
5191 | } | ||
5192 | } | ||
5193 | |||
5194 | /* | ||
5195 | * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was | ||
5196 | * used before) all generate the same failure when it is missing. | ||
5197 | */ | ||
5198 | static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) | ||
5199 | { | ||
5200 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5201 | if (vmx->nested.current_vmptr == -1ull) { | ||
5202 | nested_vmx_failInvalid(vcpu); | ||
5203 | skip_emulated_instruction(vcpu); | ||
5204 | return 0; | ||
5205 | } | ||
5206 | return 1; | ||
5207 | } | ||
5208 | |||
5209 | static int handle_vmread(struct kvm_vcpu *vcpu) | ||
5210 | { | ||
5211 | unsigned long field; | ||
5212 | u64 field_value; | ||
5213 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5214 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5215 | gva_t gva = 0; | ||
5216 | |||
5217 | if (!nested_vmx_check_permission(vcpu) || | ||
5218 | !nested_vmx_check_vmcs12(vcpu)) | ||
5219 | return 1; | ||
5220 | |||
5221 | /* Decode instruction info and find the field to read */ | ||
5222 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
5223 | /* Read the field, zero-extended to a u64 field_value */ | ||
5224 | if (!vmcs12_read_any(vcpu, field, &field_value)) { | ||
5225 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5226 | skip_emulated_instruction(vcpu); | ||
5227 | return 1; | ||
5228 | } | ||
5229 | /* | ||
5230 | * Now copy part of this value to register or memory, as requested. | ||
5231 | * Note that the number of bits actually copied is 32 or 64 depending | ||
5232 | * on the guest's mode (32 or 64 bit), not on the given field's length. | ||
5233 | */ | ||
5234 | if (vmx_instruction_info & (1u << 10)) { | ||
5235 | kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), | ||
5236 | field_value); | ||
5237 | } else { | ||
5238 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5239 | vmx_instruction_info, &gva)) | ||
5240 | return 1; | ||
5241 | /* _system ok, as nested_vmx_check_permission verified cpl=0 */ | ||
5242 | kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, | ||
5243 | &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); | ||
5244 | } | ||
5245 | |||
5246 | nested_vmx_succeed(vcpu); | ||
5247 | skip_emulated_instruction(vcpu); | ||
5248 | return 1; | ||
5249 | } | ||
5250 | |||
5251 | |||
5252 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | ||
5253 | { | ||
5254 | unsigned long field; | ||
5255 | gva_t gva; | ||
5256 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5257 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5258 | char *p; | ||
5259 | short offset; | ||
5260 | /* The value to write might be 32 or 64 bits, depending on L1's long | ||
5261 | * mode, and eventually we need to write that into a field of several | ||
5262 | * possible lengths. The code below first zero-extends the value to 64 | ||
5263 | * bit (field_value), and then copies only the appropriate number of | ||
5264 | * bits into the vmcs12 field. | ||
5265 | */ | ||
5266 | u64 field_value = 0; | ||
5267 | struct x86_exception e; | ||
5268 | |||
5269 | if (!nested_vmx_check_permission(vcpu) || | ||
5270 | !nested_vmx_check_vmcs12(vcpu)) | ||
5271 | return 1; | ||
5272 | |||
5273 | if (vmx_instruction_info & (1u << 10)) | ||
5274 | field_value = kvm_register_read(vcpu, | ||
5275 | (((vmx_instruction_info) >> 3) & 0xf)); | ||
5276 | else { | ||
5277 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5278 | vmx_instruction_info, &gva)) | ||
5279 | return 1; | ||
5280 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, | ||
5281 | &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { | ||
5282 | kvm_inject_page_fault(vcpu, &e); | ||
5283 | return 1; | ||
5284 | } | ||
5285 | } | ||
5286 | |||
5287 | |||
5288 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
5289 | if (vmcs_field_readonly(field)) { | ||
5290 | nested_vmx_failValid(vcpu, | ||
5291 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); | ||
5292 | skip_emulated_instruction(vcpu); | ||
5293 | return 1; | ||
5294 | } | ||
5295 | |||
5296 | offset = vmcs_field_to_offset(field); | ||
5297 | if (offset < 0) { | ||
5298 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5299 | skip_emulated_instruction(vcpu); | ||
5300 | return 1; | ||
5301 | } | ||
5302 | p = ((char *) get_vmcs12(vcpu)) + offset; | ||
5303 | |||
5304 | switch (vmcs_field_type(field)) { | ||
5305 | case VMCS_FIELD_TYPE_U16: | ||
5306 | *(u16 *)p = field_value; | ||
5307 | break; | ||
5308 | case VMCS_FIELD_TYPE_U32: | ||
5309 | *(u32 *)p = field_value; | ||
5310 | break; | ||
5311 | case VMCS_FIELD_TYPE_U64: | ||
5312 | *(u64 *)p = field_value; | ||
5313 | break; | ||
5314 | case VMCS_FIELD_TYPE_NATURAL_WIDTH: | ||
5315 | *(natural_width *)p = field_value; | ||
5316 | break; | ||
5317 | default: | ||
5318 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5319 | skip_emulated_instruction(vcpu); | ||
5320 | return 1; | ||
5321 | } | ||
5322 | |||
5323 | nested_vmx_succeed(vcpu); | ||
5324 | skip_emulated_instruction(vcpu); | ||
5325 | return 1; | ||
5326 | } | ||
5327 | |||
5328 | /* Emulate the VMPTRLD instruction */ | ||
5329 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | ||
5330 | { | ||
5331 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5332 | gva_t gva; | ||
5333 | gpa_t vmptr; | ||
5334 | struct x86_exception e; | ||
5335 | |||
5336 | if (!nested_vmx_check_permission(vcpu)) | ||
5337 | return 1; | ||
5338 | |||
5339 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5340 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5341 | return 1; | ||
5342 | |||
5343 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5344 | sizeof(vmptr), &e)) { | ||
5345 | kvm_inject_page_fault(vcpu, &e); | ||
5346 | return 1; | ||
5347 | } | ||
5348 | |||
5349 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
5350 | nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
5351 | skip_emulated_instruction(vcpu); | ||
5352 | return 1; | ||
5353 | } | ||
5354 | |||
5355 | if (vmx->nested.current_vmptr != vmptr) { | ||
5356 | struct vmcs12 *new_vmcs12; | ||
5357 | struct page *page; | ||
5358 | page = nested_get_page(vcpu, vmptr); | ||
5359 | if (page == NULL) { | ||
5360 | nested_vmx_failInvalid(vcpu); | ||
5361 | skip_emulated_instruction(vcpu); | ||
5362 | return 1; | ||
5363 | } | ||
5364 | new_vmcs12 = kmap(page); | ||
5365 | if (new_vmcs12->revision_id != VMCS12_REVISION) { | ||
5366 | kunmap(page); | ||
5367 | nested_release_page_clean(page); | ||
5368 | nested_vmx_failValid(vcpu, | ||
5369 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | ||
5370 | skip_emulated_instruction(vcpu); | ||
5371 | return 1; | ||
5372 | } | ||
5373 | if (vmx->nested.current_vmptr != -1ull) { | ||
5374 | kunmap(vmx->nested.current_vmcs12_page); | ||
5375 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
5376 | } | ||
5377 | |||
5378 | vmx->nested.current_vmptr = vmptr; | ||
5379 | vmx->nested.current_vmcs12 = new_vmcs12; | ||
5380 | vmx->nested.current_vmcs12_page = page; | ||
5381 | } | ||
5382 | |||
5383 | nested_vmx_succeed(vcpu); | ||
5384 | skip_emulated_instruction(vcpu); | ||
5385 | return 1; | ||
5386 | } | ||
5387 | |||
5388 | /* Emulate the VMPTRST instruction */ | ||
5389 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | ||
5390 | { | ||
5391 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5392 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5393 | gva_t vmcs_gva; | ||
5394 | struct x86_exception e; | ||
5395 | |||
5396 | if (!nested_vmx_check_permission(vcpu)) | ||
5397 | return 1; | ||
5398 | |||
5399 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5400 | vmx_instruction_info, &vmcs_gva)) | ||
5401 | return 1; | ||
5402 | /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ | ||
5403 | if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, | ||
5404 | (void *)&to_vmx(vcpu)->nested.current_vmptr, | ||
5405 | sizeof(u64), &e)) { | ||
5406 | kvm_inject_page_fault(vcpu, &e); | ||
5407 | return 1; | ||
5408 | } | ||
5409 | nested_vmx_succeed(vcpu); | ||
5410 | skip_emulated_instruction(vcpu); | ||
5411 | return 1; | ||
5412 | } | ||
5413 | |||
5414 | /* | ||
3869 | * The exit handlers return 1 if the exit was handled fully and guest execution | 5415 | * The exit handlers return 1 if the exit was handled fully and guest execution |
3870 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 5416 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
3871 | * to be done to userspace and return 0. | 5417 | * to be done to userspace and return 0. |
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3886 | [EXIT_REASON_INVD] = handle_invd, | 5432 | [EXIT_REASON_INVD] = handle_invd, |
3887 | [EXIT_REASON_INVLPG] = handle_invlpg, | 5433 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3888 | [EXIT_REASON_VMCALL] = handle_vmcall, | 5434 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3889 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | 5435 | [EXIT_REASON_VMCLEAR] = handle_vmclear, |
3890 | [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, | 5436 | [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, |
3891 | [EXIT_REASON_VMPTRLD] = handle_vmx_insn, | 5437 | [EXIT_REASON_VMPTRLD] = handle_vmptrld, |
3892 | [EXIT_REASON_VMPTRST] = handle_vmx_insn, | 5438 | [EXIT_REASON_VMPTRST] = handle_vmptrst, |
3893 | [EXIT_REASON_VMREAD] = handle_vmx_insn, | 5439 | [EXIT_REASON_VMREAD] = handle_vmread, |
3894 | [EXIT_REASON_VMRESUME] = handle_vmx_insn, | 5440 | [EXIT_REASON_VMRESUME] = handle_vmresume, |
3895 | [EXIT_REASON_VMWRITE] = handle_vmx_insn, | 5441 | [EXIT_REASON_VMWRITE] = handle_vmwrite, |
3896 | [EXIT_REASON_VMOFF] = handle_vmx_insn, | 5442 | [EXIT_REASON_VMOFF] = handle_vmoff, |
3897 | [EXIT_REASON_VMON] = handle_vmx_insn, | 5443 | [EXIT_REASON_VMON] = handle_vmon, |
3898 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 5444 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
3899 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 5445 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
3900 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 5446 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3911 | static const int kvm_vmx_max_exit_handlers = | 5457 | static const int kvm_vmx_max_exit_handlers = |
3912 | ARRAY_SIZE(kvm_vmx_exit_handlers); | 5458 | ARRAY_SIZE(kvm_vmx_exit_handlers); |
3913 | 5459 | ||
5460 | /* | ||
5461 | * Return 1 if we should exit from L2 to L1 to handle an MSR access, | ||
5462 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed | ||
5463 | * disinterest in the current event (read or write a specific MSR) by using an | ||
5464 | * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. | ||
5465 | */ | ||
5466 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | ||
5467 | struct vmcs12 *vmcs12, u32 exit_reason) | ||
5468 | { | ||
5469 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
5470 | gpa_t bitmap; | ||
5471 | |||
5472 | if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) | ||
5473 | return 1; | ||
5474 | |||
5475 | /* | ||
5476 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | ||
5477 | * for the four combinations of read/write and low/high MSR numbers. | ||
5478 | * First we need to figure out which of the four to use: | ||
5479 | */ | ||
5480 | bitmap = vmcs12->msr_bitmap; | ||
5481 | if (exit_reason == EXIT_REASON_MSR_WRITE) | ||
5482 | bitmap += 2048; | ||
5483 | if (msr_index >= 0xc0000000) { | ||
5484 | msr_index -= 0xc0000000; | ||
5485 | bitmap += 1024; | ||
5486 | } | ||
5487 | |||
5488 | /* Then read the msr_index'th bit from this bitmap: */ | ||
5489 | if (msr_index < 1024*8) { | ||
5490 | unsigned char b; | ||
5491 | kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); | ||
5492 | return 1 & (b >> (msr_index & 7)); | ||
5493 | } else | ||
5494 | return 1; /* let L1 handle the wrong parameter */ | ||
5495 | } | ||
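
The page layout assumed here is: bytes 0-1023 cover reads of MSRs 0x00000000-0x00001fff, bytes 1024-2047 cover reads of 0xc0000000-0xc0001fff, and the second 2048 bytes repeat that split for writes. A stand-alone helper (name made up) that computes the byte and bit a given MSR lands on, following the same arithmetic as nested_vmx_exit_handled_msr():

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/*
 * Returns the byte offset into the 4KB MSR-bitmap page for (msr, write)
 * and stores the bit number within that byte in *bit. Returns -1 when
 * the MSR is outside both covered ranges, in which case the exit goes
 * to L1 unconditionally (the "wrong parameter" case above).
 */
static int msr_bitmap_offset(uint32_t msr, bool write, int *bit)
{
	uint32_t base = write ? 2048 : 0;

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		base += 1024;
	}
	if (msr >= 1024 * 8)
		return -1;

	*bit = msr & 7;
	return base + msr / 8;
}

int main(void)
{
	int bit;
	int off = msr_bitmap_offset(0xc0000080 /* EFER */, true, &bit);

	printf("EFER write intercept: byte %d, bit %d\n", off, bit);
	return 0;
}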
5496 | |||
5497 | /* | ||
5498 | * Return 1 if we should exit from L2 to L1 to handle a CR access exit, | ||
5499 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | ||
5500 | * intercept (via guest_host_mask etc.) the current event. | ||
5501 | */ | ||
5502 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | ||
5503 | struct vmcs12 *vmcs12) | ||
5504 | { | ||
5505 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5506 | int cr = exit_qualification & 15; | ||
5507 | int reg = (exit_qualification >> 8) & 15; | ||
5508 | unsigned long val = kvm_register_read(vcpu, reg); | ||
5509 | |||
5510 | switch ((exit_qualification >> 4) & 3) { | ||
5511 | case 0: /* mov to cr */ | ||
5512 | switch (cr) { | ||
5513 | case 0: | ||
5514 | if (vmcs12->cr0_guest_host_mask & | ||
5515 | (val ^ vmcs12->cr0_read_shadow)) | ||
5516 | return 1; | ||
5517 | break; | ||
5518 | case 3: | ||
5519 | if ((vmcs12->cr3_target_count >= 1 && | ||
5520 | vmcs12->cr3_target_value0 == val) || | ||
5521 | (vmcs12->cr3_target_count >= 2 && | ||
5522 | vmcs12->cr3_target_value1 == val) || | ||
5523 | (vmcs12->cr3_target_count >= 3 && | ||
5524 | vmcs12->cr3_target_value2 == val) || | ||
5525 | (vmcs12->cr3_target_count >= 4 && | ||
5526 | vmcs12->cr3_target_value3 == val)) | ||
5527 | return 0; | ||
5528 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | ||
5529 | return 1; | ||
5530 | break; | ||
5531 | case 4: | ||
5532 | if (vmcs12->cr4_guest_host_mask & | ||
5533 | (vmcs12->cr4_read_shadow ^ val)) | ||
5534 | return 1; | ||
5535 | break; | ||
5536 | case 8: | ||
5537 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | ||
5538 | return 1; | ||
5539 | break; | ||
5540 | } | ||
5541 | break; | ||
5542 | case 2: /* clts */ | ||
5543 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | ||
5544 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | ||
5545 | return 1; | ||
5546 | break; | ||
5547 | case 1: /* mov from cr */ | ||
5548 | switch (cr) { | ||
5549 | case 3: | ||
5550 | if (vmcs12->cpu_based_vm_exec_control & | ||
5551 | CPU_BASED_CR3_STORE_EXITING) | ||
5552 | return 1; | ||
5553 | break; | ||
5554 | case 8: | ||
5555 | if (vmcs12->cpu_based_vm_exec_control & | ||
5556 | CPU_BASED_CR8_STORE_EXITING) | ||
5557 | return 1; | ||
5558 | break; | ||
5559 | } | ||
5560 | break; | ||
5561 | case 3: /* lmsw */ | ||
5562 | /* | ||
5563 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | ||
5564 | * cr0. Other attempted changes are ignored, with no exit. | ||
5565 | */ | ||
5566 | if (vmcs12->cr0_guest_host_mask & 0xe & | ||
5567 | (val ^ vmcs12->cr0_read_shadow)) | ||
5568 | return 1; | ||
5569 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | ||
5570 | !(vmcs12->cr0_read_shadow & 0x1) && | ||
5571 | (val & 0x1)) | ||
5572 | return 1; | ||
5573 | break; | ||
5574 | } | ||
5575 | return 0; | ||
5576 | } | ||
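
For the lmsw case it helps to remember that lmsw only touches CR0 bits 3:0 (TS/EM/MP/PE) and can set, but never clear, PE. A stand-alone predicate reproducing the two tests above; the mask and shadow values in the example are made up:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool lmsw_exits_to_l1(uint16_t val, unsigned long cr0_guest_host_mask,
			     unsigned long cr0_read_shadow)
{
	/* bits 3:1 (TS/EM/MP): exit if a bit L1 shadows would change */
	if (cr0_guest_host_mask & 0xe & (val ^ cr0_read_shadow))
		return true;
	/* bit 0 (PE): lmsw can only set it; exit if L1 shadows it as 0 */
	if ((cr0_guest_host_mask & 0x1) && !(cr0_read_shadow & 0x1) &&
	    (val & 0x1))
		return true;
	return false;
}

int main(void)
{
	/* L1 shadows TS (bit 3) as clear and L2 tries to set it via lmsw */
	printf("%d\n", lmsw_exits_to_l1(0x8, 0x8, 0x0));	/* prints 1 */
	return 0;
}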
5577 | |||
5578 | /* | ||
5579 | * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we | ||
5580 | * should handle it ourselves in L0 (and then continue L2). Only call this | ||
5581 | * when running L2, i.e. when is_guest_mode() is true. | ||
5582 | */ | ||
5583 | static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | ||
5584 | { | ||
5585 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
5586 | u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
5587 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5588 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
5589 | |||
5590 | if (vmx->nested.nested_run_pending) | ||
5591 | return 0; | ||
5592 | |||
5593 | if (unlikely(vmx->fail)) { | ||
5594 | printk(KERN_INFO "%s failed vm entry %x\n", | ||
5595 | __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
5596 | return 1; | ||
5597 | } | ||
5598 | |||
5599 | switch (exit_reason) { | ||
5600 | case EXIT_REASON_EXCEPTION_NMI: | ||
5601 | if (!is_exception(intr_info)) | ||
5602 | return 0; | ||
5603 | else if (is_page_fault(intr_info)) | ||
5604 | return enable_ept; | ||
5605 | return vmcs12->exception_bitmap & | ||
5606 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | ||
5607 | case EXIT_REASON_EXTERNAL_INTERRUPT: | ||
5608 | return 0; | ||
5609 | case EXIT_REASON_TRIPLE_FAULT: | ||
5610 | return 1; | ||
5611 | case EXIT_REASON_PENDING_INTERRUPT: | ||
5612 | case EXIT_REASON_NMI_WINDOW: | ||
5613 | /* | ||
5614 | * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit | ||
5615 | * (aka Interrupt Window Exiting) only when L1 turned it on, | ||
5616 | * so if we got a PENDING_INTERRUPT exit, this must be for L1. | ||
5617 | * Same for NMI Window Exiting. | ||
5618 | */ | ||
5619 | return 1; | ||
5620 | case EXIT_REASON_TASK_SWITCH: | ||
5621 | return 1; | ||
5622 | case EXIT_REASON_CPUID: | ||
5623 | return 1; | ||
5624 | case EXIT_REASON_HLT: | ||
5625 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | ||
5626 | case EXIT_REASON_INVD: | ||
5627 | return 1; | ||
5628 | case EXIT_REASON_INVLPG: | ||
5629 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
5630 | case EXIT_REASON_RDPMC: | ||
5631 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | ||
5632 | case EXIT_REASON_RDTSC: | ||
5633 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | ||
5634 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | ||
5635 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | ||
5636 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: | ||
5637 | case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: | ||
5638 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | ||
5639 | /* | ||
5640 | * VMX instructions trap unconditionally. This allows L1 to | ||
5641 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | ||
5642 | */ | ||
5643 | return 1; | ||
5644 | case EXIT_REASON_CR_ACCESS: | ||
5645 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | ||
5646 | case EXIT_REASON_DR_ACCESS: | ||
5647 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | ||
5648 | case EXIT_REASON_IO_INSTRUCTION: | ||
5649 | /* TODO: support IO bitmaps */ | ||
5650 | return 1; | ||
5651 | case EXIT_REASON_MSR_READ: | ||
5652 | case EXIT_REASON_MSR_WRITE: | ||
5653 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | ||
5654 | case EXIT_REASON_INVALID_STATE: | ||
5655 | return 1; | ||
5656 | case EXIT_REASON_MWAIT_INSTRUCTION: | ||
5657 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | ||
5658 | case EXIT_REASON_MONITOR_INSTRUCTION: | ||
5659 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | ||
5660 | case EXIT_REASON_PAUSE_INSTRUCTION: | ||
5661 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | ||
5662 | nested_cpu_has2(vmcs12, | ||
5663 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | ||
5664 | case EXIT_REASON_MCE_DURING_VMENTRY: | ||
5665 | return 0; | ||
5666 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | ||
5667 | return 1; | ||
5668 | case EXIT_REASON_APIC_ACCESS: | ||
5669 | return nested_cpu_has2(vmcs12, | ||
5670 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
5671 | case EXIT_REASON_EPT_VIOLATION: | ||
5672 | case EXIT_REASON_EPT_MISCONFIG: | ||
5673 | return 0; | ||
5674 | case EXIT_REASON_WBINVD: | ||
5675 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | ||
5676 | case EXIT_REASON_XSETBV: | ||
5677 | return 1; | ||
5678 | default: | ||
5679 | return 1; | ||
5680 | } | ||
5681 | } | ||
5682 | |||
3914 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | 5683 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) |
3915 | { | 5684 | { |
3916 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | 5685 | *info1 = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3933 | if (vmx->emulation_required && emulate_invalid_guest_state) | 5702 | if (vmx->emulation_required && emulate_invalid_guest_state) |
3934 | return handle_invalid_guest_state(vcpu); | 5703 | return handle_invalid_guest_state(vcpu); |
3935 | 5704 | ||
5705 | /* | ||
5706 | * the KVM_REQ_EVENT optimization bit is only on for one entry, and if | ||
5707 | * we did not inject a still-pending event to L1 now because of | ||
5708 | * nested_run_pending, we need to re-enable this bit. | ||
5709 | */ | ||
5710 | if (vmx->nested.nested_run_pending) | ||
5711 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5712 | |||
5713 | if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || | ||
5714 | exit_reason == EXIT_REASON_VMRESUME)) | ||
5715 | vmx->nested.nested_run_pending = 1; | ||
5716 | else | ||
5717 | vmx->nested.nested_run_pending = 0; | ||
5718 | |||
5719 | if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { | ||
5720 | nested_vmx_vmexit(vcpu); | ||
5721 | return 1; | ||
5722 | } | ||
5723 | |||
3936 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | 5724 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { |
3937 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 5725 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3938 | vcpu->run->fail_entry.hardware_entry_failure_reason | 5726 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3955 | "(0x%x) and exit reason is 0x%x\n", | 5743 | "(0x%x) and exit reason is 0x%x\n", |
3956 | __func__, vectoring_info, exit_reason); | 5744 | __func__, vectoring_info, exit_reason); |
3957 | 5745 | ||
3958 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { | 5746 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && |
5747 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( | ||
5748 | get_vmcs12(vcpu), vcpu)))) { | ||
3959 | if (vmx_interrupt_allowed(vcpu)) { | 5749 | if (vmx_interrupt_allowed(vcpu)) { |
3960 | vmx->soft_vnmi_blocked = 0; | 5750 | vmx->soft_vnmi_blocked = 0; |
3961 | } else if (vmx->vnmi_blocked_time > 1000000000LL && | 5751 | } else if (vmx->vnmi_blocked_time > 1000000000LL && |
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, | |||
4118 | 5908 | ||
4119 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 5909 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
4120 | { | 5910 | { |
5911 | if (is_guest_mode(&vmx->vcpu)) | ||
5912 | return; | ||
4121 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, | 5913 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, |
4122 | VM_EXIT_INSTRUCTION_LEN, | 5914 | VM_EXIT_INSTRUCTION_LEN, |
4123 | IDT_VECTORING_ERROR_CODE); | 5915 | IDT_VECTORING_ERROR_CODE); |
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
4125 | 5917 | ||
4126 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) | 5918 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) |
4127 | { | 5919 | { |
5920 | if (is_guest_mode(vcpu)) | ||
5921 | return; | ||
4128 | __vmx_complete_interrupts(to_vmx(vcpu), | 5922 | __vmx_complete_interrupts(to_vmx(vcpu), |
4129 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | 5923 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), |
4130 | VM_ENTRY_INSTRUCTION_LEN, | 5924 | VM_ENTRY_INSTRUCTION_LEN, |
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4145 | { | 5939 | { |
4146 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 5940 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4147 | 5941 | ||
5942 | if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { | ||
5943 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
5944 | if (vmcs12->idt_vectoring_info_field & | ||
5945 | VECTORING_INFO_VALID_MASK) { | ||
5946 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
5947 | vmcs12->idt_vectoring_info_field); | ||
5948 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
5949 | vmcs12->vm_exit_instruction_len); | ||
5950 | if (vmcs12->idt_vectoring_info_field & | ||
5951 | VECTORING_INFO_DELIVER_CODE_MASK) | ||
5952 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
5953 | vmcs12->idt_vectoring_error_code); | ||
5954 | } | ||
5955 | } | ||
5956 | |||
4148 | /* Record the guest's net vcpu time for enforced NMI injections. */ | 5957 | /* Record the guest's net vcpu time for enforced NMI injections. */ |
4149 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | 5958 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) |
4150 | vmx->entry_time = ktime_get(); | 5959 | vmx->entry_time = ktime_get(); |
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4167 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | 5976 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) |
4168 | vmx_set_interrupt_shadow(vcpu, 0); | 5977 | vmx_set_interrupt_shadow(vcpu, 0); |
4169 | 5978 | ||
5979 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
4170 | asm( | 5980 | asm( |
4171 | /* Store host registers */ | 5981 | /* Store host registers */ |
4172 | "push %%"R"dx; push %%"R"bp;" | 5982 | "push %%"R"dx; push %%"R"bp;" |
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4237 | "pop %%"R"bp; pop %%"R"dx \n\t" | 6047 | "pop %%"R"bp; pop %%"R"dx \n\t" |
4238 | "setbe %c[fail](%0) \n\t" | 6048 | "setbe %c[fail](%0) \n\t" |
4239 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 6049 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
4240 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 6050 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), |
4241 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 6051 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
4242 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | 6052 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), |
4243 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | 6053 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), |
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4276 | 6086 | ||
4277 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 6087 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
4278 | 6088 | ||
6089 | if (is_guest_mode(vcpu)) { | ||
6090 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
6091 | vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; | ||
6092 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | ||
6093 | vmcs12->idt_vectoring_error_code = | ||
6094 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
6095 | vmcs12->vm_exit_instruction_len = | ||
6096 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
6097 | } | ||
6098 | } | ||
6099 | |||
4279 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 6100 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
4280 | vmx->launched = 1; | 6101 | vmx->loaded_vmcs->launched = 1; |
4281 | 6102 | ||
4282 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | 6103 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); |
4283 | 6104 | ||
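Taken together, the two guest-mode blocks added to vmx_vcpu_run() form a round trip for events that were being delivered when L2 exited: before re-entering L2, the pending event recorded in vmcs12 is replayed into the hardware VM-entry fields, and after the exit the hardware IDT-vectoring information is copied back into vmcs12 so L1 (or a later re-entry) can complete it. Condensed, the flow looks like this (same field names as above, error-code handling omitted):

	/* before VM-entry to L2, when no exit to L1 is pending */
	if (vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK)
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->idt_vectoring_info_field);

	/* after VM-exit from L2: record what the CPU reported */
	vmcs12->idt_vectoring_info_field =
		vmcs_read32(IDT_VECTORING_INFO_FIELD);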
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4289 | #undef R | 6110 | #undef R |
4290 | #undef Q | 6111 | #undef Q |
4291 | 6112 | ||
4292 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
4293 | { | ||
4294 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4295 | |||
4296 | if (vmx->vmcs) { | ||
4297 | vcpu_clear(vmx); | ||
4298 | free_vmcs(vmx->vmcs); | ||
4299 | vmx->vmcs = NULL; | ||
4300 | } | ||
4301 | } | ||
4302 | |||
4303 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | 6113 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) |
4304 | { | 6114 | { |
4305 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6115 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4306 | 6116 | ||
4307 | free_vpid(vmx); | 6117 | free_vpid(vmx); |
4308 | vmx_free_vmcs(vcpu); | 6118 | free_nested(vmx); |
6119 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
4309 | kfree(vmx->guest_msrs); | 6120 | kfree(vmx->guest_msrs); |
4310 | kvm_vcpu_uninit(vcpu); | 6121 | kvm_vcpu_uninit(vcpu); |
4311 | kmem_cache_free(kvm_vcpu_cache, vmx); | 6122 | kmem_cache_free(kvm_vcpu_cache, vmx); |
4312 | } | 6123 | } |
4313 | 6124 | ||
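vmx_free_vcpu() now tears the vcpu down through free_nested() and free_loaded_vmcs() instead of the removed vmx_free_vmcs(). A plausible minimal form of free_loaded_vmcs(), assuming a loaded_vmcs_clear() helper that VMCLEARs the VMCS on whichever CPU it is currently resident (a sketch, not necessarily the committed code):

	static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
	{
		/* make sure no CPU still has this VMCS loaded before freeing it */
		loaded_vmcs_clear(loaded_vmcs);
		free_vmcs(loaded_vmcs->vmcs);
		loaded_vmcs->vmcs = NULL;
	}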
4314 | static inline void vmcs_init(struct vmcs *vmcs) | ||
4315 | { | ||
4316 | u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); | ||
4317 | |||
4318 | if (!vmm_exclusive) | ||
4319 | kvm_cpu_vmxon(phys_addr); | ||
4320 | |||
4321 | vmcs_clear(vmcs); | ||
4322 | |||
4323 | if (!vmm_exclusive) | ||
4324 | kvm_cpu_vmxoff(); | ||
4325 | } | ||
4326 | |||
4327 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | 6125 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) |
4328 | { | 6126 | { |
4329 | int err; | 6127 | int err; |
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4345 | goto uninit_vcpu; | 6143 | goto uninit_vcpu; |
4346 | } | 6144 | } |
4347 | 6145 | ||
4348 | vmx->vmcs = alloc_vmcs(); | 6146 | vmx->loaded_vmcs = &vmx->vmcs01; |
4349 | if (!vmx->vmcs) | 6147 | vmx->loaded_vmcs->vmcs = alloc_vmcs(); |
6148 | if (!vmx->loaded_vmcs->vmcs) | ||
4350 | goto free_msrs; | 6149 | goto free_msrs; |
4351 | 6150 | if (!vmm_exclusive) | |
4352 | vmcs_init(vmx->vmcs); | 6151 | kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); |
6152 | loaded_vmcs_init(vmx->loaded_vmcs); | ||
6153 | if (!vmm_exclusive) | ||
6154 | kvm_cpu_vmxoff(); | ||
4353 | 6155 | ||
4354 | cpu = get_cpu(); | 6156 | cpu = get_cpu(); |
4355 | vmx_vcpu_load(&vmx->vcpu, cpu); | 6157 | vmx_vcpu_load(&vmx->vcpu, cpu); |
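The open-coded sequence above replaces the removed vmcs_init() wrapper: VMXON if needed, initialize the loaded VMCS, VMXOFF again. A plausible minimal form of loaded_vmcs_init(), given how the loaded-VMCS tracking is used in this patch (a sketch only):

	static void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
	{
		vmcs_clear(loaded_vmcs->vmcs);	/* VMCLEAR so any CPU may load it */
		loaded_vmcs->cpu = -1;		/* not resident on any CPU yet */
		loaded_vmcs->launched = 0;	/* next entry must use VMLAUNCH */
	}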
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4375 | goto free_vmcs; | 6177 | goto free_vmcs; |
4376 | } | 6178 | } |
4377 | 6179 | ||
6180 | vmx->nested.current_vmptr = -1ull; | ||
6181 | vmx->nested.current_vmcs12 = NULL; | ||
6182 | |||
4378 | return &vmx->vcpu; | 6183 | return &vmx->vcpu; |
4379 | 6184 | ||
4380 | free_vmcs: | 6185 | free_vmcs: |
4381 | free_vmcs(vmx->vmcs); | 6186 | free_vmcs(vmx->loaded_vmcs->vmcs); |
4382 | free_msrs: | 6187 | free_msrs: |
4383 | kfree(vmx->guest_msrs); | 6188 | kfree(vmx->guest_msrs); |
4384 | uninit_vcpu: | 6189 | uninit_vcpu: |
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | |||
4512 | 6317 | ||
4513 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 6318 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
4514 | { | 6319 | { |
6320 | if (func == 1 && nested) | ||
6321 | entry->ecx |= bit(X86_FEATURE_VMX); | ||
6322 | } | ||
6323 | |||
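With nested=1, vmx_set_supported_cpuid() advertises the VMX feature bit to L1 in CPUID leaf 1. For illustration only, this is how a guest would typically probe for it (a standalone userspace-style sketch, not code from this patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int eax = 1, ebx, ecx, edx;

		__asm__ volatile("cpuid"
				 : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
		if (ecx & (1u << 5))	/* CPUID.01H:ECX.VMX is bit 5 */
			printf("VMX reported; VMXON may be attempted\n");
		return 0;
	}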
6324 | /* | ||
6325 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | ||
6326 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | ||
6327 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 | ||
6328 | * guest in a way that will both be appropriate to L1's requests, and our | ||
6329 | * needs. In addition to modifying the active vmcs (which is vmcs02), this | ||
6330 | * function also has additional necessary side-effects, like setting various | ||
6331 | * vcpu->arch fields. | ||
6332 | */ | ||
6333 | static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6334 | { | ||
6335 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6336 | u32 exec_control; | ||
6337 | |||
6338 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | ||
6339 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | ||
6340 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | ||
6341 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | ||
6342 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | ||
6343 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | ||
6344 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | ||
6345 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | ||
6346 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | ||
6347 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | ||
6348 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | ||
6349 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | ||
6350 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | ||
6351 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | ||
6352 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | ||
6353 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | ||
6354 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | ||
6355 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | ||
6356 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); | ||
6357 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); | ||
6358 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | ||
6359 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | ||
6360 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | ||
6361 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | ||
6362 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | ||
6363 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | ||
6364 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | ||
6365 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | ||
6366 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | ||
6367 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | ||
6368 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | ||
6369 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | ||
6370 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | ||
6371 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | ||
6372 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | ||
6373 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | ||
6374 | |||
6375 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | ||
6376 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
6377 | vmcs12->vm_entry_intr_info_field); | ||
6378 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
6379 | vmcs12->vm_entry_exception_error_code); | ||
6380 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
6381 | vmcs12->vm_entry_instruction_len); | ||
6382 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
6383 | vmcs12->guest_interruptibility_info); | ||
6384 | vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); | ||
6385 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | ||
6386 | vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); | ||
6387 | vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); | ||
6388 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | ||
6389 | vmcs12->guest_pending_dbg_exceptions); | ||
6390 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | ||
6391 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | ||
6392 | |||
6393 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
6394 | |||
6395 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | ||
6396 | (vmcs_config.pin_based_exec_ctrl | | ||
6397 | vmcs12->pin_based_vm_exec_control)); | ||
6398 | |||
6399 | /* | ||
6400 | * Whether page-faults are trapped is determined by a combination of | ||
6401 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. | ||
6402 | * If enable_ept, L0 doesn't care about page faults and we should | ||
6403 | * set all of these to L1's desires. However, if !enable_ept, L0 does | ||
6404 | * care about (at least some) page faults, and because it is not easy | ||
6405 | * (if at all possible?) to merge L0 and L1's desires, we simply ask | ||
6406 | * to exit on each and every L2 page fault. This is done by setting | ||
6407 | * MASK=MATCH=0 and (see below) EB.PF=1. | ||
6408 | * Note that below we don't need special code to set EB.PF beyond the | ||
6409 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | ||
6410 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | ||
6411 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | ||
6412 | * | ||
6413 | * A problem with this approach (when !enable_ept) is that L1 may be | ||
6414 | * injected with more page faults than it asked for. This could have | ||
6415 | * caused problems, but in practice existing hypervisors don't care. | ||
6416 | * To fix this, we will need to emulate the PFEC checking (on the L1 | ||
6417 | * page tables), using walk_addr(), when injecting PFs to L1. | ||
6418 | */ | ||
6419 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, | ||
6420 | enable_ept ? vmcs12->page_fault_error_code_mask : 0); | ||
6421 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, | ||
6422 | enable_ept ? vmcs12->page_fault_error_code_match : 0); | ||
6423 | |||
6424 | if (cpu_has_secondary_exec_ctrls()) { | ||
6425 | u32 exec_control = vmx_secondary_exec_control(vmx); | ||
6426 | if (!vmx->rdtscp_enabled) | ||
6427 | exec_control &= ~SECONDARY_EXEC_RDTSCP; | ||
6428 | /* Take the following fields only from vmcs12 */ | ||
6429 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6430 | if (nested_cpu_has(vmcs12, | ||
6431 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) | ||
6432 | exec_control |= vmcs12->secondary_vm_exec_control; | ||
6433 | |||
6434 | if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { | ||
6435 | /* | ||
6436 | * Translate L1 physical address to host physical | ||
6437 | * address for vmcs02. Keep the page pinned, so this | ||
6438 | * physical address remains valid. We keep a reference | ||
6439 | * to it so we can release it later. | ||
6440 | */ | ||
6441 | if (vmx->nested.apic_access_page) /* shouldn't happen */ | ||
6442 | nested_release_page(vmx->nested.apic_access_page); | ||
6443 | vmx->nested.apic_access_page = | ||
6444 | nested_get_page(vcpu, vmcs12->apic_access_addr); | ||
6445 | /* | ||
6446 | * If translation failed, no matter: This feature asks | ||
6447 | * to exit when accessing the given address, and if it | ||
6448 | * can never be accessed, this feature won't do | ||
6449 | * anything anyway. | ||
6450 | */ | ||
6451 | if (!vmx->nested.apic_access_page) | ||
6452 | exec_control &= | ||
6453 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6454 | else | ||
6455 | vmcs_write64(APIC_ACCESS_ADDR, | ||
6456 | page_to_phys(vmx->nested.apic_access_page)); | ||
6457 | } | ||
6458 | |||
6459 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
6460 | } | ||
6461 | |||
6462 | |||
6463 | /* | ||
6464 | * Set host-state according to L0's settings (vmcs12 is irrelevant here) | ||
6465 | * Some constant fields are set here by vmx_set_constant_host_state(). | ||
6466 | * Other fields are different per CPU, and will be set later when | ||
6467 | * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. | ||
6468 | */ | ||
6469 | vmx_set_constant_host_state(); | ||
6470 | |||
6471 | /* | ||
6472 | * HOST_RSP is normally set correctly in vmx_vcpu_run() just before | ||
6473 | * entry, but only if the current (host) sp changed from the value | ||
6474 | * we wrote last (vmx->host_rsp). This cache is no longer relevant | ||
6475 | * if we switch vmcs, and rather than hold a separate cache per vmcs, | ||
6476 | * here we just force the write to happen on entry. | ||
6477 | */ | ||
6478 | vmx->host_rsp = 0; | ||
6479 | |||
6480 | exec_control = vmx_exec_control(vmx); /* L0's desires */ | ||
6481 | exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
6482 | exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | ||
6483 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
6484 | exec_control |= vmcs12->cpu_based_vm_exec_control; | ||
6485 | /* | ||
6486 | * Merging of IO and MSR bitmaps not currently supported. | ||
6487 | * Rather, exit every time. | ||
6488 | */ | ||
6489 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; | ||
6490 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | ||
6491 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | ||
6492 | |||
6493 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
6494 | |||
6495 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the | ||
6496 | * bitwise-or of what L1 wants to trap for L2, and what we want to | ||
6497 | * trap. Note that CR0.TS also needs updating - we do this later. | ||
6498 | */ | ||
6499 | update_exception_bitmap(vcpu); | ||
6500 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | ||
6501 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
6502 | |||
6503 | /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ | ||
6504 | vmcs_write32(VM_EXIT_CONTROLS, | ||
6505 | vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); | ||
6506 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | | ||
6507 | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); | ||
6508 | |||
6509 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) | ||
6510 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | ||
6511 | else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | ||
6512 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
6513 | |||
6514 | |||
6515 | set_cr4_guest_host_mask(vmx); | ||
6516 | |||
6517 | vmcs_write64(TSC_OFFSET, | ||
6518 | vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); | ||
6519 | |||
6520 | if (enable_vpid) { | ||
6521 | /* | ||
6522 | * Trivially support vpid by letting L2s share their parent | ||
6523 | * L1's vpid. TODO: move to a more elaborate solution, giving | ||
6524 | * each L2 its own vpid and exposing the vpid feature to L1. | ||
6525 | */ | ||
6526 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
6527 | vmx_flush_tlb(vcpu); | ||
6528 | } | ||
6529 | |||
6530 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) | ||
6531 | vcpu->arch.efer = vmcs12->guest_ia32_efer; | ||
6532 | if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | ||
6533 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
6534 | else | ||
6535 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
6536 | /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | ||
6537 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
6538 | |||
6539 | /* | ||
6540 | * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified | ||
6541 | * TS bit (for lazy fpu) and bits which we consider mandatory enabled. | ||
6542 | * The CR0_READ_SHADOW is what L2 should have expected to read given | ||
6543 | * the specifications by L1. It's not enough to take | ||
6544 | * vmcs12->cr0_read_shadow, because our cr0_guest_host_mask may | ||
6545 | * include more bits than L1 expected. | ||
6546 | */ | ||
6547 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | ||
6548 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
6549 | |||
6550 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | ||
6551 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | ||
6552 | |||
6553 | /* shadow page tables on either EPT or shadow page tables */ | ||
6554 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); | ||
6555 | kvm_mmu_reset_context(vcpu); | ||
6556 | |||
6557 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | ||
6558 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | ||
6559 | } | ||
6560 | |||
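The EXCEPTION_BITMAP comment in prepare_vmcs02() assumes that update_exception_bitmap() folds L1's requested exceptions into L0's own bitmap while in guest mode. A minimal sketch of that guest-mode handling, with a hypothetical helper name and assuming eb already holds the bits L0 wants for vmcs01 (the real function computes considerably more):

	static void write_merged_exception_bitmap(struct kvm_vcpu *vcpu, u32 eb)
	{
		/* fold in whatever L1 asked to intercept for L2 */
		if (is_guest_mode(vcpu))
			eb |= get_vmcs12(vcpu)->exception_bitmap;
		vmcs_write32(EXCEPTION_BITMAP, eb);
	}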
6561 | /* | ||
6562 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | ||
6563 | * for running an L2 nested guest. | ||
6564 | */ | ||
6565 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | ||
6566 | { | ||
6567 | struct vmcs12 *vmcs12; | ||
6568 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6569 | int cpu; | ||
6570 | struct loaded_vmcs *vmcs02; | ||
6571 | |||
6572 | if (!nested_vmx_check_permission(vcpu) || | ||
6573 | !nested_vmx_check_vmcs12(vcpu)) | ||
6574 | return 1; | ||
6575 | |||
6576 | skip_emulated_instruction(vcpu); | ||
6577 | vmcs12 = get_vmcs12(vcpu); | ||
6578 | |||
6579 | /* | ||
6580 | * The nested entry process starts with enforcing various prerequisites | ||
6581 | * on vmcs12 as required by the Intel SDM, and acting appropriately when | ||
6582 | * they fail: as the SDM explains, some conditions should cause the | ||
6583 | * instruction to fail, while others will cause the instruction to seem | ||
6584 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | ||
6585 | * To speed up the normal (success) code path, we should avoid checking | ||
6586 | * for misconfigurations which will anyway be caught by the processor | ||
6587 | * when using the merged vmcs02. | ||
6588 | */ | ||
6589 | if (vmcs12->launch_state == launch) { | ||
6590 | nested_vmx_failValid(vcpu, | ||
6591 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS | ||
6592 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | ||
6593 | return 1; | ||
6594 | } | ||
6595 | |||
6596 | if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && | ||
6597 | !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { | ||
6598 | /* TODO: also verify bits beyond physical address width are 0 */ | ||
6599 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6600 | return 1; | ||
6601 | } | ||
6602 | |||
6603 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && | ||
6604 | !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { | ||
6605 | /* TODO: also verify bits beyond physical address width are 0 */ | ||
6606 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6607 | return 1; | ||
6608 | } | ||
6609 | |||
6610 | if (vmcs12->vm_entry_msr_load_count > 0 || | ||
6611 | vmcs12->vm_exit_msr_load_count > 0 || | ||
6612 | vmcs12->vm_exit_msr_store_count > 0) { | ||
6613 | if (printk_ratelimit()) | ||
6614 | printk(KERN_WARNING | ||
6615 | "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); | ||
6616 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6617 | return 1; | ||
6618 | } | ||
6619 | |||
6620 | if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | ||
6621 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || | ||
6622 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | ||
6623 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || | ||
6624 | !vmx_control_verify(vmcs12->pin_based_vm_exec_control, | ||
6625 | nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || | ||
6626 | !vmx_control_verify(vmcs12->vm_exit_controls, | ||
6627 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || | ||
6628 | !vmx_control_verify(vmcs12->vm_entry_controls, | ||
6629 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) | ||
6630 | { | ||
6631 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6632 | return 1; | ||
6633 | } | ||
6634 | |||
6635 | if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | ||
6636 | ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | ||
6637 | nested_vmx_failValid(vcpu, | ||
6638 | VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); | ||
6639 | return 1; | ||
6640 | } | ||
6641 | |||
6642 | if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | ||
6643 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | ||
6644 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
6645 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); | ||
6646 | return 1; | ||
6647 | } | ||
6648 | if (vmcs12->vmcs_link_pointer != -1ull) { | ||
6649 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
6650 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); | ||
6651 | return 1; | ||
6652 | } | ||
6653 | |||
6654 | /* | ||
6655 | * We're finally done with prerequisite checking, and can start with | ||
6656 | * the nested entry. | ||
6657 | */ | ||
6658 | |||
6659 | vmcs02 = nested_get_current_vmcs02(vmx); | ||
6660 | if (!vmcs02) | ||
6661 | return -ENOMEM; | ||
6662 | |||
6663 | enter_guest_mode(vcpu); | ||
6664 | |||
6665 | vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); | ||
6666 | |||
6667 | cpu = get_cpu(); | ||
6668 | vmx->loaded_vmcs = vmcs02; | ||
6669 | vmx_vcpu_put(vcpu); | ||
6670 | vmx_vcpu_load(vcpu, cpu); | ||
6671 | vcpu->cpu = cpu; | ||
6672 | put_cpu(); | ||
6673 | |||
6674 | vmcs12->launch_state = 1; | ||
6675 | |||
6676 | prepare_vmcs02(vcpu, vmcs12); | ||
6677 | |||
6678 | /* | ||
6679 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | ||
6680 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | ||
6681 | * returned as far as L1 is concerned. It will only return (and set | ||
6682 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | ||
6683 | */ | ||
6684 | return 1; | ||
6685 | } | ||
6686 | |||
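The control-field checks in nested_vmx_run() compare each vmcs12 control word against the allowed-0/allowed-1 pairs that the nested VMX capability MSRs advertise to L1. A small sketch of what vmx_control_verify() presumably reduces to, where low holds the bits that must be 1 and high the bits that may be 1:

	static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
	{
		/*
		 * Every required bit must be set, and no bit outside the
		 * allowed set may be set.
		 */
		return ((control & high) | low) == control;
	}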
6687 | /* | ||
6688 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | ||
6689 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). | ||
6690 | * This function returns the new value we should put in vmcs12.guest_cr0. | ||
6691 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | ||
6692 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | ||
6693 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | ||
6694 | * didn't trap the bit, because if L1 did, so would L0). | ||
6695 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | ||
6696 | * been modified by L2, and L1 knows it. So just leave the old value of | ||
6697 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | ||
6698 | * isn't relevant, because if L0 traps this bit it can set it to anything. | ||
6699 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | ||
6700 | * changed these bits, and therefore they need to be updated, but L0 | ||
6701 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | ||
6702 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | ||
6703 | */ | ||
6704 | static inline unsigned long | ||
6705 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6706 | { | ||
6707 | return | ||
6708 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | ||
6709 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | ||
6710 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | ||
6711 | vcpu->arch.cr0_guest_owned_bits)); | ||
6712 | } | ||
6713 | |||
6714 | static inline unsigned long | ||
6715 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6716 | { | ||
6717 | return | ||
6718 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | ||
6719 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | ||
6720 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | ||
6721 | vcpu->arch.cr4_guest_owned_bits)); | ||
6722 | } | ||
6723 | |||
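To make the three-way split concrete, here is a purely illustrative example with made-up masks. Suppose the FPU is active, so L0 leaves CR0.TS to the guest, and L1's cr0_guest_host_mask traps only CR0.PG; cr0_guest_owned_bits is then just X86_CR0_TS, and each term of vmcs12_guest_cr0() supplies a different bit:

	/*
	 * Example masks (illustrative only):
	 *   vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS  (neither L0 nor L1 traps it)
	 *   vmcs12->cr0_guest_host_mask     = X86_CR0_PG  (L1 traps it)
	 *
	 *   X86_CR0_TS -> term 1: taken from hardware GUEST_CR0, L2 wrote it directly
	 *   X86_CR0_PG -> term 2: keep vmcs12->guest_cr0, L1 already knows this value
	 *   X86_CR0_NE -> term 3: L0 traps it but L1 does not, so take the value L2
	 *                 believes it set, which L0 kept in CR0_READ_SHADOW
	 */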
6724 | /* | ||
6725 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | ||
6726 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | ||
6727 | * and this function updates it to reflect the changes to the guest state while | ||
6728 | * L2 was running (and perhaps made some exits which were handled directly by L0 | ||
6729 | * without going back to L1), and to reflect the exit reason. | ||
6730 | * Note that we do not have to copy here all VMCS fields, just those that | ||
6731 | * could have changed by the L2 guest or the exit - i.e., the guest-state and | ||
6732 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | ||
6733 | * which already writes to vmcs12 directly. | ||
6734 | */ | ||
6735 | void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6736 | { | ||
6737 | /* update guest state fields: */ | ||
6738 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | ||
6739 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | ||
6740 | |||
6741 | kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); | ||
6742 | vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
6743 | vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
6744 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | ||
6745 | |||
6746 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | ||
6747 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | ||
6748 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | ||
6749 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | ||
6750 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | ||
6751 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | ||
6752 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | ||
6753 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
6754 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | ||
6755 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | ||
6756 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
6757 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | ||
6758 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | ||
6759 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | ||
6760 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | ||
6761 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | ||
6762 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
6763 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
6764 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | ||
6765 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | ||
6766 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | ||
6767 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); | ||
6768 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | ||
6769 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | ||
6770 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | ||
6771 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | ||
6772 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | ||
6773 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | ||
6774 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | ||
6775 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | ||
6776 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | ||
6777 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | ||
6778 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | ||
6779 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | ||
6780 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | ||
6781 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | ||
6782 | |||
6783 | vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); | ||
6784 | vmcs12->guest_interruptibility_info = | ||
6785 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
6786 | vmcs12->guest_pending_dbg_exceptions = | ||
6787 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | ||
6788 | |||
6789 | /* TODO: These cannot have changed unless we have MSR bitmaps and | ||
6790 | * the relevant bit asks not to trap the change */ | ||
6791 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
6792 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | ||
6793 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | ||
6794 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | ||
6795 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | ||
6796 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | ||
6797 | |||
6798 | /* update exit information fields: */ | ||
6799 | |||
6800 | vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
6801 | vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
6802 | |||
6803 | vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
6804 | vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
6805 | vmcs12->idt_vectoring_info_field = | ||
6806 | vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
6807 | vmcs12->idt_vectoring_error_code = | ||
6808 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
6809 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
6810 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
6811 | |||
6812 | /* clear vm-entry fields which are to be cleared on exit */ | ||
6813 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
6814 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | ||
6815 | } | ||
6816 | |||
6817 | /* | ||
6818 | * Part of what we need to do when the nested L2 guest exits and we want to | ||
6819 | * run its L1 parent is to reset L1's guest state to the host state specified | ||
6820 | * in vmcs12. | ||
6821 | * This function is to be called not only on normal nested exit, but also on | ||
6822 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | ||
6823 | * Failures During or After Loading Guest State"). | ||
6824 | * This function should be called when the active VMCS is L1's (vmcs01). | ||
6825 | */ | ||
6826 | void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6827 | { | ||
6828 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | ||
6829 | vcpu->arch.efer = vmcs12->host_ia32_efer; | ||
6830 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
6831 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
6832 | else | ||
6833 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
6834 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
6835 | |||
6836 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); | ||
6837 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); | ||
6838 | /* | ||
6839 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | ||
6840 | * actually changed, because it depends on the current state of | ||
6841 | * fpu_active (which may have changed). | ||
6842 | * Note that vmx_set_cr0 refers to efer set above. | ||
6843 | */ | ||
6844 | kvm_set_cr0(vcpu, vmcs12->host_cr0); | ||
6845 | /* | ||
6846 | * If we did fpu_activate()/fpu_deactivate() during L2's run, we need | ||
6847 | * to apply the same changes to L1's vmcs. We just set cr0 correctly, | ||
6848 | * but we also need to update cr0_guest_host_mask and exception_bitmap. | ||
6849 | */ | ||
6850 | update_exception_bitmap(vcpu); | ||
6851 | vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); | ||
6852 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
6853 | |||
6854 | /* | ||
6855 | * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 | ||
6856 | * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask(). | ||
6857 | */ | ||
6858 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
6859 | kvm_set_cr4(vcpu, vmcs12->host_cr4); | ||
6860 | |||
6861 | /* shadow page tables on either EPT or shadow page tables */ | ||
6862 | kvm_set_cr3(vcpu, vmcs12->host_cr3); | ||
6863 | kvm_mmu_reset_context(vcpu); | ||
6864 | |||
6865 | if (enable_vpid) { | ||
6866 | /* | ||
6867 | * Trivially support vpid by letting L2s share their parent | ||
6868 | * L1's vpid. TODO: move to a more elaborate solution, giving | ||
6869 | * each L2 its own vpid and exposing the vpid feature to L1. | ||
6870 | */ | ||
6871 | vmx_flush_tlb(vcpu); | ||
6872 | } | ||
6873 | |||
6874 | |||
6875 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | ||
6876 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | ||
6877 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | ||
6878 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | ||
6879 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | ||
6880 | vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); | ||
6881 | vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); | ||
6882 | vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); | ||
6883 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); | ||
6884 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); | ||
6885 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); | ||
6886 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); | ||
6887 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); | ||
6888 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); | ||
6889 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); | ||
6890 | |||
6891 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) | ||
6892 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | ||
6893 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
6894 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | ||
6895 | vmcs12->host_ia32_perf_global_ctrl); | ||
6896 | } | ||
6897 | |||
6898 | /* | ||
6899 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 | ||
6900 | * and modify vmcs12 to make it see what it would expect to see there if | ||
6901 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | ||
6902 | */ | ||
6903 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) | ||
6904 | { | ||
6905 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6906 | int cpu; | ||
6907 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
6908 | |||
6909 | leave_guest_mode(vcpu); | ||
6910 | prepare_vmcs12(vcpu, vmcs12); | ||
6911 | |||
6912 | cpu = get_cpu(); | ||
6913 | vmx->loaded_vmcs = &vmx->vmcs01; | ||
6914 | vmx_vcpu_put(vcpu); | ||
6915 | vmx_vcpu_load(vcpu, cpu); | ||
6916 | vcpu->cpu = cpu; | ||
6917 | put_cpu(); | ||
6918 | |||
6919 | /* if no vmcs02 cache requested, remove the one we used */ | ||
6920 | if (VMCS02_POOL_SIZE == 0) | ||
6921 | nested_free_vmcs02(vmx, vmx->nested.current_vmptr); | ||
6922 | |||
6923 | load_vmcs12_host_state(vcpu, vmcs12); | ||
6924 | |||
6925 | /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ | ||
6926 | vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); | ||
6927 | |||
6928 | /* This is needed for the same reason as it was in prepare_vmcs02 */ | ||
6929 | vmx->host_rsp = 0; | ||
6930 | |||
6931 | /* Unpin physical memory we referred to in vmcs02 */ | ||
6932 | if (vmx->nested.apic_access_page) { | ||
6933 | nested_release_page(vmx->nested.apic_access_page); | ||
6934 | vmx->nested.apic_access_page = NULL; | ||
6935 | } | ||
6936 | |||
6937 | /* | ||
6938 | * Exiting from L2 to L1, we're now back to L1 which thinks it just | ||
6939 | * finished a VMLAUNCH or VMRESUME instruction, so we need to set the | ||
6940 | * success or failure flag accordingly. | ||
6941 | */ | ||
6942 | if (unlikely(vmx->fail)) { | ||
6943 | vmx->fail = 0; | ||
6944 | nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
6945 | } else | ||
6946 | nested_vmx_succeed(vcpu); | ||
6947 | } | ||
6948 | |||
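The succeed/fail helpers used at the end of nested_vmx_vmexit() follow the architectural VMsucceed/VMfailValid convention: success clears all six arithmetic flags in RFLAGS, while VMfailValid sets ZF and makes the error number available through the VM-instruction error field. A sketch of nested_vmx_succeed() under that reading, assuming the vmx_get_rflags()/vmx_set_rflags() accessors already present in this file (the VMfailValid counterpart would additionally store the error code into vmcs12):

	static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
	{
		/* VMsucceed: CF = PF = AF = ZF = SF = OF = 0 */
		vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	}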
6949 | /* | ||
6950 | * L1's failure to enter L2 is a subset of a normal exit, as explained in | ||
6951 | * 23.7 "VM-entry failures during or after loading guest state" (this also | ||
6952 | * lists the acceptable exit-reason and exit-qualification parameters). | ||
6953 | * It should only be called before L2 has actually managed to run, and when | ||
6954 | * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs). | ||
6955 | */ | ||
6956 | static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, | ||
6957 | struct vmcs12 *vmcs12, | ||
6958 | u32 reason, unsigned long qualification) | ||
6959 | { | ||
6960 | load_vmcs12_host_state(vcpu, vmcs12); | ||
6961 | vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; | ||
6962 | vmcs12->exit_qualification = qualification; | ||
6963 | nested_vmx_succeed(vcpu); | ||
4515 | } | 6964 | } |
4516 | 6965 | ||
4517 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, | 6966 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, |
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void) | |||
4670 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | 7119 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); |
4671 | 7120 | ||
4672 | if (enable_ept) { | 7121 | if (enable_ept) { |
4673 | bypass_guest_pf = 0; | ||
4674 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 7122 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
4675 | VMX_EPT_EXECUTABLE_MASK); | 7123 | VMX_EPT_EXECUTABLE_MASK); |
7124 | ept_set_mmio_spte_mask(); | ||
4676 | kvm_enable_tdp(); | 7125 | kvm_enable_tdp(); |
4677 | } else | 7126 | } else |
4678 | kvm_disable_tdp(); | 7127 | kvm_disable_tdp(); |
4679 | 7128 | ||
4680 | if (bypass_guest_pf) | ||
4681 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
4682 | |||
4683 | return 0; | 7129 | return 0; |
4684 | 7130 | ||
4685 | out3: | 7131 | out3: |