diff options
author | Zhai, Edwin <edwin.zhai@intel.com> | 2009-10-09 06:03:20 -0400 |
---|---|---|
committer | Avi Kivity <avi@redhat.com> | 2009-12-03 02:32:17 -0500 |
commit | 4b8d54f9726f1159330201c5ed2ea30bce7e63ea (patch) | |
tree | 903eaf21f46359f3e42a6d4582ca792b73d4b7b9 /arch/x86/kvm | |
parent | d255f4f2bac81eb798fcf76938147f1f6c756ae2 (diff) |
KVM: VMX: Add support for Pause-Loop Exiting
New NHM processors will support Pause-Loop Exiting by adding 2 VM-execution
control fields:
PLE_Gap - upper bound on the amount of time between two successive
executions of PAUSE in a loop.
PLE_Window - upper bound on the amount of time a guest is allowed to execute in
a PAUSE loop
If the time between this execution of PAUSE and the previous one exceeds the
PLE_Gap, the processor considers this PAUSE to belong to a new loop.
Otherwise, the processor determines the total execution time of this loop (since
the 1st PAUSE in this loop), and triggers a VM exit if the total time exceeds the
PLE_Window.
* Refer SDM volume 3b section 21.6.13 & 22.1.3.
Pause-Loop Exiting can be used to detect Lock-Holder Preemption, where one VP
is sched-out after holding a spinlock, and then other VPs contending for the same
lock are sched-in, wasting CPU time.
Our tests indicate that most spinlocks are held for less than 2^12 cycles.
Performance tests show that with 2X LP over-commitment we can get +2% perf
improvement for kernel build(Even more perf gain with more LPs).
Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/vmx.c | 51 |
1 files changed, 50 insertions, 1 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 70020e505c2..a4580d65af5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -61,6 +61,25 @@ module_param_named(unrestricted_guest, | |||
61 | static int __read_mostly emulate_invalid_guest_state = 0; | 61 | static int __read_mostly emulate_invalid_guest_state = 0; |
62 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | 62 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
63 | 63 | ||
64 | /* | ||
65 | * These 2 parameters are used to config the controls for Pause-Loop Exiting: | ||
66 | * ple_gap: upper bound on the amount of time between two successive | ||
67 | * executions of PAUSE in a loop. Also indicate if ple enabled. | ||
68 | * According to test, this time is usually small than 41 cycles. | ||
69 | * ple_window: upper bound on the amount of time a guest is allowed to execute | ||
70 | * in a PAUSE loop. Tests indicate that most spinlocks are held for | ||
71 | * less than 2^12 cycles | ||
72 | * Time is measured based on a counter that runs at the same rate as the TSC, | ||
73 | * refer SDM volume 3b section 21.6.13 & 22.1.3. | ||
74 | */ | ||
75 | #define KVM_VMX_DEFAULT_PLE_GAP 41 | ||
76 | #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 | ||
77 | static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; | ||
78 | module_param(ple_gap, int, S_IRUGO); | ||
79 | |||
80 | static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | ||
81 | module_param(ple_window, int, S_IRUGO); | ||
82 | |||
64 | struct vmcs { | 83 | struct vmcs { |
65 | u32 revision_id; | 84 | u32 revision_id; |
66 | u32 abort; | 85 | u32 abort; |
@@ -319,6 +338,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void) | |||
319 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | 338 | SECONDARY_EXEC_UNRESTRICTED_GUEST; |
320 | } | 339 | } |
321 | 340 | ||
341 | static inline int cpu_has_vmx_ple(void) | ||
342 | { | ||
343 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
344 | SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
345 | } | ||
346 | |||
322 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | 347 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) |
323 | { | 348 | { |
324 | return flexpriority_enabled && | 349 | return flexpriority_enabled && |
@@ -1240,7 +1265,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1240 | SECONDARY_EXEC_WBINVD_EXITING | | 1265 | SECONDARY_EXEC_WBINVD_EXITING | |
1241 | SECONDARY_EXEC_ENABLE_VPID | | 1266 | SECONDARY_EXEC_ENABLE_VPID | |
1242 | SECONDARY_EXEC_ENABLE_EPT | | 1267 | SECONDARY_EXEC_ENABLE_EPT | |
1243 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | 1268 | SECONDARY_EXEC_UNRESTRICTED_GUEST | |
1269 | SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
1244 | if (adjust_vmx_controls(min2, opt2, | 1270 | if (adjust_vmx_controls(min2, opt2, |
1245 | MSR_IA32_VMX_PROCBASED_CTLS2, | 1271 | MSR_IA32_VMX_PROCBASED_CTLS2, |
1246 | &_cpu_based_2nd_exec_control) < 0) | 1272 | &_cpu_based_2nd_exec_control) < 0) |
@@ -1386,6 +1412,9 @@ static __init int hardware_setup(void) | |||
1386 | if (enable_ept && !cpu_has_vmx_ept_2m_page()) | 1412 | if (enable_ept && !cpu_has_vmx_ept_2m_page()) |
1387 | kvm_disable_largepages(); | 1413 | kvm_disable_largepages(); |
1388 | 1414 | ||
1415 | if (!cpu_has_vmx_ple()) | ||
1416 | ple_gap = 0; | ||
1417 | |||
1389 | return alloc_kvm_area(); | 1418 | return alloc_kvm_area(); |
1390 | } | 1419 | } |
1391 | 1420 | ||
@@ -2298,9 +2327,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2298 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | 2327 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; |
2299 | if (!enable_unrestricted_guest) | 2328 | if (!enable_unrestricted_guest) |
2300 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | 2329 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; |
2330 | if (!ple_gap) | ||
2331 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
2301 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 2332 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
2302 | } | 2333 | } |
2303 | 2334 | ||
2335 | if (ple_gap) { | ||
2336 | vmcs_write32(PLE_GAP, ple_gap); | ||
2337 | vmcs_write32(PLE_WINDOW, ple_window); | ||
2338 | } | ||
2339 | |||
2304 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | 2340 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); |
2305 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | 2341 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); |
2306 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 2342 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
@@ -3348,6 +3384,18 @@ out: | |||
3348 | } | 3384 | } |
3349 | 3385 | ||
3350 | /* | 3386 | /* |
3387 | * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE | ||
3388 | * exiting, so only get here on cpu with PAUSE-Loop-Exiting. | ||
3389 | */ | ||
3390 | static int handle_pause(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
3391 | { | ||
3392 | skip_emulated_instruction(vcpu); | ||
3393 | kvm_vcpu_on_spin(vcpu); | ||
3394 | |||
3395 | return 1; | ||
3396 | } | ||
3397 | |||
3398 | /* | ||
3351 | * The exit handlers return 1 if the exit was handled fully and guest execution | 3399 | * The exit handlers return 1 if the exit was handled fully and guest execution |
3352 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 3400 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
3353 | * to be done to userspace and return 0. | 3401 | * to be done to userspace and return 0. |
@@ -3383,6 +3431,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3383 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | 3431 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, |
3384 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | 3432 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, |
3385 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, | 3433 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, |
3434 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, | ||
3386 | }; | 3435 | }; |
3387 | 3436 | ||
3388 | static const int kvm_vmx_max_exit_handlers = | 3437 | static const int kvm_vmx_max_exit_handlers = |