aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kvm
diff options
context:
space:
mode:
authorSuraj Jitindar Singh <sjitindarsingh@gmail.com>2016-08-02 00:03:21 -0400
committerPaul Mackerras <paulus@ozlabs.org>2016-09-07 22:21:45 -0400
commit0cda69dd7cd64fdd54bdf584b5d6ba53767ba422 (patch)
tree0628a38e0f0035b7cb262a9308a909f580ba6f71 /arch/powerpc/kvm
parent7b5f8272c792d49da73d98e9ca32f4cbb6d53808 (diff)
KVM: PPC: Book3S HV: Implement halt polling
This patch introduces new halt polling functionality into the kvm_hv kernel module. When a vcore is idle it will poll for some period of time before scheduling itself out. When all of the runnable vcpus on a vcore have ceded (and thus the vcore is idle) we schedule ourselves out to allow something else to run. In the event that we need to wake up very quickly (for example an interrupt arrives), we are required to wait until we get scheduled again. Implement halt polling so that when a vcore is idle, and before scheduling ourselves, we poll for vcpus in the runnable_threads list which have pending exceptions or which leave the ceded state. If we poll successfully then we can get back into the guest very quickly without ever scheduling ourselves, otherwise we schedule ourselves out as before. There exists generic halt_polling code in virt/kvm_main.c, however on powerpc the polling conditions are different to the generic case. It would be nice if we could just implement an arch specific kvm_check_block() function, but there are still other arch-specific things which need to be done for kvm_hv (for example manipulating vcore states) which means that a separate implementation is the best option. Testing of this patch with a TCP round robin test between two guests with virtio network interfaces has found a decrease in round trip time of ~15us on average. A performance gain is only seen when going out of and back into the guest often and quickly, otherwise there is no net benefit from the polling. The polling interval is adjusted such that when we are often scheduled out for long periods of time it is reduced, and when we often poll successfully it is increased. The rate at which the polling interval increases or decreases, and the maximum polling interval, can be set through module parameters. Based on the implementation in the generic kvm module by Wanpeng Li and Paolo Bonzini, and on direction from Paul Mackerras.
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r--arch/powerpc/kvm/book3s_hv.c116
-rw-r--r--arch/powerpc/kvm/trace_hv.h22
2 files changed, 124 insertions, 14 deletions
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ebbab1b2206c..3c85c3b28fc5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -95,6 +95,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
95MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); 95MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
96#endif 96#endif
97 97
98/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
99static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
100module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
101MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
102
103/* Factor by which the vcore halt poll interval is grown, default is to double
104 */
105static unsigned int halt_poll_ns_grow = 2;
106module_param(halt_poll_ns_grow, int, S_IRUGO);
107MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
108
109/* Factor by which the vcore halt poll interval is shrunk, default is to reset
110 */
111static unsigned int halt_poll_ns_shrink;
112module_param(halt_poll_ns_shrink, int, S_IRUGO);
113MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
114
98static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 115static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
99static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 116static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
100 117
@@ -2621,32 +2638,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
2621 finish_wait(&vcpu->arch.cpu_run, &wait); 2638 finish_wait(&vcpu->arch.cpu_run, &wait);
2622} 2639}
2623 2640
2641static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
2642{
2643 /* 10us base */
2644 if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
2645 vc->halt_poll_ns = 10000;
2646 else
2647 vc->halt_poll_ns *= halt_poll_ns_grow;
2648
2649 if (vc->halt_poll_ns > halt_poll_max_ns)
2650 vc->halt_poll_ns = halt_poll_max_ns;
2651}
2652
2653static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
2654{
2655 if (halt_poll_ns_shrink == 0)
2656 vc->halt_poll_ns = 0;
2657 else
2658 vc->halt_poll_ns /= halt_poll_ns_shrink;
2659}
2660
2661/* Check to see if any of the runnable vcpus on the vcore have pending
2662 * exceptions or are no longer ceded
2663 */
2664static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
2665{
2666 struct kvm_vcpu *vcpu;
2667 int i;
2668
2669 for_each_runnable_thread(i, vcpu, vc) {
2670 if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
2671 return 1;
2672 }
2673
2674 return 0;
2675}
2676
2624/* 2677/*
2625 * All the vcpus in this vcore are idle, so wait for a decrementer 2678 * All the vcpus in this vcore are idle, so wait for a decrementer
2626 * or external interrupt to one of the vcpus. vc->lock is held. 2679 * or external interrupt to one of the vcpus. vc->lock is held.
2627 */ 2680 */
2628static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) 2681static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2629{ 2682{
2630 struct kvm_vcpu *vcpu; 2683 int do_sleep = 1;
2631 int do_sleep = 1, i; 2684 ktime_t cur, start;
2685 u64 block_ns;
2632 DECLARE_SWAITQUEUE(wait); 2686 DECLARE_SWAITQUEUE(wait);
2633 2687
2634 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 2688 /* Poll for pending exceptions and ceded state */
2689 cur = start = ktime_get();
2690 if (vc->halt_poll_ns) {
2691 ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
2635 2692
2636 /* 2693 vc->vcore_state = VCORE_POLLING;
2637 * Check one last time for pending exceptions and ceded state after 2694 spin_unlock(&vc->lock);
2638 * we put ourselves on the wait queue 2695
2639 */ 2696 do {
2640 for_each_runnable_thread(i, vcpu, vc) { 2697 if (kvmppc_vcore_check_block(vc)) {
2641 if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { 2698 do_sleep = 0;
2642 do_sleep = 0; 2699 break;
2643 break; 2700 }
2644 } 2701 cur = ktime_get();
2702 } while (single_task_running() && ktime_before(cur, stop));
2703
2704 spin_lock(&vc->lock);
2705 vc->vcore_state = VCORE_INACTIVE;
2706
2707 if (!do_sleep)
2708 goto out;
2645 } 2709 }
2646 2710
2647 if (!do_sleep) { 2711 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2712
2713 if (kvmppc_vcore_check_block(vc)) {
2648 finish_swait(&vc->wq, &wait); 2714 finish_swait(&vc->wq, &wait);
2649 return; 2715 do_sleep = 0;
2716 goto out;
2650 } 2717 }
2651 2718
2652 vc->vcore_state = VCORE_SLEEPING; 2719 vc->vcore_state = VCORE_SLEEPING;
@@ -2657,6 +2724,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2657 spin_lock(&vc->lock); 2724 spin_lock(&vc->lock);
2658 vc->vcore_state = VCORE_INACTIVE; 2725 vc->vcore_state = VCORE_INACTIVE;
2659 trace_kvmppc_vcore_blocked(vc, 1); 2726 trace_kvmppc_vcore_blocked(vc, 1);
2727
2728 cur = ktime_get();
2729
2730out:
2731 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
2732
2733 /* Adjust poll time */
2734 if (halt_poll_max_ns) {
2735 if (block_ns <= vc->halt_poll_ns)
2736 ;
2737 /* We slept and blocked for longer than the max halt time */
2738 else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
2739 shrink_halt_poll_ns(vc);
2740 /* We slept and our poll time is too small */
2741 else if (vc->halt_poll_ns < halt_poll_max_ns &&
2742 block_ns < halt_poll_max_ns)
2743 grow_halt_poll_ns(vc);
2744 } else
2745 vc->halt_poll_ns = 0;
2746
2747 trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
2660} 2748}
2661 2749
2662static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2750static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 33d9daff5783..fb21990c0fb4 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
432 __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) 432 __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
433); 433);
434 434
435TRACE_EVENT(kvmppc_vcore_wakeup,
436 TP_PROTO(int do_sleep, __u64 ns),
437
438 TP_ARGS(do_sleep, ns),
439
440 TP_STRUCT__entry(
441 __field(__u64, ns)
442 __field(int, waited)
443 __field(pid_t, tgid)
444 ),
445
446 TP_fast_assign(
447 __entry->ns = ns;
448 __entry->waited = do_sleep;
449 __entry->tgid = current->tgid;
450 ),
451
452 TP_printk("%s time %lld ns, tgid=%d",
453 __entry->waited ? "wait" : "poll",
454 __entry->ns, __entry->tgid)
455);
456
435TRACE_EVENT(kvmppc_run_vcpu_enter, 457TRACE_EVENT(kvmppc_run_vcpu_enter,
436 TP_PROTO(struct kvm_vcpu *vcpu), 458 TP_PROTO(struct kvm_vcpu *vcpu),
437 459