author	Rik van Riel <riel@redhat.com>	2017-11-14 16:54:23 -0500
committer	Radim Krčmář <rkrcmar@redhat.com>	2017-12-05 15:16:43 -0500
commit	f775b13eedee2f7f3c6fdd4e90fb79090ce5d339 (patch)
tree	944d1c4bdb8ffaa828b694bc1edea935dd667ffa
parent	609b7002705ae72a6ca45b633b7ff1a09a7a0d86 (diff)
x86,kvm: move qemu/guest FPU switching out to vcpu_run
Currently, every time a VCPU is scheduled out, the host kernel will first save the guest FPU/xstate context, then load the qemu userspace FPU context, only to then immediately save the qemu userspace FPU context back to memory. When scheduling in a VCPU, the same extraneous FPU loads and saves are done.

This could be avoided by moving from a model where the guest FPU is loaded and stored with preemption disabled, to a model where the qemu userspace FPU is swapped out for the guest FPU context for the duration of the KVM_RUN ioctl. This is done under the VCPU mutex, which is also taken when other tasks inspect the VCPU FPU context, so the code should already be safe for this change. That should come as no surprise, given that s390 already has this optimization.

This can fix a bug where KVM calls get_user_pages while owning the FPU, and the file system ends up requesting the FPU again:

[258270.527947]  __warn+0xcb/0xf0
[258270.527948]  warn_slowpath_null+0x1d/0x20
[258270.527951]  kernel_fpu_disable+0x3f/0x50
[258270.527953]  __kernel_fpu_begin+0x49/0x100
[258270.527955]  kernel_fpu_begin+0xe/0x10
[258270.527958]  crc32c_pcl_intel_update+0x84/0xb0
[258270.527961]  crypto_shash_update+0x3f/0x110
[258270.527968]  crc32c+0x63/0x8a [libcrc32c]
[258270.527975]  dm_bm_checksum+0x1b/0x20 [dm_persistent_data]
[258270.527978]  node_prepare_for_write+0x44/0x70 [dm_persistent_data]
[258270.527985]  dm_block_manager_write_callback+0x41/0x50 [dm_persistent_data]
[258270.527988]  submit_io+0x170/0x1b0 [dm_bufio]
[258270.527992]  __write_dirty_buffer+0x89/0x90 [dm_bufio]
[258270.527994]  __make_buffer_clean+0x4f/0x80 [dm_bufio]
[258270.527996]  __try_evict_buffer+0x42/0x60 [dm_bufio]
[258270.527998]  dm_bufio_shrink_scan+0xc0/0x130 [dm_bufio]
[258270.528002]  shrink_slab.part.40+0x1f5/0x420
[258270.528004]  shrink_node+0x22c/0x320
[258270.528006]  do_try_to_free_pages+0xf5/0x330
[258270.528008]  try_to_free_pages+0xe9/0x190
[258270.528009]  __alloc_pages_slowpath+0x40f/0xba0
[258270.528011]  __alloc_pages_nodemask+0x209/0x260
[258270.528014]  alloc_pages_vma+0x1f1/0x250
[258270.528017]  do_huge_pmd_anonymous_page+0x123/0x660
[258270.528021]  handle_mm_fault+0xfd3/0x1330
[258270.528025]  __get_user_pages+0x113/0x640
[258270.528027]  get_user_pages+0x4f/0x60
[258270.528063]  __gfn_to_pfn_memslot+0x120/0x3f0 [kvm]
[258270.528108]  try_async_pf+0x66/0x230 [kvm]
[258270.528135]  tdp_page_fault+0x130/0x280 [kvm]
[258270.528149]  kvm_mmu_page_fault+0x60/0x120 [kvm]
[258270.528158]  handle_ept_violation+0x91/0x170 [kvm_intel]
[258270.528162]  vmx_handle_exit+0x1ca/0x1400 [kvm_intel]

No performance changes were detected in quick ping-pong tests on my 4 socket system, which is expected since an FPU+xstate load is on the order of 0.1us, while ping-ponging between CPUs is on the order of 20us, and somewhat noisy.

Cc: stable@vger.kernel.org
Signed-off-by: Rik van Riel <riel@redhat.com>
Suggested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Fixed a bug where reset_vcpu called put_fpu without a preceding load_fpu, which happened from inside the KVM_CREATE_VCPU ioctl. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
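In outline, the patch replaces the per-preemption FPU swap with a single swap around the whole run loop: kvm_load_guest_fpu() saves the qemu user FPU state into user_fpu and loads guest_fpu once when KVM_RUN starts, and kvm_put_guest_fpu() does the reverse once when it returns. A minimal sketch of the resulting kvm_arch_vcpu_ioctl_run() flow (simplified; setup, error paths and the out/out_fpu labels are elided, see the full diff below):

	/* Sketch only, not the verbatim kernel code. */
	int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
	{
		int r;

		/* ... sigset and mp_state handling elided ... */

		kvm_load_guest_fpu(vcpu);	/* save user_fpu, load guest_fpu */

		/*
		 * The thread may be scheduled out and back in many times inside
		 * vcpu_run(); no FPU swap happens on those context switches any more.
		 */
		r = vcpu_run(vcpu);

		kvm_put_guest_fpu(vcpu);	/* save guest_fpu, restore user_fpu */

		/* ... post_kvm_run_save() and cleanup elided ... */
		return r;
	}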
-rw-r--r--	arch/x86/include/asm/kvm_host.h	13
-rw-r--r--	arch/x86/kvm/x86.c	39
-rw-r--r--	include/linux/kvm_host.h	2
3 files changed, 31 insertions, 23 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 977de5fb968b..62527e053ee4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,7 +536,20 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	/*
+	 * QEMU userspace and the guest each have their own FPU state.
+	 * In vcpu_run, we switch between the user and guest FPU contexts.
+	 * While running a VCPU, the VCPU thread will have the guest FPU
+	 * context.
+	 *
+	 * Note that while the PKRU state lives inside the fpu registers,
+	 * it is switched out separately at VMENTER and VMEXIT time. The
+	 * "guest_fpu" state here contains the guest FPU context, with the
+	 * host PRKU bits.
+	 */
+	struct fpu user_fpu;
 	struct fpu guest_fpu;
+
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u32 guest_xstate_size;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eee8e7faf1af..c8da1680a7d6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2937,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	pagefault_enable();
 	kvm_x86_ops->vcpu_put(vcpu);
-	kvm_put_guest_fpu(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 }
 
@@ -5254,13 +5253,10 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 
 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_disable();
-	kvm_load_guest_fpu(emul_to_vcpu(ctxt));
 }
 
 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_enable();
 }
 
 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
@@ -6952,7 +6948,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-	kvm_load_guest_fpu(vcpu);
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -7297,12 +7292,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
+	kvm_load_guest_fpu(vcpu);
+
 	if (unlikely(vcpu->arch.complete_userspace_io)) {
 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
 		vcpu->arch.complete_userspace_io = NULL;
 		r = cui(vcpu);
 		if (r <= 0)
-			goto out;
+			goto out_fpu;
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
@@ -7311,6 +7308,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		r = vcpu_run(vcpu);
 
+out_fpu:
+	kvm_put_guest_fpu(vcpu);
 out:
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
@@ -7704,32 +7703,25 @@ static void fx_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
+/* Swap (qemu) user FPU context for the guest FPU context. */
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->guest_fpu_loaded)
-		return;
-
-	/*
-	 * Restore all possible states in the guest,
-	 * and assume host would use all available bits.
-	 * Guest xcr0 would be loaded later.
-	 */
-	vcpu->guest_fpu_loaded = 1;
-	__kernel_fpu_begin();
+	preempt_disable();
+	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
 	/* PKRU is separately restored in kvm_x86_ops->run. */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
 				~XFEATURE_MASK_PKRU);
+	preempt_enable();
 	trace_kvm_fpu(1);
 }
 
+/* When vcpu_run ends, restore user space FPU context. */
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->guest_fpu_loaded)
-		return;
-
-	vcpu->guest_fpu_loaded = 0;
+	preempt_disable();
 	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
-	__kernel_fpu_end();
+	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+	preempt_enable();
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
@@ -7846,7 +7838,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		 * To avoid have the INIT path from kvm_apic_has_events() that be
 		 * called with loaded FPU and does not let userspace fix the state.
 		 */
-		kvm_put_guest_fpu(vcpu);
+		if (init_event)
+			kvm_put_guest_fpu(vcpu);
 		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
 					XFEATURE_MASK_BNDREGS);
 		if (mpx_state_buffer)
@@ -7855,6 +7848,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 					XFEATURE_MASK_BNDCSR);
 		if (mpx_state_buffer)
 			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+		if (init_event)
+			kvm_load_guest_fpu(vcpu);
 	}
 
 	if (!init_event) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 893d6d606cd0..6bdd4b9f6611 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -232,7 +232,7 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
-	int guest_fpu_loaded, guest_xcr0_loaded;
+	int guest_xcr0_loaded;
 	struct swait_queue_head wq;
 	struct pid __rcu *pid;
 	int sigset_active;