From 7103f60de8bed21a0ad5d15d2ad5b7a333dda201 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 19 Aug 2014 16:45:56 +0200 Subject: KVM: avoid unnecessary synchronize_rcu We dont have to wait for a grace period if there is no oldpid that we are going to free. putpid also checks for NULL, so this patch only fences synchronize_rcu. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 33712fb26eb1..39b16035386f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu) struct pid *oldpid = vcpu->pid; struct pid *newpid = get_task_pid(current, PIDTYPE_PID); rcu_assign_pointer(vcpu->pid, newpid); - synchronize_rcu(); + if (oldpid) + synchronize_rcu(); put_pid(oldpid); } cpu = get_cpu(); -- cgit v1.2.2 From e790d9ef6405633b007339d746b709aed43a928d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 21 Aug 2014 18:08:05 +0200 Subject: KVM: add kvm_arch_sched_in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce preempt notifiers for architecture specific code. Advantage over creating a new notifier in every arch is slightly simpler code and guaranteed call order with respect to kvm_sched_in. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 39b16035386f..5a0817ee996e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3124,6 +3124,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) if (vcpu->preempted) vcpu->preempted = false; + kvm_arch_sched_in(vcpu, cpu); + kvm_arch_vcpu_load(vcpu, cpu); } -- cgit v1.2.2 From 64d831269ccbca1fc6d739a0f3c8aa24afb43a5e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 19 Aug 2014 12:15:00 +0200 Subject: KVM: Introduce gfn_to_hva_memslot_prot To support read-only memory regions on arm and arm64, we have a need to resolve a gfn to an hva given a pointer to a memslot to avoid looping through the memslots twice and to reuse the hva error checking of gfn_to_hva_prot(), add a new gfn_to_hva_memslot_prot() function and refactor gfn_to_hva_prot() to use this function. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/kvm_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a0817ee996e..76c92a7249c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1076,9 +1076,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva); * If writable is set to false, the hva returned by this function is only * allowed to be read. */ -unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, + gfn_t gfn, bool *writable) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) @@ -1087,6 +1087,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) return hva; } +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + + return gfn_to_hva_memslot_prot(slot, gfn, writable); +} + static int kvm_read_hva(void *data, void __user *hva, int len) { return __copy_from_user(data, hva, len); -- cgit v1.2.2 From 0f8a4de3e088797576ac76200b634b802e5c7781 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 26 Aug 2014 14:00:37 +0200 Subject: KVM: Unconditionally export KVM_CAP_READONLY_MEM The idea between capabilities and the KVM_CHECK_EXTENSION ioctl is that userspace can, at run-time, determine if a feature is supported or not. This allows KVM to being supporting a new feature with a new kernel version without any need to update user space. Unfortunately, since the definition of KVM_CAP_READONLY_MEM was guarded by #ifdef __KVM_HAVE_READONLY_MEM, such discovery still required a user space update. Therefore, unconditionally export KVM_CAP_READONLY_MEM and change the in-kernel conditional to rely on __KVM_HAVE_READONLY_MEM. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a0817ee996e..1d03967def40 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -708,7 +708,7 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; -#ifdef KVM_CAP_READONLY_MEM +#ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif -- cgit v1.2.2 From 13a34e067eab24fec882e1834fbf2cc31911d474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 28 Aug 2014 15:13:03 +0200 Subject: KVM: remove garbage arg to *hardware_{en,dis}able MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the beggining was on_each_cpu(), which required an unused argument to kvm_arch_ops.hardware_{en,dis}able, but this was soon forgotten. Remove unnecessary arguments that stem from this. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1d03967def40..7176929a4cda 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2669,7 +2669,7 @@ static void hardware_enable_nolock(void *junk) cpumask_set_cpu(cpu, cpus_hardware_enabled); - r = kvm_arch_hardware_enable(NULL); + r = kvm_arch_hardware_enable(); if (r) { cpumask_clear_cpu(cpu, cpus_hardware_enabled); @@ -2694,7 +2694,7 @@ static void hardware_disable_nolock(void *junk) if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; cpumask_clear_cpu(cpu, cpus_hardware_enabled); - kvm_arch_hardware_disable(NULL); + kvm_arch_hardware_disable(); } static void hardware_disable(void) -- cgit v1.2.2 From 00f034a12fdd81210d58116326d92780aac5c238 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Aug 2014 14:29:21 +0200 Subject: KVM: do not bias the generation number in kvm_current_mmio_generation The next patch will give a meaning (a la seqcount) to the low bit of the generation number. Ensure that it matches between kvm->memslots->generation and kvm_current_mmio_generation(). Cc: stable@vger.kernel.org Reviewed-by: David Matlack Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7176929a4cda..0bfdb673db26 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -477,6 +477,13 @@ static struct kvm *kvm_create_vm(unsigned long type) kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) goto out_err_no_srcu; + + /* + * Init kvm generation close to the maximum to easily test the + * code of handling generation number wrap-around. + */ + kvm->memslots->generation = -150; + kvm_init_memslots_id(kvm); if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; -- cgit v1.2.2 From ee3d1570b58677885b4552bce8217fda7b226a68 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 18 Aug 2014 15:46:06 -0700 Subject: kvm: fix potentially corrupt mmio cache vcpu exits and memslot mutations can run concurrently as long as the vcpu does not aquire the slots mutex. Thus it is theoretically possible for memslots to change underneath a vcpu that is handling an exit. If we increment the memslot generation number again after synchronize_srcu_expedited(), vcpus can safely cache memslot generation without maintaining a single rcu_dereference through an entire vm exit. And much of the x86/kvm code does not maintain a single rcu_dereference of the current memslots during each exit. We can prevent the following case: vcpu (CPU 0) | thread (CPU 1) --------------------------------------------+-------------------------- 1 vm exit | 2 srcu_read_unlock(&kvm->srcu) | 3 decide to cache something based on | old memslots | 4 | change memslots | (increments generation) 5 | synchronize_srcu(&kvm->srcu); 6 retrieve generation # from new memslots | 7 tag cache with new memslot generation | 8 srcu_read_unlock(&kvm->srcu) | ... | | ... | | | By incrementing the generation after synchronizing with kvm->srcu readers, we ensure that the generation retrieved in (6) will become invalid soon after (8). Keeping the existing increment is not strictly necessary, but we do keep it and just move it for consistency from update_memslots to install_new_memslots. It invalidates old cached MMIOs immediately, instead of having to wait for the end of synchronize_srcu_expedited, which makes the code more clearly correct in case CPU 1 is preempted right after synchronize_srcu() returns. To avoid halving the generation space in SPTEs, always presume that the low bit of the generation is zero when reconstructing a generation number out of an SPTE. This effectively disables MMIO caching in SPTEs during the call to synchronize_srcu_expedited. Using the low bit this way is somewhat like a seqcount---where the protected thing is a cache, and instead of retrying we can simply punt if we observe the low bit to be 1. Cc: stable@vger.kernel.org Signed-off-by: David Matlack Reviewed-by: Xiao Guangrong Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0bfdb673db26..bb8641b5d83b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -95,8 +95,6 @@ static int hardware_enable_all(void); static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, u64 last_generation); static void kvm_release_pfn_dirty(pfn_t pfn); static void mark_page_dirty_in_slot(struct kvm *kvm, @@ -695,8 +693,7 @@ static void sort_memslots(struct kvm_memslots *slots) } static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, - u64 last_generation) + struct kvm_memory_slot *new) { if (new) { int id = new->id; @@ -707,8 +704,6 @@ static void update_memslots(struct kvm_memslots *slots, if (new->npages != npages) sort_memslots(slots); } - - slots->generation = last_generation + 1; } static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) @@ -730,10 +725,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, { struct kvm_memslots *old_memslots = kvm->memslots; - update_memslots(slots, new, kvm->memslots->generation); + /* + * Set the low bit in the generation, which disables SPTE caching + * until the end of synchronize_srcu_expedited. + */ + WARN_ON(old_memslots->generation & 1); + slots->generation = old_memslots->generation + 1; + + update_memslots(slots, new); rcu_assign_pointer(kvm->memslots, slots); synchronize_srcu_expedited(&kvm->srcu); + /* + * Increment the new memslot generation a second time. This prevents + * vm exits that race with memslot updates from caching a memslot + * generation that will (potentially) be valid forever. + */ + slots->generation++; + kvm_arch_memslots_updated(kvm); return old_memslots; -- cgit v1.2.2 From 34656113182b704682e23d1363417536addfec97 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Sep 2014 21:13:31 +0200 Subject: KVM: remove redundant check of in_spin_loop The expression `vcpu->spin_loop.in_spin_loop' is always true, because it is evaluated only when the condition `!vcpu->spin_loop.in_spin_loop' is false. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bb8641b5d83b..cc7bd286d135 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1785,8 +1785,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || - (vcpu->spin_loop.in_spin_loop && - vcpu->spin_loop.dy_eligible); + vcpu->spin_loop.dy_eligible; if (vcpu->spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); -- cgit v1.2.2 From a13f533b2f1d53a7c0baa7490498caeab7bc8ba5 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Sep 2014 21:13:32 +0200 Subject: KVM: remove redundant assigment of return value in kvm_dev_ioctl The first statement of kvm_dev_ioctl is long r = -EINVAL; No need to reassign the same value. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc7bd286d135..de1ae82ba192 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2627,7 +2627,6 @@ static long kvm_dev_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_API_VERSION: - r = -EINVAL; if (arg) goto out; r = KVM_API_VERSION; @@ -2639,7 +2638,6 @@ static long kvm_dev_ioctl(struct file *filp, r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: - r = -EINVAL; if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ -- cgit v1.2.2 From f2a25160887e00434ce1361007009120e1fecbda Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Sep 2014 21:13:33 +0200 Subject: KVM: remove redundant assignments in __kvm_set_memory_region __kvm_set_memory_region sets r to EINVAL very early. Doing it again is not necessary. The same is true later on, where r is assigned -ENOMEM twice. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index de1ae82ba192..c338599804e0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -793,7 +793,6 @@ int __kvm_set_memory_region(struct kvm *kvm, base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; - r = -EINVAL; if (npages > KVM_MEM_MAX_NR_PAGES) goto out; @@ -807,7 +806,6 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; - r = -EINVAL; if (npages) { if (!old.npages) change = KVM_MR_CREATE; @@ -863,7 +861,6 @@ int __kvm_set_memory_region(struct kvm *kvm, } if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { - r = -ENOMEM; slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) -- cgit v1.2.2 From d60eacb07053142bfb9b41582074a89a790a9d46 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:33 +0100 Subject: KVM: device: add simple registration mechanism for kvm_device_ops kvm_ioctl_create_device currently has knowledge of all the device types and their associated ops. This is fairly inflexible when adding support for new in-kernel device emulations, so move what we currently have out into a table, which can support dynamic registration of ops by new drivers for virtual hardware. Cc: Alex Williamson Cc: Alex Graf Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Marc Zyngier Acked-by: Cornelia Huck Reviewed-by: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 65 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 27 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c338599804e0..686d783387a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2272,44 +2272,55 @@ struct kvm_device *kvm_device_from_filp(struct file *filp) return filp->private_data; } -static int kvm_ioctl_create_device(struct kvm *kvm, - struct kvm_create_device *cd) -{ - struct kvm_device_ops *ops = NULL; - struct kvm_device *dev; - bool test = cd->flags & KVM_CREATE_DEVICE_TEST; - int ret; - - switch (cd->type) { +static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC - case KVM_DEV_TYPE_FSL_MPIC_20: - case KVM_DEV_TYPE_FSL_MPIC_42: - ops = &kvm_mpic_ops; - break; + [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, + [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif + #ifdef CONFIG_KVM_XICS - case KVM_DEV_TYPE_XICS: - ops = &kvm_xics_ops; - break; + [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, #endif + #ifdef CONFIG_KVM_VFIO - case KVM_DEV_TYPE_VFIO: - ops = &kvm_vfio_ops; - break; + [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif + #ifdef CONFIG_KVM_ARM_VGIC - case KVM_DEV_TYPE_ARM_VGIC_V2: - ops = &kvm_arm_vgic_v2_ops; - break; + [KVM_DEV_TYPE_ARM_VGIC_V2] = &kvm_arm_vgic_v2_ops, #endif + #ifdef CONFIG_S390 - case KVM_DEV_TYPE_FLIC: - ops = &kvm_flic_ops; - break; + [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, #endif - default: +}; + +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +{ + if (type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENOSPC; + + if (kvm_device_ops_table[type] != NULL) + return -EEXIST; + + kvm_device_ops_table[type] = ops; + return 0; +} + +static int kvm_ioctl_create_device(struct kvm *kvm, + struct kvm_create_device *cd) +{ + struct kvm_device_ops *ops = NULL; + struct kvm_device *dev; + bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int ret; + + if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENODEV; + + ops = kvm_device_ops_table[cd->type]; + if (ops == NULL) return -ENODEV; - } if (test) return 0; -- cgit v1.2.2 From c06a841bf36340e9e917ce60d11a6425ac85d0bd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:34 +0100 Subject: KVM: ARM: vgic: register kvm_device_ops dynamically Now that we have a dynamic means to register kvm_device_ops, use that for the ARM VGIC, instead of relying on the static table. Cc: Gleb Natapov Cc: Paolo Bonzini Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 686d783387a0..68d96f5dbfe2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2286,10 +2286,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif -#ifdef CONFIG_KVM_ARM_VGIC - [KVM_DEV_TYPE_ARM_VGIC_V2] = &kvm_arm_vgic_v2_ops, -#endif - #ifdef CONFIG_S390 [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, #endif -- cgit v1.2.2 From 84877d93336de21a6251db00b841468a83c65906 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 2 Sep 2014 10:27:35 +0100 Subject: KVM: s390: register flic ops dynamically Using the new kvm_register_device_ops() interface makes us get rid of an #ifdef in common code. Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Cornelia Huck Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68d96f5dbfe2..f4e792fedb4f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2285,10 +2285,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_VFIO [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif - -#ifdef CONFIG_S390 - [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, -#endif }; int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) -- cgit v1.2.2 From 80ce1639727e9d38729c34f162378508c307ca25 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:36 +0100 Subject: KVM: VFIO: register kvm_device_ops dynamically Now that we have a dynamic means to register kvm_device_ops, use that for the VFIO kvm device, instead of relying on the static table. This is achieved by a module_init call to register the ops with KVM. Cc: Gleb Natapov Cc: Paolo Bonzini Acked-by: Alex Williamson Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f4e792fedb4f..db57363cc287 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2281,10 +2281,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_XICS [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, #endif - -#ifdef CONFIG_KVM_VFIO - [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, -#endif }; int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) -- cgit v1.2.2 From 3c3c29fd0d7cddc32862c350d0700ce69953e3bd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Sep 2014 13:02:46 +0200 Subject: kvm-vfio: do not use module_init /me got confused between the kernel and QEMU. In the kernel, you can only have one module_init function, and it will prevent unloading the module unless you also have the corresponding module_exit function. So, commit 80ce1639727e (KVM: VFIO: register kvm_device_ops dynamically, 2014-09-02) broke unloading of the kvm module, by adding a module_init function and no module_exit. Repair it by making kvm_vfio_ops_init weak, and checking it in kvm_init. Cc: Will Deacon Cc: Gleb Natapov Cc: Alex Williamson Fixes: 80ce1639727e9d38729c34f162378508c307ca25 Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index db57363cc287..499db0977f3c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -57,6 +57,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "vfio.h" #define CREATE_TRACE_POINTS #include @@ -3226,6 +3227,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_undebugfs; } + r = kvm_vfio_ops_init(); + WARN_ON(r); + return 0; out_undebugfs: -- cgit v1.2.2 From 234b239bea395316d7f78018c672f4a88b3cdf0d Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Wed, 17 Sep 2014 10:51:48 -0700 Subject: kvm: Faults which trigger IO release the mmap_sem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been swapped out or is behind a filemap, this will trigger async readahead and return immediately. The rationale is that KVM will kick back the guest with an "async page fault" and allow for some other guest process to take over. If async PFs are enabled the fault is retried asap from an async workqueue. If not, it's retried immediately in the same code path. In either case the retry will not relinquish the mmap semaphore and will block on the IO. This is a bad thing, as other mmap semaphore users now stall as a function of swap or filemap latency. This patch ensures both the regular and async PF path re-enter the fault allowing for the mmap semaphore to be relinquished in the case of IO wait. Reviewed-by: Radim Krčmář Signed-off-by: Andres Lagar-Cavilla Acked-by: Andrew Morton Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 499db0977f3c..1c6e8476b244 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1122,6 +1122,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); } +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, bool write_fault, + struct page **pagep) +{ + int npages; + int locked = 1; + int flags = FOLL_TOUCH | FOLL_HWPOISON | + (pagep ? FOLL_GET : 0) | + (write_fault ? FOLL_WRITE : 0); + + /* + * If retrying the fault, we get here *not* having allowed the filemap + * to wait on the page lock. We should now allow waiting on the IO with + * the mmap semaphore released. + */ + down_read(&mm->mmap_sem); + npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, + &locked); + if (!locked) { + VM_BUG_ON(npages != -EBUSY); + + if (!pagep) + return 0; + + /* + * The previous call has now waited on the IO. Now we can + * retry and complete. Pass TRIED to ensure we do not re + * schedule async IO (see e.g. filemap_fault). + */ + down_read(&mm->mmap_sem); + npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, + pagep, NULL, NULL); + } + up_read(&mm->mmap_sem); + return npages; +} + static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; @@ -1184,9 +1221,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(current, current->mm, addr, write_fault, page); up_read(¤t->mm->mmap_sem); - } else - npages = get_user_pages_fast(addr, 1, write_fault, - page); + } else { + /* + * By now we have tried gup_fast, and possibly async_pf, and we + * are certainly not atomic. Time to retry the gup, allowing + * mmap semaphore to be relinquished in the case of IO. + */ + npages = kvm_get_user_page_io(current, current->mm, addr, + write_fault, page); + } if (npages != 1) return npages; -- cgit v1.2.2 From 2ea75be3219571d0ec009ce20d9971e54af96e09 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 19 Sep 2014 16:03:25 -0700 Subject: kvm: don't take vcpu mutex for obviously invalid vcpu ioctls vcpu ioctls can hang the calling thread if issued while a vcpu is running. However, invalid ioctls can happen when userspace tries to probe the kind of file descriptors (e.g. isatty() calls ioctl(TCGETS)); in that case, we know the ioctl is going to be rejected as invalid anyway and we can fail before trying to take the vcpu mutex. This patch does not change functionality, it just makes invalid ioctls fail faster. Cc: stable@vger.kernel.org Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1c6e8476b244..ff42b11d2b9c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -52,6 +52,7 @@ #include #include +#include #include #include @@ -2032,6 +2033,9 @@ static long kvm_vcpu_ioctl(struct file *filp, if (vcpu->kvm->mm != current->mm) return -EIO; + if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) + return -EINVAL; + #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) /* * Special cases: vcpu ioctls that are asynchronous to vcpu execution, -- cgit v1.2.2 From 57128468080a8b6ea452223036d3e417f748af55 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Mon, 22 Sep 2014 14:54:42 -0700 Subject: kvm: Fix page ageing bugs 1. We were calling clear_flush_young_notify in unmap_one, but we are within an mmu notifier invalidate range scope. The spte exists no more (due to range_start) and the accessed bit info has already been propagated (due to kvm_pfn_set_accessed). Simply call clear_flush_young. 2. We clear_flush_young on a primary MMU PMD, but this may be mapped as a collection of PTEs by the secondary MMU (e.g. during log-dirty). This required expanding the interface of the clear_flush_young mmu notifier, so a lot of code has been trivially touched. 3. In the absence of shadow_accessed_mask (e.g. EPT A bit), we emulate the access bit by blowing the spte. This requires proper synchronizing with MMU notifier consumers, like every other removal of spte's does. Signed-off-by: Andres Lagar-Cavilla Acked-by: Rik van Riel Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ff42b11d2b9c..0316314d48f4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -369,7 +369,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -377,7 +378,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - young = kvm_age_hva(kvm, address); + young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); -- cgit v1.2.2 From 445b8236959bfe624a5aa9bce89f44a3bec9b2b1 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 24 Sep 2014 15:57:55 +0800 Subject: kvm: Rename make_all_cpus_request() to kvm_make_all_cpus_request() and make it non-static Different architectures need different requests, and in fact we will use this function in architecture-specific code later. This will be outside kvm_main.c, so make it non-static and rename it to kvm_make_all_cpus_request(). Reviewed-by: Paolo Bonzini Signed-off-by: Tang Chen Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0316314d48f4..5b8ca365932a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -153,7 +153,7 @@ static void ack_flush(void *_completed) { } -static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { int i, cpu, me; cpumask_var_t cpus; @@ -190,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) long dirty_count = kvm->tlbs_dirty; smp_mb(); - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } @@ -198,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_reload_remote_mmus(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } void kvm_make_mclock_inprogress_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } void kvm_make_scan_ioapic_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) -- cgit v1.2.2 From fe71557afbec641fee73711e40602bed37f6f33b Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 24 Sep 2014 15:57:57 +0800 Subject: kvm: Add arch specific mmu notifier for page invalidation This will be used to let the guest run while the APIC access page is not pinned. Because subsequent patches will fill in the function for x86, place the (still empty) x86 implementation in the x86.c file instead of adding an inline function in kvm_host.h. Signed-off-by: Tang Chen Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b8ca365932a..3f16f569169e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -296,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + kvm_arch_mmu_notifier_invalidate_page(kvm, address); + srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.2 From bb0ca6acd466af55c95b7ce508f29e23a24cabd9 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Thu, 25 Sep 2014 15:26:50 -0700 Subject: kvm: Fix kvm_get_page_retry_io __gup retval check Confusion around -EBUSY and zero (inside a BUG_ON no less). Reported-by: Andrea Arcangeli Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3f16f569169e..a1cf53ee0d28 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1146,7 +1146,7 @@ int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, &locked); if (!locked) { - VM_BUG_ON(npages != -EBUSY); + VM_BUG_ON(npages); if (!pagep) return 0; -- cgit v1.2.2