From 7103f60de8bed21a0ad5d15d2ad5b7a333dda201 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 19 Aug 2014 16:45:56 +0200 Subject: KVM: avoid unnecessary synchronize_rcu We dont have to wait for a grace period if there is no oldpid that we are going to free. putpid also checks for NULL, so this patch only fences synchronize_rcu. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 33712fb26eb1..39b16035386f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu) struct pid *oldpid = vcpu->pid; struct pid *newpid = get_task_pid(current, PIDTYPE_PID); rcu_assign_pointer(vcpu->pid, newpid); - synchronize_rcu(); + if (oldpid) + synchronize_rcu(); put_pid(oldpid); } cpu = get_cpu(); -- cgit v1.2.2 From e790d9ef6405633b007339d746b709aed43a928d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 21 Aug 2014 18:08:05 +0200 Subject: KVM: add kvm_arch_sched_in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce preempt notifiers for architecture specific code. Advantage over creating a new notifier in every arch is slightly simpler code and guaranteed call order with respect to kvm_sched_in. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 39b16035386f..5a0817ee996e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3124,6 +3124,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) if (vcpu->preempted) vcpu->preempted = false; + kvm_arch_sched_in(vcpu, cpu); + kvm_arch_vcpu_load(vcpu, cpu); } -- cgit v1.2.2 From 64d831269ccbca1fc6d739a0f3c8aa24afb43a5e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 19 Aug 2014 12:15:00 +0200 Subject: KVM: Introduce gfn_to_hva_memslot_prot To support read-only memory regions on arm and arm64, we have a need to resolve a gfn to an hva given a pointer to a memslot to avoid looping through the memslots twice and to reuse the hva error checking of gfn_to_hva_prot(), add a new gfn_to_hva_memslot_prot() function and refactor gfn_to_hva_prot() to use this function. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/kvm_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a0817ee996e..76c92a7249c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1076,9 +1076,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva); * If writable is set to false, the hva returned by this function is only * allowed to be read. */ -unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, + gfn_t gfn, bool *writable) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) @@ -1087,6 +1087,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) return hva; } +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + + return gfn_to_hva_memslot_prot(slot, gfn, writable); +} + static int kvm_read_hva(void *data, void __user *hva, int len) { return __copy_from_user(data, hva, len); -- cgit v1.2.2 From 1fa451bcc67fa921a04c5fac8dbcde7844d54512 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 Aug 2014 15:13:24 +0100 Subject: KVM: vgic: return int instead of bool when checking I/O ranges vgic_ioaddr_overlap claims to return a bool, but in reality it returns an int. Shut sparse up by fixing the type signature. Cc: Christoffer Dall Cc: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 73eba793b17f..d1cfe672b9d7 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1690,7 +1690,7 @@ out: return ret; } -static bool vgic_ioaddr_overlap(struct kvm *kvm) +static int vgic_ioaddr_overlap(struct kvm *kvm) { phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; -- cgit v1.2.2 From de56fb1923ca11f428bf557870e0faa99f38762e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 Aug 2014 15:13:25 +0100 Subject: KVM: vgic: declare probe function pointer as const We extract the vgic probe function from the of_device_id data pointer, which is const. Kill the sparse warning by ensuring that the local function pointer is also marked as const. Cc: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index d1cfe672b9d7..efe6eee2e7eb 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1557,8 +1557,8 @@ static const struct of_device_id vgic_ids[] = { int kvm_vgic_hyp_init(void) { const struct of_device_id *matched_id; - int (*vgic_probe)(struct device_node *,const struct vgic_ops **, - const struct vgic_params **); + const int (*vgic_probe)(struct device_node *,const struct vgic_ops **, + const struct vgic_params **); struct device_node *vgic_node; int ret; -- cgit v1.2.2 From 0f8a4de3e088797576ac76200b634b802e5c7781 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 26 Aug 2014 14:00:37 +0200 Subject: KVM: Unconditionally export KVM_CAP_READONLY_MEM The idea between capabilities and the KVM_CHECK_EXTENSION ioctl is that userspace can, at run-time, determine if a feature is supported or not. This allows KVM to being supporting a new feature with a new kernel version without any need to update user space. Unfortunately, since the definition of KVM_CAP_READONLY_MEM was guarded by #ifdef __KVM_HAVE_READONLY_MEM, such discovery still required a user space update. Therefore, unconditionally export KVM_CAP_READONLY_MEM and change the in-kernel conditional to rely on __KVM_HAVE_READONLY_MEM. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a0817ee996e..1d03967def40 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -708,7 +708,7 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; -#ifdef KVM_CAP_READONLY_MEM +#ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif -- cgit v1.2.2 From 13a34e067eab24fec882e1834fbf2cc31911d474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 28 Aug 2014 15:13:03 +0200 Subject: KVM: remove garbage arg to *hardware_{en,dis}able MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the beggining was on_each_cpu(), which required an unused argument to kvm_arch_ops.hardware_{en,dis}able, but this was soon forgotten. Remove unnecessary arguments that stem from this. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1d03967def40..7176929a4cda 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2669,7 +2669,7 @@ static void hardware_enable_nolock(void *junk) cpumask_set_cpu(cpu, cpus_hardware_enabled); - r = kvm_arch_hardware_enable(NULL); + r = kvm_arch_hardware_enable(); if (r) { cpumask_clear_cpu(cpu, cpus_hardware_enabled); @@ -2694,7 +2694,7 @@ static void hardware_disable_nolock(void *junk) if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; cpumask_clear_cpu(cpu, cpus_hardware_enabled); - kvm_arch_hardware_disable(NULL); + kvm_arch_hardware_disable(); } static void hardware_disable(void) -- cgit v1.2.2 From 00f034a12fdd81210d58116326d92780aac5c238 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Aug 2014 14:29:21 +0200 Subject: KVM: do not bias the generation number in kvm_current_mmio_generation The next patch will give a meaning (a la seqcount) to the low bit of the generation number. Ensure that it matches between kvm->memslots->generation and kvm_current_mmio_generation(). Cc: stable@vger.kernel.org Reviewed-by: David Matlack Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7176929a4cda..0bfdb673db26 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -477,6 +477,13 @@ static struct kvm *kvm_create_vm(unsigned long type) kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) goto out_err_no_srcu; + + /* + * Init kvm generation close to the maximum to easily test the + * code of handling generation number wrap-around. + */ + kvm->memslots->generation = -150; + kvm_init_memslots_id(kvm); if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; -- cgit v1.2.2 From ee3d1570b58677885b4552bce8217fda7b226a68 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 18 Aug 2014 15:46:06 -0700 Subject: kvm: fix potentially corrupt mmio cache vcpu exits and memslot mutations can run concurrently as long as the vcpu does not aquire the slots mutex. Thus it is theoretically possible for memslots to change underneath a vcpu that is handling an exit. If we increment the memslot generation number again after synchronize_srcu_expedited(), vcpus can safely cache memslot generation without maintaining a single rcu_dereference through an entire vm exit. And much of the x86/kvm code does not maintain a single rcu_dereference of the current memslots during each exit. We can prevent the following case: vcpu (CPU 0) | thread (CPU 1) --------------------------------------------+-------------------------- 1 vm exit | 2 srcu_read_unlock(&kvm->srcu) | 3 decide to cache something based on | old memslots | 4 | change memslots | (increments generation) 5 | synchronize_srcu(&kvm->srcu); 6 retrieve generation # from new memslots | 7 tag cache with new memslot generation | 8 srcu_read_unlock(&kvm->srcu) | ... | | ... | | | By incrementing the generation after synchronizing with kvm->srcu readers, we ensure that the generation retrieved in (6) will become invalid soon after (8). Keeping the existing increment is not strictly necessary, but we do keep it and just move it for consistency from update_memslots to install_new_memslots. It invalidates old cached MMIOs immediately, instead of having to wait for the end of synchronize_srcu_expedited, which makes the code more clearly correct in case CPU 1 is preempted right after synchronize_srcu() returns. To avoid halving the generation space in SPTEs, always presume that the low bit of the generation is zero when reconstructing a generation number out of an SPTE. This effectively disables MMIO caching in SPTEs during the call to synchronize_srcu_expedited. Using the low bit this way is somewhat like a seqcount---where the protected thing is a cache, and instead of retrying we can simply punt if we observe the low bit to be 1. Cc: stable@vger.kernel.org Signed-off-by: David Matlack Reviewed-by: Xiao Guangrong Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0bfdb673db26..bb8641b5d83b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -95,8 +95,6 @@ static int hardware_enable_all(void); static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, u64 last_generation); static void kvm_release_pfn_dirty(pfn_t pfn); static void mark_page_dirty_in_slot(struct kvm *kvm, @@ -695,8 +693,7 @@ static void sort_memslots(struct kvm_memslots *slots) } static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, - u64 last_generation) + struct kvm_memory_slot *new) { if (new) { int id = new->id; @@ -707,8 +704,6 @@ static void update_memslots(struct kvm_memslots *slots, if (new->npages != npages) sort_memslots(slots); } - - slots->generation = last_generation + 1; } static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) @@ -730,10 +725,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, { struct kvm_memslots *old_memslots = kvm->memslots; - update_memslots(slots, new, kvm->memslots->generation); + /* + * Set the low bit in the generation, which disables SPTE caching + * until the end of synchronize_srcu_expedited. + */ + WARN_ON(old_memslots->generation & 1); + slots->generation = old_memslots->generation + 1; + + update_memslots(slots, new); rcu_assign_pointer(kvm->memslots, slots); synchronize_srcu_expedited(&kvm->srcu); + /* + * Increment the new memslot generation a second time. This prevents + * vm exits that race with memslot updates from caching a memslot + * generation that will (potentially) be valid forever. + */ + slots->generation++; + kvm_arch_memslots_updated(kvm); return old_memslots; -- cgit v1.2.2 From 34656113182b704682e23d1363417536addfec97 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Sep 2014 21:13:31 +0200 Subject: KVM: remove redundant check of in_spin_loop The expression `vcpu->spin_loop.in_spin_loop' is always true, because it is evaluated only when the condition `!vcpu->spin_loop.in_spin_loop' is false. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bb8641b5d83b..cc7bd286d135 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1785,8 +1785,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || - (vcpu->spin_loop.in_spin_loop && - vcpu->spin_loop.dy_eligible); + vcpu->spin_loop.dy_eligible; if (vcpu->spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); -- cgit v1.2.2 From a13f533b2f1d53a7c0baa7490498caeab7bc8ba5 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Sep 2014 21:13:32 +0200 Subject: KVM: remove redundant assigment of return value in kvm_dev_ioctl The first statement of kvm_dev_ioctl is long r = -EINVAL; No need to reassign the same value. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc7bd286d135..de1ae82ba192 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2627,7 +2627,6 @@ static long kvm_dev_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_API_VERSION: - r = -EINVAL; if (arg) goto out; r = KVM_API_VERSION; @@ -2639,7 +2638,6 @@ static long kvm_dev_ioctl(struct file *filp, r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: - r = -EINVAL; if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ -- cgit v1.2.2 From f2a25160887e00434ce1361007009120e1fecbda Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Sep 2014 21:13:33 +0200 Subject: KVM: remove redundant assignments in __kvm_set_memory_region __kvm_set_memory_region sets r to EINVAL very early. Doing it again is not necessary. The same is true later on, where r is assigned -ENOMEM twice. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index de1ae82ba192..c338599804e0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -793,7 +793,6 @@ int __kvm_set_memory_region(struct kvm *kvm, base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; - r = -EINVAL; if (npages > KVM_MEM_MAX_NR_PAGES) goto out; @@ -807,7 +806,6 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; - r = -EINVAL; if (npages) { if (!old.npages) change = KVM_MR_CREATE; @@ -863,7 +861,6 @@ int __kvm_set_memory_region(struct kvm *kvm, } if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { - r = -ENOMEM; slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) -- cgit v1.2.2 From 0ba09511ddc3ff0b462f37b4fe4b9c4dccc054ec Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 1 Sep 2014 09:36:08 +0100 Subject: KVM: EVENTFD: remove inclusion of irq.h No more needed. irq.h would be void on ARM. Acked-by: Paolo Bonzini Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/eventfd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 3c5981c87c3f..0c712a779b44 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -36,7 +36,6 @@ #include #include -#include "irq.h" #include "iodev.h" #ifdef CONFIG_HAVE_KVM_IRQFD -- cgit v1.2.2 From 184564efae4d775225c8fe3b762a56956fb1f827 Mon Sep 17 00:00:00 2001 From: Zhang Haoyu Date: Thu, 11 Sep 2014 16:47:04 +0800 Subject: kvm: ioapic: conditionally delay irq delivery duringeoi broadcast Currently, we call ioapic_service() immediately when we find the irq is still active during eoi broadcast. But for real hardware, there's some delay between the EOI writing and irq delivery. If we do not emulate this behavior, and re-inject the interrupt immediately after the guest sends an EOI and re-enables interrupts, a guest might spend all its time in the ISR if it has a broken handler for a level-triggered interrupt. Such livelock actually happens with Windows guests when resuming from hibernation. As there's no way to recognize the broken handle from new raised ones, this patch delays an interrupt if 10.000 consecutive EOIs found that the interrupt was still high. The guest can then make a little forward progress, until a proper IRQ handler is set or until some detection routine in the guest (such as Linux's note_interrupt()) recognizes the situation. Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: Zhang Haoyu Signed-off-by: Paolo Bonzini --- virt/kvm/ioapic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- virt/kvm/ioapic.h | 2 ++ 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index e8ce34c9db32..0ba4057d271b 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -405,6 +405,26 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) spin_unlock(&ioapic->lock); } +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) +{ + int i; + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, + eoi_inject.work); + spin_lock(&ioapic->lock); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) + continue; + + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) + ioapic_service(ioapic, i, false); + } + spin_unlock(&ioapic->lock); +} + +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 + static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { @@ -435,8 +455,26 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (ioapic->irr & (1 << i)) - ioapic_service(ioapic, i, false); + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { + ++ioapic->irq_eoi[i]; + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the interrupt + * immediately during eoi broadcast, and this + * lets a buggy guest make slow progress + * even if it does not correctly handle a + * level-triggered interrupt. Emulate this + * behavior if we detect an interrupt storm. + */ + schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); + ioapic->irq_eoi[i] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); + } else { + ioapic_service(ioapic, i, false); + } + } else { + ioapic->irq_eoi[i] = 0; + } } } @@ -565,12 +603,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) { int i; + cancel_delayed_work_sync(&ioapic->eoi_inject); for (i = 0; i < IOAPIC_NUM_PINS; i++) ioapic->redirtbl[i].fields.mask = 1; ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; ioapic->ioregsel = 0; ioapic->irr = 0; ioapic->id = 0; + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); update_handled_vectors(ioapic); } @@ -589,6 +629,7 @@ int kvm_ioapic_init(struct kvm *kvm) if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); @@ -609,6 +650,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + cancel_delayed_work_sync(&ioapic->eoi_inject); if (ioapic) { kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 90d43e95dcf8..e23b70634f1e 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -59,6 +59,8 @@ struct kvm_ioapic { spinlock_t lock; DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; + struct delayed_work eoi_inject; + u32 irq_eoi[IOAPIC_NUM_PINS]; }; #ifdef DEBUG -- cgit v1.2.2 From d60eacb07053142bfb9b41582074a89a790a9d46 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:33 +0100 Subject: KVM: device: add simple registration mechanism for kvm_device_ops kvm_ioctl_create_device currently has knowledge of all the device types and their associated ops. This is fairly inflexible when adding support for new in-kernel device emulations, so move what we currently have out into a table, which can support dynamic registration of ops by new drivers for virtual hardware. Cc: Alex Williamson Cc: Alex Graf Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Marc Zyngier Acked-by: Cornelia Huck Reviewed-by: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 65 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 27 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c338599804e0..686d783387a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2272,44 +2272,55 @@ struct kvm_device *kvm_device_from_filp(struct file *filp) return filp->private_data; } -static int kvm_ioctl_create_device(struct kvm *kvm, - struct kvm_create_device *cd) -{ - struct kvm_device_ops *ops = NULL; - struct kvm_device *dev; - bool test = cd->flags & KVM_CREATE_DEVICE_TEST; - int ret; - - switch (cd->type) { +static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC - case KVM_DEV_TYPE_FSL_MPIC_20: - case KVM_DEV_TYPE_FSL_MPIC_42: - ops = &kvm_mpic_ops; - break; + [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, + [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif + #ifdef CONFIG_KVM_XICS - case KVM_DEV_TYPE_XICS: - ops = &kvm_xics_ops; - break; + [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, #endif + #ifdef CONFIG_KVM_VFIO - case KVM_DEV_TYPE_VFIO: - ops = &kvm_vfio_ops; - break; + [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif + #ifdef CONFIG_KVM_ARM_VGIC - case KVM_DEV_TYPE_ARM_VGIC_V2: - ops = &kvm_arm_vgic_v2_ops; - break; + [KVM_DEV_TYPE_ARM_VGIC_V2] = &kvm_arm_vgic_v2_ops, #endif + #ifdef CONFIG_S390 - case KVM_DEV_TYPE_FLIC: - ops = &kvm_flic_ops; - break; + [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, #endif - default: +}; + +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +{ + if (type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENOSPC; + + if (kvm_device_ops_table[type] != NULL) + return -EEXIST; + + kvm_device_ops_table[type] = ops; + return 0; +} + +static int kvm_ioctl_create_device(struct kvm *kvm, + struct kvm_create_device *cd) +{ + struct kvm_device_ops *ops = NULL; + struct kvm_device *dev; + bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int ret; + + if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENODEV; + + ops = kvm_device_ops_table[cd->type]; + if (ops == NULL) return -ENODEV; - } if (test) return 0; -- cgit v1.2.2 From c06a841bf36340e9e917ce60d11a6425ac85d0bd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:34 +0100 Subject: KVM: ARM: vgic: register kvm_device_ops dynamically Now that we have a dynamic means to register kvm_device_ops, use that for the ARM VGIC, instead of relying on the static table. Cc: Gleb Natapov Cc: Paolo Bonzini Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/arm/vgic.c | 157 ++++++++++++++++++++++++++-------------------------- virt/kvm/kvm_main.c | 4 -- 2 files changed, 79 insertions(+), 82 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 73eba793b17f..3ee3ce06bbec 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1522,83 +1522,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) return 0; } -static void vgic_init_maintenance_interrupt(void *info) -{ - enable_percpu_irq(vgic->maint_irq, 0); -} - -static int vgic_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) -{ - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: - vgic_init_maintenance_interrupt(NULL); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - disable_percpu_irq(vgic->maint_irq); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block vgic_cpu_nb = { - .notifier_call = vgic_cpu_notify, -}; - -static const struct of_device_id vgic_ids[] = { - { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, }, - { .compatible = "arm,gic-v3", .data = vgic_v3_probe, }, - {}, -}; - -int kvm_vgic_hyp_init(void) -{ - const struct of_device_id *matched_id; - int (*vgic_probe)(struct device_node *,const struct vgic_ops **, - const struct vgic_params **); - struct device_node *vgic_node; - int ret; - - vgic_node = of_find_matching_node_and_match(NULL, - vgic_ids, &matched_id); - if (!vgic_node) { - kvm_err("error: no compatible GIC node found\n"); - return -ENODEV; - } - - vgic_probe = matched_id->data; - ret = vgic_probe(vgic_node, &vgic_ops, &vgic); - if (ret) - return ret; - - ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", vgic->maint_irq); - return ret; - } - - ret = __register_cpu_notifier(&vgic_cpu_nb); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - - /* Callback into for arch code for setup */ - vgic_arch_setup(vgic); - - on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); - - return 0; - -out_free_irq: - free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus()); - return ret; -} - /** * kvm_vgic_init - Initialize global VGIC state before running any VCPUs * @kvm: pointer to the kvm struct @@ -2062,7 +1985,7 @@ static int vgic_create(struct kvm_device *dev, u32 type) return kvm_vgic_create(dev->kvm); } -struct kvm_device_ops kvm_arm_vgic_v2_ops = { +static struct kvm_device_ops kvm_arm_vgic_v2_ops = { .name = "kvm-arm-vgic", .create = vgic_create, .destroy = vgic_destroy, @@ -2070,3 +1993,81 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = { .get_attr = vgic_get_attr, .has_attr = vgic_has_attr, }; + +static void vgic_init_maintenance_interrupt(void *info) +{ + enable_percpu_irq(vgic->maint_irq, 0); +} + +static int vgic_cpu_notify(struct notifier_block *self, + unsigned long action, void *cpu) +{ + switch (action) { + case CPU_STARTING: + case CPU_STARTING_FROZEN: + vgic_init_maintenance_interrupt(NULL); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + disable_percpu_irq(vgic->maint_irq); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block vgic_cpu_nb = { + .notifier_call = vgic_cpu_notify, +}; + +static const struct of_device_id vgic_ids[] = { + { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, }, + { .compatible = "arm,gic-v3", .data = vgic_v3_probe, }, + {}, +}; + +int kvm_vgic_hyp_init(void) +{ + const struct of_device_id *matched_id; + int (*vgic_probe)(struct device_node *,const struct vgic_ops **, + const struct vgic_params **); + struct device_node *vgic_node; + int ret; + + vgic_node = of_find_matching_node_and_match(NULL, + vgic_ids, &matched_id); + if (!vgic_node) { + kvm_err("error: no compatible GIC node found\n"); + return -ENODEV; + } + + vgic_probe = matched_id->data; + ret = vgic_probe(vgic_node, &vgic_ops, &vgic); + if (ret) + return ret; + + ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt %d\n", vgic->maint_irq); + return ret; + } + + ret = __register_cpu_notifier(&vgic_cpu_nb); + if (ret) { + kvm_err("Cannot register vgic CPU notifier\n"); + goto out_free_irq; + } + + /* Callback into for arch code for setup */ + vgic_arch_setup(vgic); + + on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); + + return kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); + +out_free_irq: + free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus()); + return ret; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 686d783387a0..68d96f5dbfe2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2286,10 +2286,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif -#ifdef CONFIG_KVM_ARM_VGIC - [KVM_DEV_TYPE_ARM_VGIC_V2] = &kvm_arm_vgic_v2_ops, -#endif - #ifdef CONFIG_S390 [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, #endif -- cgit v1.2.2 From 84877d93336de21a6251db00b841468a83c65906 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 2 Sep 2014 10:27:35 +0100 Subject: KVM: s390: register flic ops dynamically Using the new kvm_register_device_ops() interface makes us get rid of an #ifdef in common code. Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Cornelia Huck Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68d96f5dbfe2..f4e792fedb4f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2285,10 +2285,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_VFIO [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif - -#ifdef CONFIG_S390 - [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, -#endif }; int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) -- cgit v1.2.2 From 80ce1639727e9d38729c34f162378508c307ca25 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:36 +0100 Subject: KVM: VFIO: register kvm_device_ops dynamically Now that we have a dynamic means to register kvm_device_ops, use that for the VFIO kvm device, instead of relying on the static table. This is achieved by a module_init call to register the ops with KVM. Cc: Gleb Natapov Cc: Paolo Bonzini Acked-by: Alex Williamson Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ---- virt/kvm/vfio.c | 22 +++++++++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f4e792fedb4f..db57363cc287 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2281,10 +2281,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_XICS [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, #endif - -#ifdef CONFIG_KVM_VFIO - [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, -#endif }; int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ba1a93f935c7..bb11b36ee8a2 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -246,6 +246,16 @@ static void kvm_vfio_destroy(struct kvm_device *dev) kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ } +static int kvm_vfio_create(struct kvm_device *dev, u32 type); + +static struct kvm_device_ops kvm_vfio_ops = { + .name = "kvm-vfio", + .create = kvm_vfio_create, + .destroy = kvm_vfio_destroy, + .set_attr = kvm_vfio_set_attr, + .has_attr = kvm_vfio_has_attr, +}; + static int kvm_vfio_create(struct kvm_device *dev, u32 type) { struct kvm_device *tmp; @@ -268,10 +278,8 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) return 0; } -struct kvm_device_ops kvm_vfio_ops = { - .name = "kvm-vfio", - .create = kvm_vfio_create, - .destroy = kvm_vfio_destroy, - .set_attr = kvm_vfio_set_attr, - .has_attr = kvm_vfio_has_attr, -}; +static int __init kvm_vfio_ops_init(void) +{ + return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); +} +module_init(kvm_vfio_ops_init); -- cgit v1.2.2 From 227844f53864077ccaefe01d0960fcccc03445ce Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 9 Jun 2014 12:27:18 +0200 Subject: arm/arm64: KVM: Rename irq_state to irq_pending The irq_state field on the distributor struct is ambiguous in its meaning; the comment says it's the level of the input put, but that doesn't make much sense for edge-triggered interrupts. The code actually uses this state variable to check if the interrupt is in the pending state on the distributor so clarify the comment and rename the actual variable and accessor methods. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index eeb23b37f87c..7e86a36f3fc5 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -37,7 +37,7 @@ * * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if * something is pending - * - VGIC pending interrupts are stored on the vgic.irq_state vgic + * - VGIC pending interrupts are stored on the vgic.irq_pending vgic * bitmap (this bitmap is updated by both user land ioctls and guest * mmio ops, and other in-kernel peripherals such as the * arch. timers) and indicate the 'wire' state. @@ -45,8 +45,8 @@ * recalculated * - To calculate the oracle, we need info for each cpu from * compute_pending_for_cpu, which considers: - * - PPI: dist->irq_state & dist->irq_enable - * - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target + * - PPI: dist->irq_pending & dist->irq_enable + * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target * - irq_spi_target is a 'formatted' version of the GICD_ICFGR * registers, stored on each vcpu. We only keep one bit of * information per interrupt, making sure that only one vcpu can @@ -221,21 +221,21 @@ static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); + return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq); } -static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq) +static void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); + vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1); } -static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) +static void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); + vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0); } static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) @@ -409,7 +409,7 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_pending, vcpu->vcpu_id, offset); vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); @@ -425,7 +425,7 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_pending, vcpu->vcpu_id, offset); vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); @@ -651,7 +651,7 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) * is fine, then we are only setting a few bits that were * already set. */ - vgic_dist_irq_set(vcpu, lr.irq); + vgic_dist_irq_set_pending(vcpu, lr.irq); if (lr.irq < VGIC_NR_SGIS) dist->irq_sgi_sources[vcpu_id][lr.irq] |= 1 << lr.source; lr.state &= ~LR_STATE_PENDING; @@ -932,7 +932,7 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) kvm_for_each_vcpu(c, vcpu, kvm) { if (target_cpus & 1) { /* Flag the SGI as pending */ - vgic_dist_irq_set(vcpu, sgi); + vgic_dist_irq_set_pending(vcpu, sgi); dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); } @@ -952,11 +952,11 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; pend_shared = vcpu->arch.vgic_cpu.pending_shared; - pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); + pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id); enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); - pending = vgic_bitmap_get_shared_map(&dist->irq_state); + pending = vgic_bitmap_get_shared_map(&dist->irq_pending); enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); bitmap_and(pend_shared, pend_shared, @@ -1160,7 +1160,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) * our emulated gic and can get rid of them. */ if (!sources) { - vgic_dist_irq_clear(vcpu, irq); + vgic_dist_irq_clear_pending(vcpu, irq); vgic_cpu_irq_clear(vcpu, irq); return true; } @@ -1175,7 +1175,7 @@ static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) if (vgic_queue_irq(vcpu, 0, irq)) { if (vgic_irq_is_edge(vcpu, irq)) { - vgic_dist_irq_clear(vcpu, irq); + vgic_dist_irq_clear_pending(vcpu, irq); vgic_cpu_irq_clear(vcpu, irq); } else { vgic_irq_set_active(vcpu, irq); @@ -1376,7 +1376,7 @@ static void vgic_kick_vcpus(struct kvm *kvm) static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) { - int is_edge = vgic_irq_is_edge(vcpu, irq); + int edge_triggered = vgic_irq_is_edge(vcpu, irq); int state = vgic_dist_irq_is_pending(vcpu, irq); /* @@ -1384,26 +1384,26 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) * - edge triggered and we have a rising edge * - level triggered and we change level */ - if (is_edge) + if (edge_triggered) return level > state; else return level != state; } -static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, +static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid, unsigned int irq_num, bool level) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int is_edge, is_level; + int edge_triggered, level_triggered; int enabled; bool ret = true; spin_lock(&dist->lock); vcpu = kvm_get_vcpu(kvm, cpuid); - is_edge = vgic_irq_is_edge(vcpu, irq_num); - is_level = !is_edge; + edge_triggered = vgic_irq_is_edge(vcpu, irq_num); + level_triggered = !edge_triggered; if (!vgic_validate_injection(vcpu, irq_num, level)) { ret = false; @@ -1418,9 +1418,9 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); if (level) - vgic_dist_irq_set(vcpu, irq_num); + vgic_dist_irq_set_pending(vcpu, irq_num); else - vgic_dist_irq_clear(vcpu, irq_num); + vgic_dist_irq_clear_pending(vcpu, irq_num); enabled = vgic_irq_is_enabled(vcpu, irq_num); @@ -1429,7 +1429,7 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, goto out; } - if (is_level && vgic_irq_is_active(vcpu, irq_num)) { + if (level_triggered && vgic_irq_is_active(vcpu, irq_num)) { /* * Level interrupt in progress, will be picked up * when EOId. @@ -1466,7 +1466,7 @@ out: int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, bool level) { - if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) + if (vgic_update_irq_pending(kvm, cpuid, irq_num, level)) vgic_kick_vcpus(kvm); return 0; -- cgit v1.2.2 From dbf20f9d8105cca531614c8bff9a74351e8e67e7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 9 Jun 2014 12:55:13 +0200 Subject: arm/arm64: KVM: Rename irq_active to irq_queued We have a special bitmap on the distributor struct to keep track of when level-triggered interrupts are queued on the list registers. This was named irq_active, which is confusing, because the active state of an interrupt as per the GIC spec is a different thing, not specifically related to edge-triggered/level-triggered configurations but rather indicates an interrupt which has been ack'ed but not yet eoi'ed. Rename the bitmap and the corresponding accessor functions to irq_queued to clarify what this is actually used for. Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 7e86a36f3fc5..ce1a2d17ee81 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -60,12 +60,12 @@ * the 'line' again. This is achieved as such: * * - When a level interrupt is moved onto a vcpu, the corresponding - * bit in irq_active is set. As long as this bit is set, the line + * bit in irq_queued is set. As long as this bit is set, the line * will be ignored for further interrupts. The interrupt is injected * into the vcpu with the GICH_LR_EOI bit set (generate a * maintenance interrupt on EOI). * - When the interrupt is EOIed, the maintenance interrupt fires, - * and clears the corresponding bit in irq_active. This allow the + * and clears the corresponding bit in irq_queued. This allows the * interrupt line to be sampled again. */ @@ -196,25 +196,25 @@ static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); } -static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) +static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); + return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq); } -static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) +static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); + vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1); } -static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) +static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); + vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0); } static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) @@ -256,6 +256,11 @@ static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) vcpu->arch.vgic_cpu.pending_shared); } +static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq) +{ + return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq); +} + static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) { return le32_to_cpu(*((u32 *)mmio->data)) & mask; @@ -1079,8 +1084,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { vgic_retire_lr(lr, vlr.irq, vcpu); - if (vgic_irq_is_active(vcpu, vlr.irq)) - vgic_irq_clear_active(vcpu, vlr.irq); + if (vgic_irq_is_queued(vcpu, vlr.irq)) + vgic_irq_clear_queued(vcpu, vlr.irq); } } } @@ -1170,7 +1175,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) { - if (vgic_irq_is_active(vcpu, irq)) + if (!vgic_can_sample_irq(vcpu, irq)) return true; /* level interrupt, already queued */ if (vgic_queue_irq(vcpu, 0, irq)) { @@ -1178,7 +1183,7 @@ static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) vgic_dist_irq_clear_pending(vcpu, irq); vgic_cpu_irq_clear(vcpu, irq); } else { - vgic_irq_set_active(vcpu, irq); + vgic_irq_set_queued(vcpu, irq); } return true; @@ -1262,7 +1267,7 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - vgic_irq_clear_active(vcpu, vlr.irq); + vgic_irq_clear_queued(vcpu, vlr.irq); WARN_ON(vlr.state & LR_STATE_MASK); vlr.state = 0; vgic_set_lr(vcpu, lr, vlr); @@ -1429,7 +1434,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid, goto out; } - if (level_triggered && vgic_irq_is_active(vcpu, irq_num)) { + if (!vgic_can_sample_irq(vcpu, irq_num)) { /* * Level interrupt in progress, will be picked up * when EOId. -- cgit v1.2.2 From cced50c9280ef7ca1af48080707a170efa1adfa0 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 14 Jun 2014 22:37:33 +0200 Subject: arm/arm64: KVM: vgic: Clear queued flags on unqueue If we unqueue a level-triggered interrupt completely, and the LR does not stick around in the active state (and will therefore no longer generate a maintenance interrupt), then we should clear the queued flag so that the vgic can actually queue this level-triggered interrupt at a later time and deal with its pending state then. Note: This should actually be properly fixed to handle the active state on the distributor. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index ce1a2d17ee81..2026b6147805 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -667,8 +667,10 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) * active), then the LR does not hold any useful info and can * be marked as free for other use. */ - if (!(lr.state & LR_STATE_MASK)) + if (!(lr.state & LR_STATE_MASK)) { vgic_retire_lr(i, lr.irq, vcpu); + vgic_irq_clear_queued(vcpu, lr.irq); + } /* Finally update the VGIC state. */ vgic_update_state(vcpu->kvm); -- cgit v1.2.2 From faa1b46c3e9f4d40359aee04ff275eea5f4cae3a Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 14 Jun 2014 21:54:51 +0200 Subject: arm/arm64: KVM: vgic: Improve handling of GICD_I{CS}PENDRn Writes to GICD_ISPENDRn and GICD_ICPENDRn are currently not handled correctly for level-triggered interrupts. The spec states that for level-triggered interrupts, writes to the GICD_ISPENDRn activate the output of a flip-flop which is in turn or'ed with the actual input interrupt signal. Correspondingly, writes to GICD_ICPENDRn simply deactivates the output of that flip-flop, but does not (of course) affect the external input signal. Reads from GICC_IAR will also deactivate the flip-flop output. This requires us to track the state of the level-input separately from the state in the flip-flop. We therefore introduce two new variables on the distributor struct to track these two states. Astute readers may notice that this is introducing more state than required (because an OR of the two states gives you the pending state), but the remaining vgic code uses the pending bitmap for optimized operations to figure out, at the end of the day, if an interrupt is pending or not on the distributor side. Refactoring the code to consider the two state variables all the places where we currently access the precomputed pending value, did not look pretty. Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 108 insertions(+), 11 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 2026b6147805..435d8e7ad137 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -67,6 +67,11 @@ * - When the interrupt is EOIed, the maintenance interrupt fires, * and clears the corresponding bit in irq_queued. This allows the * interrupt line to be sampled again. + * - Note that level-triggered interrupts can also be set to pending from + * writes to GICD_ISPENDRn and lowering the external input line does not + * cause the interrupt to become inactive in such a situation. + * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become + * inactive as long as the external input line is held high. */ #define VGIC_ADDR_UNDEF (-1) @@ -217,6 +222,41 @@ static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq) vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0); } +static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq); +} + +static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1); +} + +static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0); +} + +static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq); +} + +static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); +} + static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; @@ -414,11 +454,26 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_pending, - vcpu->vcpu_id, offset); + u32 *reg; + u32 level_mask; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu->vcpu_id, offset); + level_mask = (~(*reg)); + + /* Mark both level and edge triggered irqs as pending */ + reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); + if (mmio->is_write) { + /* Set the soft-pending flag only for level-triggered irqs */ + reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); + *reg &= level_mask; + vgic_update_state(vcpu->kvm); return true; } @@ -430,11 +485,27 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_pending, - vcpu->vcpu_id, offset); + u32 *level_active; + u32 *reg; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); if (mmio->is_write) { + /* Re-set level triggered level-active interrupts */ + level_active = vgic_bitmap_get_reg(&dist->irq_level, + vcpu->vcpu_id, offset); + reg = vgic_bitmap_get_reg(&dist->irq_pending, + vcpu->vcpu_id, offset); + *reg |= *level_active; + + /* Clear soft-pending flags */ + reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); + vgic_update_state(vcpu->kvm); return true; } @@ -1268,17 +1339,32 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); vgic_irq_clear_queued(vcpu, vlr.irq); WARN_ON(vlr.state & LR_STATE_MASK); vlr.state = 0; vgic_set_lr(vcpu, lr, vlr); + /* + * If the IRQ was EOIed it was also ACKed and we we + * therefore assume we can clear the soft pending + * state (should it had been set) for this interrupt. + * + * Note: if the IRQ soft pending state was set after + * the IRQ was acked, it actually shouldn't be + * cleared, but we have no way of knowing that unless + * we start trapping ACKs when the soft-pending state + * is set. + */ + vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); + /* Any additional pending interrupt? */ - if (vgic_dist_irq_is_pending(vcpu, vlr.irq)) { + if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { vgic_cpu_irq_set(vcpu, vlr.irq); level_pending = true; } else { + vgic_dist_irq_clear_pending(vcpu, vlr.irq); vgic_cpu_irq_clear(vcpu, vlr.irq); } @@ -1384,17 +1470,19 @@ static void vgic_kick_vcpus(struct kvm *kvm) static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) { int edge_triggered = vgic_irq_is_edge(vcpu, irq); - int state = vgic_dist_irq_is_pending(vcpu, irq); /* * Only inject an interrupt if: * - edge triggered and we have a rising edge * - level triggered and we change level */ - if (edge_triggered) + if (edge_triggered) { + int state = vgic_dist_irq_is_pending(vcpu, irq); return level > state; - else + } else { + int state = vgic_dist_irq_get_level(vcpu, irq); return level != state; + } } static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid, @@ -1424,10 +1512,19 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid, kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); - if (level) + if (level) { + if (level_triggered) + vgic_dist_irq_set_level(vcpu, irq_num); vgic_dist_irq_set_pending(vcpu, irq_num); - else - vgic_dist_irq_clear_pending(vcpu, irq_num); + } else { + if (level_triggered) { + vgic_dist_irq_clear_level(vcpu, irq_num); + if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) + vgic_dist_irq_clear_pending(vcpu, irq_num); + } else { + vgic_dist_irq_clear_pending(vcpu, irq_num); + } + } enabled = vgic_irq_is_enabled(vcpu, irq_num); -- cgit v1.2.2 From 9da48b5502622f9f0e49df957521ec43a0c9f4c1 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 14 Jun 2014 22:30:45 +0200 Subject: arm/arm64: KVM: vgic: Fix SGI writes to GICD_I{CS}PENDR0 Writes to GICD_ISPENDR0 and GICD_ICPENDR0 ignore all settings of the pending state for SGIs. Make sure the implementation handles this correctly. Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 435d8e7ad137..0039ae266a7b 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -454,7 +454,7 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg; + u32 *reg, orig; u32 level_mask; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; @@ -463,6 +463,7 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, /* Mark both level and edge triggered irqs as pending */ reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); + orig = *reg; vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); @@ -474,6 +475,12 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); *reg &= level_mask; + /* Ignore writes to SGIs */ + if (offset < 2) { + *reg &= ~0xffff; + *reg |= orig & 0xffff; + } + vgic_update_state(vcpu->kvm); return true; } @@ -486,10 +493,11 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, phys_addr_t offset) { u32 *level_active; - u32 *reg; + u32 *reg, orig; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); + orig = *reg; vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); if (mmio->is_write) { @@ -500,6 +508,12 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, vcpu->vcpu_id, offset); *reg |= *level_active; + /* Ignore writes to SGIs */ + if (offset < 2) { + *reg &= ~0xffff; + *reg |= orig & 0xffff; + } + /* Clear soft-pending flags */ reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, vcpu->vcpu_id, offset); -- cgit v1.2.2 From 7e362919a59e6fc60e08ad1cf0b047291d1ca2e9 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 14 Jun 2014 22:34:04 +0200 Subject: arm/arm64: KVM: vgic: Clarify and correct vgic documentation The VGIC virtual distributor implementation documentation was written a very long time ago, before the true nature of the beast had been partially absorbed into my bloodstream. Clarify the docs. Plus, it fixes an actual bug. ICFRn, pfff. Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 0039ae266a7b..37fd20d35759 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -36,21 +36,22 @@ * How the whole thing works (courtesy of Christoffer Dall): * * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if - * something is pending - * - VGIC pending interrupts are stored on the vgic.irq_pending vgic - * bitmap (this bitmap is updated by both user land ioctls and guest - * mmio ops, and other in-kernel peripherals such as the - * arch. timers) and indicate the 'wire' state. + * something is pending on the CPU interface. + * - Interrupts that are pending on the distributor are stored on the + * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land + * ioctls and guest mmio ops, and other in-kernel peripherals such as the + * arch. timers). * - Every time the bitmap changes, the irq_pending_on_cpu oracle is * recalculated * - To calculate the oracle, we need info for each cpu from * compute_pending_for_cpu, which considers: * - PPI: dist->irq_pending & dist->irq_enable * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target - * - irq_spi_target is a 'formatted' version of the GICD_ICFGR + * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn * registers, stored on each vcpu. We only keep one bit of * information per interrupt, making sure that only one vcpu can * accept the interrupt. + * - If any of the above state changes, we must recalculate the oracle. * - The same is true when injecting an interrupt, except that we only * consider a single interrupt at a time. The irq_spi_cpu array * contains the target CPU for each SPI. -- cgit v1.2.2 From 71afaba4a2e98bb7bdeba5078370ab43d46e67a1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:00 +0100 Subject: KVM: ARM: vgic: plug irq injection race As it stands, nothing prevents userspace from injecting an interrupt before the guest's GIC is actually initialized. This goes unnoticed so far (as everything is pretty much statically allocated), but ends up exploding in a spectacular way once we switch to a more dynamic allocation (the GIC data structure isn't there yet). The fix is to test for the "ready" flag in the VGIC distributor before trying to inject the interrupt. Note that in order to avoid breaking userspace, we have to ignore what is essentially an error. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 37fd20d35759..9bdf181a00e2 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1585,7 +1585,8 @@ out: int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, bool level) { - if (vgic_update_irq_pending(kvm, cpuid, irq_num, level)) + if (likely(vgic_initialized(kvm)) && + vgic_update_irq_pending(kvm, cpuid, irq_num, level)) vgic_kick_vcpus(kvm); return 0; -- cgit v1.2.2 From c1bfb577addd4867a82c4f235824a315d5afb94a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:01 +0100 Subject: arm/arm64: KVM: vgic: switch to dynamic allocation So far, all the VGIC data structures are statically defined by the *maximum* number of vcpus and interrupts it supports. It means that we always have to oversize it to cater for the worse case. Start by changing the data structures to be dynamically sizeable, and allocate them at runtime. The sizes are still very static though. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 243 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 208 insertions(+), 35 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 9bdf181a00e2..08db8764496a 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -95,6 +95,7 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); static void vgic_update_state(struct kvm *kvm); static void vgic_kick_vcpus(struct kvm *kvm); +static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi); static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); @@ -105,10 +106,8 @@ static const struct vgic_ops *vgic_ops; static const struct vgic_params *vgic; /* - * struct vgic_bitmap contains unions that provide two views of - * the same data. In one case it is an array of registers of - * u32's, and in the other case it is a bitmap of unsigned - * longs. + * struct vgic_bitmap contains a bitmap made of unsigned longs, but + * extracts u32s out of them. * * This does not work on 64-bit BE systems, because the bitmap access * will store two consecutive 32-bit words with the higher-addressed @@ -124,23 +123,45 @@ static const struct vgic_params *vgic; #define REG_OFFSET_SWIZZLE 0 #endif +static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs) +{ + int nr_longs; + + nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS); + + b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL); + if (!b->private) + return -ENOMEM; + + b->shared = b->private + nr_cpus; + + return 0; +} + +static void vgic_free_bitmap(struct vgic_bitmap *b) +{ + kfree(b->private); + b->private = NULL; + b->shared = NULL; +} + static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset) { offset >>= 2; if (!offset) - return x->percpu[cpuid].reg + (offset ^ REG_OFFSET_SWIZZLE); + return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE; else - return x->shared.reg + ((offset - 1) ^ REG_OFFSET_SWIZZLE); + return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE); } static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, int cpuid, int irq) { if (irq < VGIC_NR_PRIVATE_IRQS) - return test_bit(irq, x->percpu[cpuid].reg_ul); + return test_bit(irq, x->private + cpuid); - return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); + return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared); } static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, @@ -149,9 +170,9 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, unsigned long *reg; if (irq < VGIC_NR_PRIVATE_IRQS) { - reg = x->percpu[cpuid].reg_ul; + reg = x->private + cpuid; } else { - reg = x->shared.reg_ul; + reg = x->shared; irq -= VGIC_NR_PRIVATE_IRQS; } @@ -163,24 +184,49 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) { - if (unlikely(cpuid >= VGIC_MAX_CPUS)) - return NULL; - return x->percpu[cpuid].reg_ul; + return x->private + cpuid; } static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) { - return x->shared.reg_ul; + return x->shared; +} + +static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs) +{ + int size; + + size = nr_cpus * VGIC_NR_PRIVATE_IRQS; + size += nr_irqs - VGIC_NR_PRIVATE_IRQS; + + x->private = kzalloc(size, GFP_KERNEL); + if (!x->private) + return -ENOMEM; + + x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32); + return 0; +} + +static void vgic_free_bytemap(struct vgic_bytemap *b) +{ + kfree(b->private); + b->private = NULL; + b->shared = NULL; } static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) { - offset >>= 2; - BUG_ON(offset > (VGIC_NR_IRQS / 4)); - if (offset < 8) - return x->percpu[cpuid] + offset; - else - return x->shared + offset - 8; + u32 *reg; + + if (offset < VGIC_NR_PRIVATE_IRQS) { + reg = x->private; + offset += cpuid * VGIC_NR_PRIVATE_IRQS; + } else { + reg = x->shared; + offset -= VGIC_NR_PRIVATE_IRQS; + } + + return reg + (offset / sizeof(u32)); } #define VGIC_CFG_LEVEL 0 @@ -744,7 +790,7 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) */ vgic_dist_irq_set_pending(vcpu, lr.irq); if (lr.irq < VGIC_NR_SGIS) - dist->irq_sgi_sources[vcpu_id][lr.irq] |= 1 << lr.source; + *vgic_get_sgi_sources(dist, vcpu_id, lr.irq) |= 1 << lr.source; lr.state &= ~LR_STATE_PENDING; vgic_set_lr(vcpu, i, lr); @@ -778,7 +824,7 @@ static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, /* Copy source SGIs from distributor side */ for (sgi = min_sgi; sgi <= max_sgi; sgi++) { int shift = 8 * (sgi - min_sgi); - reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift; + reg |= ((u32)*vgic_get_sgi_sources(dist, vcpu_id, sgi)) << shift; } mmio_data_write(mmio, ~0, reg); @@ -802,14 +848,15 @@ static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, /* Clear pending SGIs on the distributor */ for (sgi = min_sgi; sgi <= max_sgi; sgi++) { u8 mask = reg >> (8 * (sgi - min_sgi)); + u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi); if (set) { - if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask) + if ((*src & mask) != mask) updated = true; - dist->irq_sgi_sources[vcpu_id][sgi] |= mask; + *src |= mask; } else { - if (dist->irq_sgi_sources[vcpu_id][sgi] & mask) + if (*src & mask) updated = true; - dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask; + *src &= ~mask; } } @@ -993,6 +1040,11 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, return true; } +static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi) +{ + return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi; +} + static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) { struct kvm *kvm = vcpu->kvm; @@ -1026,7 +1078,7 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) if (target_cpus & 1) { /* Flag the SGI as pending */ vgic_dist_irq_set_pending(vcpu, sgi); - dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; + *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id; kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); } @@ -1073,14 +1125,14 @@ static void vgic_update_state(struct kvm *kvm) int c; if (!dist->enabled) { - set_bit(0, &dist->irq_pending_on_cpu); + set_bit(0, dist->irq_pending_on_cpu); return; } kvm_for_each_vcpu(c, vcpu, kvm) { if (compute_pending_for_cpu(vcpu)) { pr_debug("CPU%d has pending interrupts\n", c); - set_bit(c, &dist->irq_pending_on_cpu); + set_bit(c, dist->irq_pending_on_cpu); } } } @@ -1237,14 +1289,14 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) int vcpu_id = vcpu->vcpu_id; int c; - sources = dist->irq_sgi_sources[vcpu_id][irq]; + sources = *vgic_get_sgi_sources(dist, vcpu_id, irq); for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { if (vgic_queue_irq(vcpu, c, irq)) clear_bit(c, &sources); } - dist->irq_sgi_sources[vcpu_id][irq] = sources; + *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources; /* * If the sources bitmap has been cleared it means that we @@ -1332,7 +1384,7 @@ epilog: * us. Claim we don't have anything pending. We'll * adjust that if needed while exiting. */ - clear_bit(vcpu_id, &dist->irq_pending_on_cpu); + clear_bit(vcpu_id, dist->irq_pending_on_cpu); } } @@ -1430,7 +1482,7 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) /* Check if we still have something up our sleeve... */ pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); if (level_pending || pending < vgic->nr_lr) - set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); + set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); } void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) @@ -1464,7 +1516,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm)) return 0; - return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); + return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); } static void vgic_kick_vcpus(struct kvm *kvm) @@ -1559,7 +1611,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid, if (level) { vgic_cpu_irq_set(vcpu, irq_num); - set_bit(cpuid, &dist->irq_pending_on_cpu); + set_bit(cpuid, dist->irq_pending_on_cpu); } out: @@ -1603,6 +1655,32 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data) return IRQ_HANDLED; } +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + kfree(vgic_cpu->pending_shared); + kfree(vgic_cpu->vgic_irq_lr_map); + vgic_cpu->pending_shared = NULL; + vgic_cpu->vgic_irq_lr_map = NULL; +} + +static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8; + vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); + vgic_cpu->vgic_irq_lr_map = kzalloc(nr_irqs, GFP_KERNEL); + + if (!vgic_cpu->pending_shared || !vgic_cpu->vgic_irq_lr_map) { + kvm_vgic_vcpu_destroy(vcpu); + return -ENOMEM; + } + + return 0; +} + /** * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state * @vcpu: pointer to the vcpu struct @@ -1642,6 +1720,97 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) return 0; } +void kvm_vgic_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_destroy(vcpu); + + vgic_free_bitmap(&dist->irq_enabled); + vgic_free_bitmap(&dist->irq_level); + vgic_free_bitmap(&dist->irq_pending); + vgic_free_bitmap(&dist->irq_soft_pend); + vgic_free_bitmap(&dist->irq_queued); + vgic_free_bitmap(&dist->irq_cfg); + vgic_free_bytemap(&dist->irq_priority); + if (dist->irq_spi_target) { + for (i = 0; i < dist->nr_cpus; i++) + vgic_free_bitmap(&dist->irq_spi_target[i]); + } + kfree(dist->irq_sgi_sources); + kfree(dist->irq_spi_cpu); + kfree(dist->irq_spi_target); + kfree(dist->irq_pending_on_cpu); + dist->irq_sgi_sources = NULL; + dist->irq_spi_cpu = NULL; + dist->irq_spi_target = NULL; + dist->irq_pending_on_cpu = NULL; +} + +/* + * Allocate and initialize the various data structures. Must be called + * with kvm->lock held! + */ +static int vgic_init_maps(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int nr_cpus, nr_irqs; + int ret, i; + + nr_cpus = dist->nr_cpus = VGIC_MAX_CPUS; + nr_irqs = dist->nr_irqs = VGIC_NR_IRQS; + + ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs); + ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs); + + if (ret) + goto out; + + dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL); + dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL); + dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus, + GFP_KERNEL); + dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long), + GFP_KERNEL); + if (!dist->irq_sgi_sources || + !dist->irq_spi_cpu || + !dist->irq_spi_target || + !dist->irq_pending_on_cpu) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < nr_cpus; i++) + ret |= vgic_init_bitmap(&dist->irq_spi_target[i], + nr_cpus, nr_irqs); + + if (ret) + goto out; + + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_vcpu_init_maps(vcpu, nr_irqs); + if (ret) { + kvm_err("VGIC: Failed to allocate vcpu memory\n"); + break; + } + } + +out: + if (ret) + kvm_vgic_destroy(kvm); + + return ret; +} + /** * kvm_vgic_init - Initialize global VGIC state before running any VCPUs * @kvm: pointer to the kvm struct @@ -1722,6 +1891,10 @@ int kvm_vgic_create(struct kvm *kvm) kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + ret = vgic_init_maps(kvm); + if (ret) + kvm_err("Unable to allocate maps\n"); + out_unlock: for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); -- cgit v1.2.2 From fb65ab63b8cae510ea1e43e68b5da2f9980aa6d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:02 +0100 Subject: arm/arm64: KVM: vgic: Parametrize VGIC_NR_SHARED_IRQS Having a dynamic number of supported interrupts means that we cannot relly on VGIC_NR_SHARED_IRQS being fixed anymore. Instead, make it take the distributor structure as a parameter, so it can return the right value. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 08db8764496a..7d64dc242afc 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1086,11 +1086,17 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) } } +static int vgic_nr_shared_irqs(struct vgic_dist *dist) +{ + return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; +} + static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; unsigned long *pending, *enabled, *pend_percpu, *pend_shared; unsigned long pending_private, pending_shared; + int nr_shared = vgic_nr_shared_irqs(dist); int vcpu_id; vcpu_id = vcpu->vcpu_id; @@ -1103,15 +1109,15 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) pending = vgic_bitmap_get_shared_map(&dist->irq_pending); enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); + bitmap_and(pend_shared, pending, enabled, nr_shared); bitmap_and(pend_shared, pend_shared, vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - VGIC_NR_SHARED_IRQS); + nr_shared); pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); - pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); + pending_shared = find_first_bit(pend_shared, nr_shared); return (pending_private < VGIC_NR_PRIVATE_IRQS || - pending_shared < VGIC_NR_SHARED_IRQS); + pending_shared < vgic_nr_shared_irqs(dist)); } /* @@ -1368,7 +1374,7 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) } /* SPIs */ - for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { + for_each_set_bit(i, vgic_cpu->pending_shared, vgic_nr_shared_irqs(dist)) { if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) overflow = 1; } -- cgit v1.2.2 From fc675e355e705a046df7b635d3f3330c0ad94569 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:03 +0100 Subject: arm/arm64: KVM: vgic: kill VGIC_MAX_CPUS We now have the information about the number of CPU interfaces in the distributor itself. Let's get rid of VGIC_MAX_CPUS, and just rely on KVM_MAX_VCPUS where we don't have the choice. Yet. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 7d64dc242afc..599ad17e5436 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1297,7 +1297,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) sources = *vgic_get_sgi_sources(dist, vcpu_id, irq); - for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { + for_each_set_bit(c, &sources, dist->nr_cpus) { if (vgic_queue_irq(vcpu, c, irq)) clear_bit(c, &sources); } @@ -1700,7 +1700,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int i; - if (vcpu->vcpu_id >= VGIC_MAX_CPUS) + if (vcpu->vcpu_id >= dist->nr_cpus) return -EBUSY; for (i = 0; i < VGIC_NR_IRQS; i++) { @@ -1767,7 +1767,7 @@ static int vgic_init_maps(struct kvm *kvm) int nr_cpus, nr_irqs; int ret, i; - nr_cpus = dist->nr_cpus = VGIC_MAX_CPUS; + nr_cpus = dist->nr_cpus = KVM_MAX_VCPUS; nr_irqs = dist->nr_irqs = VGIC_NR_IRQS; ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs); -- cgit v1.2.2 From c3c918361adcceb816c92b21dd95d2b46fb96a8f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:04 +0100 Subject: arm/arm64: KVM: vgic: handle out-of-range MMIO accesses Now that we can (almost) dynamically size the number of interrupts, we're facing an interesting issue: We have to evaluate at runtime whether or not an access hits a valid register, based on the sizing of this particular instance of the distributor. Furthermore, the GIC spec says that accessing a reserved register is RAZ/WI. For this, add a new field to our range structure, indicating the number of bits a single interrupts uses. That allows us to find out whether or not the access is in range. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 56 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 11 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 599ad17e5436..973eaf7ebe98 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -895,6 +895,7 @@ static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, struct mmio_range { phys_addr_t base; unsigned long len; + int bits_per_irq; bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset); }; @@ -903,56 +904,67 @@ static const struct mmio_range vgic_dist_ranges[] = { { .base = GIC_DIST_CTRL, .len = 12, + .bits_per_irq = 0, .handle_mmio = handle_mmio_misc, }, { .base = GIC_DIST_IGROUP, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_DIST_ENABLE_SET, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_set_enable_reg, }, { .base = GIC_DIST_ENABLE_CLEAR, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_clear_enable_reg, }, { .base = GIC_DIST_PENDING_SET, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_set_pending_reg, }, { .base = GIC_DIST_PENDING_CLEAR, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_clear_pending_reg, }, { .base = GIC_DIST_ACTIVE_SET, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_DIST_ACTIVE_CLEAR, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_DIST_PRI, - .len = VGIC_NR_IRQS, + .len = VGIC_MAX_IRQS, + .bits_per_irq = 8, .handle_mmio = handle_mmio_priority_reg, }, { .base = GIC_DIST_TARGET, - .len = VGIC_NR_IRQS, + .len = VGIC_MAX_IRQS, + .bits_per_irq = 8, .handle_mmio = handle_mmio_target_reg, }, { .base = GIC_DIST_CONFIG, - .len = VGIC_NR_IRQS / 4, + .len = VGIC_MAX_IRQS / 4, + .bits_per_irq = 2, .handle_mmio = handle_mmio_cfg_reg, }, { @@ -990,6 +1002,22 @@ struct mmio_range *find_matching_range(const struct mmio_range *ranges, return NULL; } +static bool vgic_validate_access(const struct vgic_dist *dist, + const struct mmio_range *range, + unsigned long offset) +{ + int irq; + + if (!range->bits_per_irq) + return true; /* Not an irq-based access */ + + irq = offset * 8 / range->bits_per_irq; + if (irq >= dist->nr_irqs) + return false; + + return true; +} + /** * vgic_handle_mmio - handle an in-kernel MMIO access * @vcpu: pointer to the vcpu performing the access @@ -1029,7 +1057,13 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, spin_lock(&vcpu->kvm->arch.vgic.lock); offset = mmio->phys_addr - range->base - base; - updated_state = range->handle_mmio(vcpu, mmio, offset); + if (vgic_validate_access(dist, range, offset)) { + updated_state = range->handle_mmio(vcpu, mmio, offset); + } else { + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + updated_state = false; + } spin_unlock(&vcpu->kvm->arch.vgic.lock); kvm_prepare_mmio(run, mmio); kvm_handle_mmio_return(vcpu, run); -- cgit v1.2.2 From 5fb66da64064d0cb8dcce4cc8bf4cb1b921b13a0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:05 +0100 Subject: arm/arm64: KVM: vgic: kill VGIC_NR_IRQS Nuke VGIC_NR_IRQS entierly, now that the distributor instance contains the number of IRQ allocated to this GIC. Also add VGIC_NR_IRQS_LEGACY to preserve the current API. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 973eaf7ebe98..725d829ad1d9 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -439,7 +439,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu, case 4: /* GICD_TYPER */ reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - reg |= (VGIC_NR_IRQS >> 5) - 1; + reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1; vgic_reg_access(mmio, ®, word_offset, ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); break; @@ -1277,13 +1277,14 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_lr vlr; int lr; /* Sanitize the input... */ BUG_ON(sgi_source_id & ~7); BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); - BUG_ON(irq >= VGIC_NR_IRQS); + BUG_ON(irq >= dist->nr_irqs); kvm_debug("Queue IRQ%d\n", irq); @@ -1515,7 +1516,7 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) vlr = vgic_get_lr(vcpu, lr); - BUG_ON(vlr.irq >= VGIC_NR_IRQS); + BUG_ON(vlr.irq >= dist->nr_irqs); vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY; } @@ -1737,7 +1738,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) if (vcpu->vcpu_id >= dist->nr_cpus) return -EBUSY; - for (i = 0; i < VGIC_NR_IRQS; i++) { + for (i = 0; i < dist->nr_irqs; i++) { if (i < VGIC_NR_PPIS) vgic_bitmap_set_irq_val(&dist->irq_enabled, vcpu->vcpu_id, i, 1); @@ -1802,7 +1803,11 @@ static int vgic_init_maps(struct kvm *kvm) int ret, i; nr_cpus = dist->nr_cpus = KVM_MAX_VCPUS; - nr_irqs = dist->nr_irqs = VGIC_NR_IRQS; + + if (!dist->nr_irqs) + dist->nr_irqs = VGIC_NR_IRQS_LEGACY; + + nr_irqs = dist->nr_irqs; ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs); ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs); @@ -1886,7 +1891,7 @@ int kvm_vgic_init(struct kvm *kvm) goto out; } - for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) + for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4) vgic_set_target_reg(kvm, 0, i); kvm->arch.vgic.ready = true; -- cgit v1.2.2 From 4956f2bc1fdee4bc336532f3f34635a8534cedfd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:06 +0100 Subject: arm/arm64: KVM: vgic: delay vgic allocation until init time It is now quite easy to delay the allocation of the vgic tables until we actually require it to be up and running (when the first vcpu is kicking around, or someones tries to access the GIC registers). This allow us to allocate memory for the exact number of CPUs we have. As nobody configures the number of interrupts just yet, use a fallback to VGIC_NR_IRQS_LEGACY. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 725d829ad1d9..ac2b44d58e60 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1729,15 +1729,12 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to * this vcpu and enable the VGIC for this VCPU */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int i; - if (vcpu->vcpu_id >= dist->nr_cpus) - return -EBUSY; - for (i = 0; i < dist->nr_irqs; i++) { if (i < VGIC_NR_PPIS) vgic_bitmap_set_irq_val(&dist->irq_enabled, @@ -1757,8 +1754,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) vgic_cpu->nr_lr = vgic->nr_lr; vgic_enable(vcpu); - - return 0; } void kvm_vgic_destroy(struct kvm *kvm) @@ -1802,8 +1797,17 @@ static int vgic_init_maps(struct kvm *kvm) int nr_cpus, nr_irqs; int ret, i; - nr_cpus = dist->nr_cpus = KVM_MAX_VCPUS; + if (dist->nr_cpus) /* Already allocated */ + return 0; + + nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus); + if (!nr_cpus) /* No vcpus? Can't be good... */ + return -EINVAL; + /* + * If nobody configured the number of interrupts, use the + * legacy one. + */ if (!dist->nr_irqs) dist->nr_irqs = VGIC_NR_IRQS_LEGACY; @@ -1849,6 +1853,9 @@ static int vgic_init_maps(struct kvm *kvm) } } + for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4) + vgic_set_target_reg(kvm, 0, i); + out: if (ret) kvm_vgic_destroy(kvm); @@ -1867,6 +1874,7 @@ out: */ int kvm_vgic_init(struct kvm *kvm) { + struct kvm_vcpu *vcpu; int ret = 0, i; if (!irqchip_in_kernel(kvm)) @@ -1884,6 +1892,12 @@ int kvm_vgic_init(struct kvm *kvm) goto out; } + ret = vgic_init_maps(kvm); + if (ret) { + kvm_err("Unable to allocate maps\n"); + goto out; + } + ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, vgic->vcpu_base, KVM_VGIC_V2_CPU_SIZE); if (ret) { @@ -1891,11 +1905,13 @@ int kvm_vgic_init(struct kvm *kvm) goto out; } - for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4) - vgic_set_target_reg(kvm, 0, i); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_init(vcpu); kvm->arch.vgic.ready = true; out: + if (ret) + kvm_vgic_destroy(kvm); mutex_unlock(&kvm->lock); return ret; } @@ -1936,10 +1952,6 @@ int kvm_vgic_create(struct kvm *kvm) kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - ret = vgic_init_maps(kvm); - if (ret) - kvm_err("Unable to allocate maps\n"); - out_unlock: for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); @@ -2140,6 +2152,10 @@ static int vgic_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); + ret = vgic_init_maps(dev->kvm); + if (ret) + goto out; + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { ret = -EINVAL; goto out; -- cgit v1.2.2 From a98f26f183801685ef57333de4bafd4bbc692c7c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2014 12:09:07 +0100 Subject: arm/arm64: KVM: vgic: make number of irqs a configurable attribute In order to make the number of interrupts configurable, use the new fancy device management API to add KVM_DEV_ARM_VGIC_GRP_NR_IRQS as a VGIC configurable attribute. Userspace can now specify the exact size of the GIC (by increments of 32 interrupts). Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index ac2b44d58e60..b6fab0f25f3b 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -2253,6 +2253,36 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return vgic_attr_regs_access(dev, attr, ®, true); } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 val; + int ret = 0; + + if (get_user(val, uaddr)) + return -EFAULT; + + /* + * We require: + * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs + * - at most 1024 interrupts + * - a multiple of 32 interrupts + */ + if (val < (VGIC_NR_PRIVATE_IRQS + 32) || + val > VGIC_MAX_IRQS || + (val & 31)) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (vgic_initialized(dev->kvm) || dev->kvm->arch.vgic.nr_irqs) + ret = -EBUSY; + else + dev->kvm->arch.vgic.nr_irqs = val; + + mutex_unlock(&dev->kvm->lock); + + return ret; + } } @@ -2289,6 +2319,11 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = put_user(reg, uaddr); break; } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr); + break; + } } @@ -2325,6 +2360,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; return vgic_has_attr_regs(vgic_cpu_ranges, offset); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; } return -ENXIO; } -- cgit v1.2.2 From 29f1b65b5984c1e35e2d60d1416d03cee0b91ee2 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 22 Sep 2014 23:33:08 +0200 Subject: KVM: EVENTFD: Remove inclusion of irq.h Commit c77dcac (KVM: Move more code under CONFIG_HAVE_KVM_IRQFD) added functionality that depends on definitions in ioapic.h when __KVM_HAVE_IOAPIC is defined. At the same time, kvm-arm commit 0ba0951 (KVM: EVENTFD: remove inclusion of irq.h) removed the inclusion of irq.h, an architecture-specific header that is not present on ARM but which happened to include ioapic.h on x86. Include ioapic.h directly in eventfd.c if __KVM_HAVE_IOAPIC is defined. This fixes x86 and lets ARM use eventfd.c. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 3c5981c87c3f..b0fb390943c6 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -36,7 +36,9 @@ #include #include -#include "irq.h" +#ifdef __KVM_HAVE_IOAPIC +#include "ioapic.h" +#endif #include "iodev.h" #ifdef CONFIG_HAVE_KVM_IRQFD -- cgit v1.2.2 From 3c3c29fd0d7cddc32862c350d0700ce69953e3bd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Sep 2014 13:02:46 +0200 Subject: kvm-vfio: do not use module_init /me got confused between the kernel and QEMU. In the kernel, you can only have one module_init function, and it will prevent unloading the module unless you also have the corresponding module_exit function. So, commit 80ce1639727e (KVM: VFIO: register kvm_device_ops dynamically, 2014-09-02) broke unloading of the kvm module, by adding a module_init function and no module_exit. Repair it by making kvm_vfio_ops_init weak, and checking it in kvm_init. Cc: Will Deacon Cc: Gleb Natapov Cc: Alex Williamson Fixes: 80ce1639727e9d38729c34f162378508c307ca25 Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++++ virt/kvm/vfio.c | 4 ++-- virt/kvm/vfio.h | 13 +++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 virt/kvm/vfio.h (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index db57363cc287..499db0977f3c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -57,6 +57,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "vfio.h" #define CREATE_TRACE_POINTS #include @@ -3226,6 +3227,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_undebugfs; } + r = kvm_vfio_ops_init(); + WARN_ON(r); + return 0; out_undebugfs: diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index bb11b36ee8a2..281e7cf2b8e5 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -18,6 +18,7 @@ #include #include #include +#include "vfio.h" struct kvm_vfio_group { struct list_head node; @@ -278,8 +279,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) return 0; } -static int __init kvm_vfio_ops_init(void) +int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } -module_init(kvm_vfio_ops_init); diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h new file mode 100644 index 000000000000..92eac75d6b62 --- /dev/null +++ b/virt/kvm/vfio.h @@ -0,0 +1,13 @@ +#ifndef __KVM_VFIO_H +#define __KVM_VFIO_H + +#ifdef CONFIG_KVM_VFIO +int kvm_vfio_ops_init(void); +#else +static inline int kvm_vfio_ops_init(void) +{ + return 0; +} +#endif + +#endif -- cgit v1.2.2 From 234b239bea395316d7f78018c672f4a88b3cdf0d Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Wed, 17 Sep 2014 10:51:48 -0700 Subject: kvm: Faults which trigger IO release the mmap_sem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been swapped out or is behind a filemap, this will trigger async readahead and return immediately. The rationale is that KVM will kick back the guest with an "async page fault" and allow for some other guest process to take over. If async PFs are enabled the fault is retried asap from an async workqueue. If not, it's retried immediately in the same code path. In either case the retry will not relinquish the mmap semaphore and will block on the IO. This is a bad thing, as other mmap semaphore users now stall as a function of swap or filemap latency. This patch ensures both the regular and async PF path re-enter the fault allowing for the mmap semaphore to be relinquished in the case of IO wait. Reviewed-by: Radim Krčmář Signed-off-by: Andres Lagar-Cavilla Acked-by: Andrew Morton Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 4 +--- virt/kvm/kvm_main.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 6 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index d6a3d0993d88..5ff7f7f2689a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - down_read(&mm->mmap_sem); - get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL); - up_read(&mm->mmap_sem); + kvm_get_user_page_io(NULL, mm, addr, 1, NULL); kvm_async_page_present_sync(vcpu, apf); spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 499db0977f3c..1c6e8476b244 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1122,6 +1122,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); } +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, bool write_fault, + struct page **pagep) +{ + int npages; + int locked = 1; + int flags = FOLL_TOUCH | FOLL_HWPOISON | + (pagep ? FOLL_GET : 0) | + (write_fault ? FOLL_WRITE : 0); + + /* + * If retrying the fault, we get here *not* having allowed the filemap + * to wait on the page lock. We should now allow waiting on the IO with + * the mmap semaphore released. + */ + down_read(&mm->mmap_sem); + npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, + &locked); + if (!locked) { + VM_BUG_ON(npages != -EBUSY); + + if (!pagep) + return 0; + + /* + * The previous call has now waited on the IO. Now we can + * retry and complete. Pass TRIED to ensure we do not re + * schedule async IO (see e.g. filemap_fault). + */ + down_read(&mm->mmap_sem); + npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, + pagep, NULL, NULL); + } + up_read(&mm->mmap_sem); + return npages; +} + static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; @@ -1184,9 +1221,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(current, current->mm, addr, write_fault, page); up_read(¤t->mm->mmap_sem); - } else - npages = get_user_pages_fast(addr, 1, write_fault, - page); + } else { + /* + * By now we have tried gup_fast, and possibly async_pf, and we + * are certainly not atomic. Time to retry the gup, allowing + * mmap semaphore to be relinquished in the case of IO. + */ + npages = kvm_get_user_page_io(current, current->mm, addr, + write_fault, page); + } if (npages != 1) return npages; -- cgit v1.2.2 From 2ea75be3219571d0ec009ce20d9971e54af96e09 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 19 Sep 2014 16:03:25 -0700 Subject: kvm: don't take vcpu mutex for obviously invalid vcpu ioctls vcpu ioctls can hang the calling thread if issued while a vcpu is running. However, invalid ioctls can happen when userspace tries to probe the kind of file descriptors (e.g. isatty() calls ioctl(TCGETS)); in that case, we know the ioctl is going to be rejected as invalid anyway and we can fail before trying to take the vcpu mutex. This patch does not change functionality, it just makes invalid ioctls fail faster. Cc: stable@vger.kernel.org Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1c6e8476b244..ff42b11d2b9c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -52,6 +52,7 @@ #include #include +#include #include #include @@ -2032,6 +2033,9 @@ static long kvm_vcpu_ioctl(struct file *filp, if (vcpu->kvm->mm != current->mm) return -EIO; + if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) + return -EINVAL; + #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) /* * Special cases: vcpu ioctls that are asynchronous to vcpu execution, -- cgit v1.2.2 From 57128468080a8b6ea452223036d3e417f748af55 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Mon, 22 Sep 2014 14:54:42 -0700 Subject: kvm: Fix page ageing bugs 1. We were calling clear_flush_young_notify in unmap_one, but we are within an mmu notifier invalidate range scope. The spte exists no more (due to range_start) and the accessed bit info has already been propagated (due to kvm_pfn_set_accessed). Simply call clear_flush_young. 2. We clear_flush_young on a primary MMU PMD, but this may be mapped as a collection of PTEs by the secondary MMU (e.g. during log-dirty). This required expanding the interface of the clear_flush_young mmu notifier, so a lot of code has been trivially touched. 3. In the absence of shadow_accessed_mask (e.g. EPT A bit), we emulate the access bit by blowing the spte. This requires proper synchronizing with MMU notifier consumers, like every other removal of spte's does. Signed-off-by: Andres Lagar-Cavilla Acked-by: Rik van Riel Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ff42b11d2b9c..0316314d48f4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -369,7 +369,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -377,7 +378,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - young = kvm_age_hva(kvm, address); + young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); -- cgit v1.2.2 From 445b8236959bfe624a5aa9bce89f44a3bec9b2b1 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 24 Sep 2014 15:57:55 +0800 Subject: kvm: Rename make_all_cpus_request() to kvm_make_all_cpus_request() and make it non-static Different architectures need different requests, and in fact we will use this function in architecture-specific code later. This will be outside kvm_main.c, so make it non-static and rename it to kvm_make_all_cpus_request(). Reviewed-by: Paolo Bonzini Signed-off-by: Tang Chen Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0316314d48f4..5b8ca365932a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -153,7 +153,7 @@ static void ack_flush(void *_completed) { } -static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { int i, cpu, me; cpumask_var_t cpus; @@ -190,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) long dirty_count = kvm->tlbs_dirty; smp_mb(); - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } @@ -198,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_reload_remote_mmus(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } void kvm_make_mclock_inprogress_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } void kvm_make_scan_ioapic_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) -- cgit v1.2.2 From fe71557afbec641fee73711e40602bed37f6f33b Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 24 Sep 2014 15:57:57 +0800 Subject: kvm: Add arch specific mmu notifier for page invalidation This will be used to let the guest run while the APIC access page is not pinned. Because subsequent patches will fill in the function for x86, place the (still empty) x86 implementation in the x86.c file instead of adding an inline function in kvm_host.h. Signed-off-by: Tang Chen Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b8ca365932a..3f16f569169e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -296,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + kvm_arch_mmu_notifier_invalidate_page(kvm, address); + srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.2 From 0fea6d7628ed6e25a9ee1b67edf7c859718d39e8 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 25 Sep 2014 18:41:07 +0200 Subject: arm/arm64: KVM: Fix set_clear_sgi_pend_reg offset The sgi values calculated in read_set_clear_sgi_pend_reg() and write_set_clear_sgi_pend_reg() were horribly incorrectly multiplied by 4 with catastrophic results in that subfunctions ended up overwriting memory not allocated for the expected purpose. This showed up as bugs in kfree() and the kernel complaining a lot of you turn on memory debugging. This addresses: http://marc.info/?l=kvm&m=141164910007868&w=2 Reported-by: Shannon Zhao Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt/kvm') diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index b6fab0f25f3b..862967852d5a 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -816,7 +816,7 @@ static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int sgi; - int min_sgi = (offset & ~0x3) * 4; + int min_sgi = (offset & ~0x3); int max_sgi = min_sgi + 3; int vcpu_id = vcpu->vcpu_id; u32 reg = 0; @@ -837,7 +837,7 @@ static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int sgi; - int min_sgi = (offset & ~0x3) * 4; + int min_sgi = (offset & ~0x3); int max_sgi = min_sgi + 3; int vcpu_id = vcpu->vcpu_id; u32 reg; -- cgit v1.2.2 From bb0ca6acd466af55c95b7ce508f29e23a24cabd9 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Thu, 25 Sep 2014 15:26:50 -0700 Subject: kvm: Fix kvm_get_page_retry_io __gup retval check Confusion around -EBUSY and zero (inside a BUG_ON no less). Reported-by: Andrea Arcangeli Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt/kvm') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3f16f569169e..a1cf53ee0d28 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1146,7 +1146,7 @@ int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, &locked); if (!locked) { - VM_BUG_ON(npages != -EBUSY); + VM_BUG_ON(npages); if (!pagep) return 0; -- cgit v1.2.2