author		Radim Krčmář <rkrcmar@redhat.com>	2018-01-31 07:34:41 -0500
committer	Radim Krčmář <rkrcmar@redhat.com>	2018-01-31 07:34:41 -0500
commit		e53175395d7e12d8474707271bc02a2814279843 (patch)
tree		ca6a0fc846cffb1b6db999a4595998c160333cf0 /virt
parent		810f4600ec5ee79c68dcbb136ed26a652df46348 (diff)
parent		cd15d2050c044ca9525ba165e9073ac8e036b8d0 (diff)
Merge tag 'kvm-arm-for-v4.16' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm
KVM/ARM Changes for v4.16
The changes for this version include icache invalidation optimizations
(improving VM startup time), support for forwarded level-triggered
interrupts (improving performance for timers and passthrough platform
devices), a small fix for power-management notifiers, and some cosmetic
changes.
Diffstat (limited to 'virt')

 virt/kvm/arm/arch_timer.c     | 137
 virt/kvm/arm/arm.c            |  63
 virt/kvm/arm/hyp/vgic-v2-sr.c |   1
 virt/kvm/arm/mmu.c            |  64
 virt/kvm/arm/vgic/vgic-its.c  |   4
 virt/kvm/arm/vgic/vgic-mmio.c | 115
 virt/kvm/arm/vgic/vgic-v2.c   |  29
 virt/kvm/arm/vgic/vgic-v3.c   |  29
 virt/kvm/arm/vgic/vgic.c      |  41
 virt/kvm/arm/vgic/vgic.h      |   8

 10 files changed, 368 insertions(+), 123 deletions(-)
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index f9555b1e7f15..fb6bd9b9845e 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -97,15 +97,13 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 		pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n");
 		return IRQ_NONE;
 	}
-	vtimer = vcpu_vtimer(vcpu);
 
-	if (!vtimer->irq.level) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		if (kvm_timer_irq_can_fire(vtimer))
-			kvm_timer_update_irq(vcpu, true, vtimer);
-	}
+	vtimer = vcpu_vtimer(vcpu);
+	if (kvm_timer_should_fire(vtimer))
+		kvm_timer_update_irq(vcpu, true, vtimer);
 
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+	if (static_branch_unlikely(&userspace_irqchip_in_use) &&
+	    unlikely(!irqchip_in_kernel(vcpu->kvm)))
 		kvm_vtimer_update_mask_user(vcpu);
 
 	return IRQ_HANDLED;
@@ -231,6 +229,16 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 {
 	u64 cval, now;
 
+	if (timer_ctx->loaded) {
+		u32 cnt_ctl;
+
+		/* Only the virtual timer can be loaded so far */
+		cnt_ctl = read_sysreg_el0(cntv_ctl);
+		return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
+			(cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
+		       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
+	}
+
 	if (!kvm_timer_irq_can_fire(timer_ctx))
 		return false;
 
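The new `timer_ctx->loaded` branch above answers "should this timer fire?" from the live CNTV_CTL_EL0 value rather than from cached software state. A standalone sketch of the architectural test it performs (bit positions are as the ARM ARM defines them; the harness around the check is illustrative only):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* CNTV_CTL_EL0 bits, per the ARM ARM */
#define ARCH_TIMER_CTRL_ENABLE  (1u << 0)	/* timer enabled */
#define ARCH_TIMER_CTRL_IT_MASK (1u << 1)	/* output masked */
#define ARCH_TIMER_CTRL_IT_STAT (1u << 2)	/* timer condition met */

/* The test the new loaded path performs on the live register value */
bool timer_output_asserted(uint32_t cnt_ctl)
{
	return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
		(cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
	       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
}

int main(void)
{
	/* enabled + condition met + unmasked => line asserted */
	printf("%d\n", timer_output_asserted(ARCH_TIMER_CTRL_ENABLE |
					     ARCH_TIMER_CTRL_IT_STAT)); /* 1 */
	/* a masked timer never asserts its output, whatever ISTATUS says */
	printf("%d\n", timer_output_asserted(ARCH_TIMER_CTRL_ENABLE |
					     ARCH_TIMER_CTRL_IT_STAT |
					     ARCH_TIMER_CTRL_IT_MASK)); /* 0 */
	return 0;
}
```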
@@ -245,15 +253,7 @@ bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
-	if (vtimer->irq.level || ptimer->irq.level)
-		return true;
-
-	/*
-	 * When this is called from withing the wait loop of kvm_vcpu_block(),
-	 * the software view of the timer state is up to date (timer->loaded
-	 * is false), and so we can simply check if the timer should fire now.
-	 */
-	if (!vtimer->loaded && kvm_timer_should_fire(vtimer))
+	if (kvm_timer_should_fire(vtimer))
 		return true;
 
 	return kvm_timer_should_fire(ptimer);
@@ -271,9 +271,9 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu)
 	/* Populate the device bitmap with the timer states */
 	regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
 				    KVM_ARM_DEV_EL1_PTIMER);
-	if (vtimer->irq.level)
+	if (kvm_timer_should_fire(vtimer))
 		regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
-	if (ptimer->irq.level)
+	if (kvm_timer_should_fire(ptimer))
 		regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
 }
 
@@ -286,7 +286,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
 				   timer_ctx->irq.level);
 
-	if (likely(irqchip_in_kernel(vcpu->kvm))) {
+	if (!static_branch_unlikely(&userspace_irqchip_in_use) ||
+	    likely(irqchip_in_kernel(vcpu->kvm))) {
 		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
 					  timer_ctx->irq.irq,
 					  timer_ctx->irq.level,
@@ -324,12 +325,20 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	bool level;
 
 	if (unlikely(!timer->enabled))
 		return;
 
-	if (kvm_timer_should_fire(vtimer) != vtimer->irq.level)
-		kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer);
+	/*
+	 * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part
+	 * of its lifecycle is offloaded to the hardware, and we therefore may
+	 * not have lowered the irq.level value before having to signal a new
+	 * interrupt, but have to signal an interrupt every time the level is
+	 * asserted.
+	 */
+	level = kvm_timer_should_fire(vtimer);
+	kvm_timer_update_irq(vcpu, level, vtimer);
 
 	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
 		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
@@ -337,6 +346,12 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 		phys_timer_emulate(vcpu);
 }
 
+static void __timer_snapshot_state(struct arch_timer_context *timer)
+{
+	timer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+	timer->cnt_cval = read_sysreg_el0(cntv_cval);
+}
+
 static void vtimer_save_state(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -348,10 +363,8 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
 	if (!vtimer->loaded)
 		goto out;
 
-	if (timer->enabled) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
-	}
+	if (timer->enabled)
+		__timer_snapshot_state(vtimer);
 
 	/* Disable the virtual timer */
 	write_sysreg_el0(0, cntv_ctl);
@@ -448,8 +461,7 @@ static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
 	bool phys_active;
 	int ret;
 
-	phys_active = vtimer->irq.level ||
-		      kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+	phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
 
 	ret = irq_set_irqchip_state(host_vtimer_irq,
 				    IRQCHIP_STATE_ACTIVE,
@@ -496,8 +508,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
 	vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
 	plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
 
-	return vtimer->irq.level != vlevel ||
-	       ptimer->irq.level != plevel;
+	return kvm_timer_should_fire(vtimer) != vlevel ||
+	       kvm_timer_should_fire(ptimer) != plevel;
 }
 
 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
@@ -529,54 +541,27 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	set_cntvoff(0);
 }
 
-static void unmask_vtimer_irq(struct kvm_vcpu *vcpu)
+/*
+ * With a userspace irqchip we have to check if the guest de-asserted the
+ * timer and if so, unmask the timer irq signal on the host interrupt
+ * controller to ensure that we see future timer signals.
+ */
+static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 
 	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-		kvm_vtimer_update_mask_user(vcpu);
-		return;
-	}
-
-	/*
-	 * If the guest disabled the timer without acking the interrupt, then
-	 * we must make sure the physical and virtual active states are in
-	 * sync by deactivating the physical interrupt, because otherwise we
-	 * wouldn't see the next timer interrupt in the host.
-	 */
-	if (!kvm_vgic_map_is_active(vcpu, vtimer->irq.irq)) {
-		int ret;
-		ret = irq_set_irqchip_state(host_vtimer_irq,
-					    IRQCHIP_STATE_ACTIVE,
-					    false);
-		WARN_ON(ret);
+		__timer_snapshot_state(vtimer);
+		if (!kvm_timer_should_fire(vtimer)) {
+			kvm_timer_update_irq(vcpu, false, vtimer);
+			kvm_vtimer_update_mask_user(vcpu);
+		}
 	}
 }
 
-/**
- * kvm_timer_sync_hwstate - sync timer state from cpu
- * @vcpu: The vcpu pointer
- *
- * Check if any of the timers have expired while we were running in the guest,
- * and inject an interrupt if that was the case.
- */
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	/*
-	 * If we entered the guest with the vtimer output asserted we have to
-	 * check if the guest has modified the timer so that we should lower
-	 * the line at this point.
-	 */
-	if (vtimer->irq.level) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
-		if (!kvm_timer_should_fire(vtimer)) {
-			kvm_timer_update_irq(vcpu, false, vtimer);
-			unmask_vtimer_irq(vcpu);
-		}
-	}
+	unmask_vtimer_irq_user(vcpu);
 }
 
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -807,6 +792,19 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+bool kvm_arch_timer_get_input_level(int vintid)
+{
+	struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu();
+	struct arch_timer_context *timer;
+
+	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+		timer = vcpu_vtimer(vcpu);
+	else
+		BUG(); /* We only map the vtimer so far */
+
+	return kvm_timer_should_fire(timer);
+}
+
 int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -828,7 +826,8 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
-	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq);
+	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq,
+				    kvm_arch_timer_get_input_level);
 	if (ret)
 		return ret;
 
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index cd7d90c9f644..92b95ae9a2ca 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -71,17 +71,17 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
 
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
-	BUG_ON(preemptible());
 	__this_cpu_write(kvm_arm_running_vcpu, vcpu);
 }
 
+DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
 /**
  * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU.
  * Must be called from non-preemptible context
  */
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
 {
-	BUG_ON(preemptible());
 	return __this_cpu_read(kvm_arm_running_vcpu);
 }
 
@@ -295,6 +295,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		static_branch_dec(&userspace_irqchip_in_use);
+
 	kvm_mmu_free_memory_caches(vcpu);
 	kvm_timer_vcpu_terminate(vcpu);
 	kvm_pmu_vcpu_destroy(vcpu);
@@ -532,14 +535,22 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.has_run_once = true;
 
-	/*
-	 * Map the VGIC hardware resources before running a vcpu the first
-	 * time on this VM.
-	 */
-	if (unlikely(irqchip_in_kernel(kvm) && !vgic_ready(kvm))) {
-		ret = kvm_vgic_map_resources(kvm);
-		if (ret)
-			return ret;
+	if (likely(irqchip_in_kernel(kvm))) {
+		/*
+		 * Map the VGIC hardware resources before running a vcpu the
+		 * first time on this VM.
+		 */
+		if (unlikely(!vgic_ready(kvm))) {
+			ret = kvm_vgic_map_resources(kvm);
+			if (ret)
+				return ret;
+		}
+	} else {
+		/*
+		 * Tell the rest of the code that there are userspace irqchip
+		 * VMs in the wild.
+		 */
+		static_branch_inc(&userspace_irqchip_in_use);
 	}
 
 	ret = kvm_timer_enable(vcpu);
@@ -680,19 +691,30 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		kvm_vgic_flush_hwstate(vcpu);
 
 		/*
-		 * If we have a singal pending, or need to notify a userspace
-		 * irqchip about timer or PMU level changes, then we exit (and
-		 * update the timer level state in kvm_timer_update_run
-		 * below).
+		 * Exit if we have a signal pending so that we can deliver the
+		 * signal to user space.
 		 */
-		if (signal_pending(current) ||
-		    kvm_timer_should_notify_user(vcpu) ||
-		    kvm_pmu_should_notify_user(vcpu)) {
+		if (signal_pending(current)) {
 			ret = -EINTR;
 			run->exit_reason = KVM_EXIT_INTR;
 		}
 
 		/*
+		 * If we're using a userspace irqchip, then check if we need
+		 * to tell a userspace irqchip about timer or PMU level
+		 * changes and if so, exit to userspace (the actual level
+		 * state gets updated in kvm_timer_update_run and
+		 * kvm_pmu_update_run below).
+		 */
+		if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+			if (kvm_timer_should_notify_user(vcpu) ||
+			    kvm_pmu_should_notify_user(vcpu)) {
+				ret = -EINTR;
+				run->exit_reason = KVM_EXIT_INTR;
+			}
+		}
+
+		/*
 		 * Ensure we set mode to IN_GUEST_MODE after we disable
 		 * interrupts and before the final VCPU requests check.
 		 * See the comment in kvm_vcpu_exiting_guest_mode() and
@@ -704,7 +726,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		    kvm_request_pending(vcpu)) {
 			vcpu->mode = OUTSIDE_GUEST_MODE;
 			kvm_pmu_sync_hwstate(vcpu);
-			kvm_timer_sync_hwstate(vcpu);
+			if (static_branch_unlikely(&userspace_irqchip_in_use))
+				kvm_timer_sync_hwstate(vcpu);
 			kvm_vgic_sync_hwstate(vcpu);
 			local_irq_enable();
 			preempt_enable();
@@ -748,7 +771,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * we don't want vtimer interrupts to race with syncing the
 		 * timer virtual interrupt state.
 		 */
-		kvm_timer_sync_hwstate(vcpu);
+		if (static_branch_unlikely(&userspace_irqchip_in_use))
+			kvm_timer_sync_hwstate(vcpu);
 
 		/*
 		 * We may have taken a host interrupt in HYP mode (ie
@@ -1277,6 +1301,7 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
 		cpu_hyp_reset();
 
 		return NOTIFY_OK;
+	case CPU_PM_ENTER_FAILED:
 	case CPU_PM_EXIT:
 		if (__this_cpu_read(kvm_arm_hardware_enabled))
 			/* The hardware was enabled before suspend. */
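All of the arm.c hunks revolve around one mechanism: `userspace_irqchip_in_use` is a static key, so every check above compiles to a patched-out NOP until the first userspace-irqchip VM actually runs. A condensed, kernel-style sketch of that pattern as the diff applies it (the `example_*` wrappers are invented for illustration):

```c
#include <linux/jump_label.h>

/* Declared false: hot-path checks compile to a patched-out branch (NOP). */
DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);

/* First run of a VM without an in-kernel irqchip: enable the branch. */
static void example_first_run(bool in_kernel_irqchip)
{
	if (!in_kernel_irqchip)
		static_branch_inc(&userspace_irqchip_in_use);
}

/* Teardown: drop the reference; at zero the branch is patched out again. */
static void example_vcpu_free(bool has_run_once, bool in_kernel_irqchip)
{
	if (has_run_once && !in_kernel_irqchip)
		static_branch_dec(&userspace_irqchip_in_use);
}

/* Hot path: costs a NOP unless some userspace-irqchip VM exists. */
static bool example_need_userspace_sync(void)
{
	return static_branch_unlikely(&userspace_irqchip_in_use);
}
```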
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
index d7fd46fe9efb..4fe6e797e8b3 100644
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v2-sr.c
@@ -21,6 +21,7 @@
 
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b36945d49986..a1ea43fa75cf 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -926,6 +926,25 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 	return 0;
 }
 
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	pmdp = stage2_get_pmd(kvm, NULL, addr);
+	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
+		return false;
+
+	if (pmd_thp_or_huge(*pmdp))
+		return kvm_s2pmd_exec(pmdp);
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
+		return false;
+
+	return kvm_s2pte_exec(ptep);
+}
+
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 			  phys_addr_t addr, const pte_t *new_pte,
 			  unsigned long flags)
@@ -1257,10 +1276,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
-				      unsigned long size)
+static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
-	__coherent_cache_guest_page(vcpu, pfn, size);
+	__clean_dcache_guest_page(pfn, size);
+}
+
+static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+	__invalidate_icache_guest_page(pfn, size);
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address,
@@ -1286,7 +1309,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  unsigned long fault_status)
 {
 	int ret;
-	bool write_fault, writable, hugetlb = false, force_pte = false;
+	bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
 	unsigned long mmu_seq;
 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
 	struct kvm *kvm = vcpu->kvm;
@@ -1298,7 +1321,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	unsigned long flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
-	if (fault_status == FSC_PERM && !write_fault) {
+	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+	VM_BUG_ON(write_fault && exec_fault);
+
+	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
 		kvm_err("Unexpected L2 read permission error\n");
 		return -EFAULT;
 	}
@@ -1391,7 +1417,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
+
+		if (fault_status != FSC_PERM)
+			clean_dcache_guest_page(pfn, PMD_SIZE);
+
+		if (exec_fault) {
+			new_pmd = kvm_s2pmd_mkexec(new_pmd);
+			invalidate_icache_guest_page(pfn, PMD_SIZE);
+		} else if (fault_status == FSC_PERM) {
+			/* Preserve execute if XN was already cleared */
+			if (stage2_is_exec(kvm, fault_ipa))
+				new_pmd = kvm_s2pmd_mkexec(new_pmd);
+		}
+
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -1401,7 +1439,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_pfn_dirty(pfn);
 			mark_page_dirty(kvm, gfn);
 		}
-		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE);
+
+		if (fault_status != FSC_PERM)
+			clean_dcache_guest_page(pfn, PAGE_SIZE);
+
+		if (exec_fault) {
+			new_pte = kvm_s2pte_mkexec(new_pte);
+			invalidate_icache_guest_page(pfn, PAGE_SIZE);
+		} else if (fault_status == FSC_PERM) {
+			/* Preserve execute if XN was already cleared */
+			if (stage2_is_exec(kvm, fault_ipa))
+				new_pte = kvm_s2pte_mkexec(new_pte);
+		}
+
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 
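Taken together, the mmu.c hunks split the old `coherent_cache_guest_page()` into a D-cache clean at map time and an I-cache invalidation deferred until the guest first executes from the page, with `stage2_is_exec()` preserving a previously granted XN clear. A hypothetical, self-contained restatement of the resulting decision logic (all types and names here are invented for illustration):

```c
#include <stdbool.h>

enum fault_kind { FAULT_TRANSLATION, FAULT_PERMISSION };

struct fault_actions {
	bool clean_dcache;	/* make page data visible to instruction fetch */
	bool inval_icache;	/* drop stale instructions for the range */
	bool grant_exec;	/* clear XN in the new stage-2 entry */
};

struct fault_actions plan_cache_maintenance(enum fault_kind kind,
					    bool exec_fault, bool was_exec)
{
	struct fault_actions a = {
		/* Permission faults hit already-mapped (clean) pages. */
		.clean_dcache = (kind != FAULT_PERMISSION),
		/* Invalidate only when the guest actually fetches code. */
		.inval_icache = exec_fault,
		/* Grant exec on an exec fault, or keep it if XN was clear. */
		.grant_exec = exec_fault ||
			      (kind == FAULT_PERMISSION && was_exec),
	};
	return a;
}
```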
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 8e633bd9cc1e..465095355666 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -1034,10 +1034,8 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
 
 	device = vgic_its_alloc_device(its, device_id, itt_addr,
 				       num_eventid_bits);
-	if (IS_ERR(device))
-		return PTR_ERR(device);
 
-	return 0;
+	return PTR_ERR_OR_ZERO(device);
 }
 
 /*
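`PTR_ERR_OR_ZERO()` from `<linux/err.h>` is exactly the two-branch tail it replaces here. A userspace model of the helpers involved, built on the kernel convention that the top 4095 pointer values encode negative errnos:

```c
#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int  IS_ERR(const void *ptr)  { return IS_ERR_VALUE((unsigned long)ptr); }

/* Collapses "if (IS_ERR(p)) return PTR_ERR(p); return 0;" into one expression. */
static inline long PTR_ERR_OR_ZERO(const void *ptr)
{
	return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}

int main(void)
{
	int x = 0;
	void *ok  = &x;
	void *bad = (void *)(long)-12;	/* what ERR_PTR(-ENOMEM) returns */

	printf("%ld %ld\n", PTR_ERR_OR_ZERO(ok), PTR_ERR_OR_ZERO(bad)); /* 0 -12 */
	return 0;
}
```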
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index deb51ee16a3d..83d82bd7dc4e 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -16,6 +16,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <kvm/iodev.h>
+#include <kvm/arm_arch_timer.h>
 #include <kvm/arm_vgic.h>
 
 #include "vgic.h"
@@ -122,10 +123,43 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
 	return value;
 }
 
+/*
+ * This function will return the VCPU that performed the MMIO access and
+ * trapped from within the VM, and will return NULL if this is a userspace
+ * access.
+ *
+ * We can disable preemption locally around accessing the per-CPU variable,
+ * and use the resolved vcpu pointer after enabling preemption again, because
+ * even if the current thread is migrated to another CPU, reading the per-CPU
+ * value later will give us the same value as we update the per-CPU variable
+ * in the preempt notifier handlers.
+ */
+static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	preempt_disable();
+	vcpu = kvm_arm_get_running_vcpu();
+	preempt_enable();
+	return vcpu;
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				 bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->pending_latch = true;
+	vgic_irq_set_phys_active(irq, true);
+}
+
 void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
 {
+	bool is_uaccess = !vgic_get_mmio_requester_vcpu();
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 	unsigned long flags;
@@ -134,17 +168,45 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
 		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = true;
-
+		if (irq->hw)
+			vgic_hw_irq_spending(vcpu, irq, is_uaccess);
+		else
+			irq->pending_latch = true;
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				 bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->pending_latch = false;
+
+	/*
+	 * We don't want the guest to effectively mask the physical
+	 * interrupt by doing a write to SPENDR followed by a write to
+	 * CPENDR for HW interrupts, so we clear the active state on
+	 * the physical side if the virtual interrupt is not active.
+	 * This may lead to taking an additional interrupt on the
+	 * host, but that should not be a problem as the worst that
+	 * can happen is an additional vgic injection. We also clear
+	 * the pending state to maintain proper semantics for edge HW
+	 * interrupts.
+	 */
+	vgic_irq_set_phys_pending(irq, false);
+	if (!irq->active)
+		vgic_irq_set_phys_active(irq, false);
+}
+
 void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
 {
+	bool is_uaccess = !vgic_get_mmio_requester_vcpu();
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 	unsigned long flags;
@@ -154,7 +216,10 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 
 		spin_lock_irqsave(&irq->irq_lock, flags);
 
-		irq->pending_latch = false;
+		if (irq->hw)
+			vgic_hw_irq_cpending(vcpu, irq, is_uaccess);
+		else
+			irq->pending_latch = false;
 
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
@@ -181,27 +246,24 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 	return value;
 }
 
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				      bool active, bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->active = active;
+	vgic_irq_set_phys_active(irq, active);
+}
+
 static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				    bool new_active_state)
+				    bool active)
 {
-	struct kvm_vcpu *requester_vcpu;
 	unsigned long flags;
-	spin_lock_irqsave(&irq->irq_lock, flags);
+	struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu();
 
-	/*
-	 * The vcpu parameter here can mean multiple things depending on how
-	 * this function is called; when handling a trap from the kernel it
-	 * depends on the GIC version, and these functions are also called as
-	 * part of save/restore from userspace.
-	 *
-	 * Therefore, we have to figure out the requester in a reliable way.
-	 *
-	 * When accessing VGIC state from user space, the requester_vcpu is
-	 * NULL, which is fine, because we guarantee that no VCPUs are running
-	 * when accessing VGIC state from user space so irq->vcpu->cpu is
-	 * always -1.
-	 */
-	requester_vcpu = kvm_arm_get_running_vcpu();
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	/*
 	 * If this virtual IRQ was written into a list register, we
@@ -213,14 +275,23 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 	 * vgic_change_active_prepare) and still has to sync back this IRQ,
 	 * so we release and re-acquire the spin_lock to let the other thread
 	 * sync back the IRQ.
+	 *
+	 * When accessing VGIC state from user space, requester_vcpu is
+	 * NULL, which is fine, because we guarantee that no VCPUs are running
+	 * when accessing VGIC state from user space so irq->vcpu->cpu is
+	 * always -1.
 	 */
 	while (irq->vcpu && /* IRQ may have state in an LR somewhere */
 	       irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */
 	       irq->vcpu->cpu != -1) /* VCPU thread is running */
 		cond_resched_lock(&irq->irq_lock);
 
-	irq->active = new_active_state;
-	if (new_active_state)
+	if (irq->hw)
+		vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
+	else
+		irq->active = active;
+
+	if (irq->active)
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 	else
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
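The three MMIO paths above share one rule: a trap from the guest forwards the new state to the physical GIC, while a userspace save/restore access (requester VCPU == NULL) must leave the physical side alone, because the interrupt still belongs to the live device. A toy model of that split for the ISPENDR case (invented types stand in for `struct vgic_irq` and the irqchip calls; locking and queueing are omitted):

```c
#include <stdbool.h>
#include <stddef.h>

struct vcpu;			/* opaque stand-in for struct kvm_vcpu */

struct irqstate {
	bool hw;		/* mapped to a physical interrupt */
	bool pending_latch;	/* virtual pending state */
	bool phys_active;	/* stand-in for irq_set_irqchip_state() */
};

void write_spending(struct irqstate *irq, struct vcpu *requester)
{
	bool is_uaccess = (requester == NULL);

	if (!irq->hw) {
		irq->pending_latch = true;	/* purely virtual IRQ */
		return;
	}
	if (is_uaccess)
		return;				/* don't fight the live hardware */

	irq->pending_latch = true;
	irq->phys_active = true;	/* vgic_irq_set_phys_active(irq, true) */
}
```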
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 80897102da26..c32d7b93ffd1 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -105,6 +105,26 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 			irq->pending_latch = false;
 		}
 
+		/*
+		 * Level-triggered mapped IRQs are special because we only
+		 * observe rising edges as input to the VGIC.
+		 *
+		 * If the guest never acked the interrupt we have to sample
+		 * the physical line and set the line level, because the
+		 * device state could have changed or we simply need to
+		 * process the still pending interrupt later.
+		 *
+		 * If this causes us to lower the level, we have to also clear
+		 * the physical active state, since we will otherwise never be
+		 * told when the interrupt becomes asserted again.
+		 */
+		if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
+			irq->line_level = vgic_get_phys_line_level(irq);
+
+			if (!irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -162,6 +182,15 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 			val |= GICH_LR_EOI;
 	}
 
+	/*
+	 * Level-triggered mapped IRQs are special because we only observe
+	 * rising edges as input to the VGIC. We therefore lower the line
+	 * level here, so that we can take new virtual IRQs. See
+	 * vgic_v2_fold_lr_state for more info.
+	 */
+	if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
+		irq->line_level = false;
+
 	/* The GICv2 LR only holds five bits of priority. */
 	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
 
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index f47e8481fa45..6b329414e57a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -96,6 +96,26 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 			irq->pending_latch = false;
 		}
 
+		/*
+		 * Level-triggered mapped IRQs are special because we only
+		 * observe rising edges as input to the VGIC.
+		 *
+		 * If the guest never acked the interrupt we have to sample
+		 * the physical line and set the line level, because the
+		 * device state could have changed or we simply need to
+		 * process the still pending interrupt later.
+		 *
+		 * If this causes us to lower the level, we have to also clear
+		 * the physical active state, since we will otherwise never be
+		 * told when the interrupt becomes asserted again.
+		 */
+		if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
+			irq->line_level = vgic_get_phys_line_level(irq);
+
+			if (!irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -146,6 +166,15 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 	}
 
 	/*
+	 * Level-triggered mapped IRQs are special because we only observe
+	 * rising edges as input to the VGIC. We therefore lower the line
+	 * level here, so that we can take new virtual IRQs. See
+	 * vgic_v3_fold_lr_state for more info.
+	 */
+	if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
+		irq->line_level = false;
+
+	/*
 	 * We currently only support Group1 interrupts, which is a
 	 * known defect. This needs to be addressed at some point.
 	 */
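The identical GICv2/GICv3 hunks implement a small state machine for level-triggered mapped IRQs: populate consumes the rising edge the VGIC observed, and fold resamples the physical line if the guest never acked the interrupt. A self-contained model of those two steps (names invented; `sample_phys_line()` stands in for `vgic_get_phys_line_level()`):

```c
#include <stdbool.h>

struct mapped_irq {
	bool line_level;	/* VGIC's view of the input line */
	bool phys_active;	/* active state on the physical distributor */
};

/* Stand-in for sampling the physical line or asking the device. */
bool sample_phys_line(void) { return false; /* e.g. device went idle */ }

/*
 * Populate step: injecting a pending LR "consumes" the observed rising
 * edge, so lower the software line level to accept a new edge later.
 */
void populate_lr(struct mapped_irq *irq, bool lr_pending)
{
	if (lr_pending)
		irq->line_level = false;
}

/*
 * Fold step: if the LR still shows the interrupt pending (guest never
 * acked it), resample the line; if it dropped, also clear the physical
 * active state so the next assertion actually reaches the host.
 */
void fold_lr(struct mapped_irq *irq, bool lr_pending)
{
	if (lr_pending) {
		irq->line_level = sample_phys_line();
		if (!irq->line_level)
			irq->phys_active = false;
	}
}
```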
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index ecb8e25f5fe5..c7c5ef190afa 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -144,6 +144,38 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 	kfree(irq);
 }
 
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
+{
+	WARN_ON(irq_set_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_PENDING,
+				      pending));
+}
+
+bool vgic_get_phys_line_level(struct vgic_irq *irq)
+{
+	bool line_level;
+
+	BUG_ON(!irq->hw);
+
+	if (irq->get_input_level)
+		return irq->get_input_level(irq->intid);
+
+	WARN_ON(irq_get_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_PENDING,
+				      &line_level));
+	return line_level;
+}
+
+/* Set/Clear the physical active state */
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
+{
+
+	BUG_ON(!irq->hw);
+	WARN_ON(irq_set_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_ACTIVE,
+				      active));
+}
+
 /**
  * kvm_vgic_target_oracle - compute the target vcpu for an irq
  *
@@ -413,7 +445,8 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 
 /* @irq->irq_lock must be held */
 static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-			    unsigned int host_irq)
+			    unsigned int host_irq,
+			    bool (*get_input_level)(int vindid))
 {
 	struct irq_desc *desc;
 	struct irq_data *data;
@@ -433,6 +466,7 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 	irq->hw = true;
 	irq->host_irq = host_irq;
 	irq->hwintid = data->hwirq;
+	irq->get_input_level = get_input_level;
 	return 0;
 }
 
@@ -441,10 +475,11 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
 {
 	irq->hw = false;
 	irq->hwintid = 0;
+	irq->get_input_level = NULL;
 }
 
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid)
+			  u32 vintid, bool (*get_input_level)(int vindid))
 {
 	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
 	unsigned long flags;
@@ -453,7 +488,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
 	BUG_ON(!irq);
 
 	spin_lock_irqsave(&irq->irq_lock, flags);
-	ret = kvm_vgic_map_irq(vcpu, irq, host_irq);
+	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
 	spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 
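For callers of the VGIC, the net effect of these vgic.c changes is one extra argument: a device that knows its own input-line state passes a callback when mapping the IRQ, and `vgic_get_phys_line_level()` will prefer it over querying the host irqchip's pending state. A sketch of the wiring, modeled on the timer code earlier in this merge (`example_map_vtimer()` itself is invented):

```c
#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>	/* declares kvm_arch_timer_get_input_level() */

static int example_map_vtimer(struct kvm_vcpu *vcpu, unsigned int host_irq,
			      u32 vintid)
{
	/*
	 * The callback is stored on the struct vgic_irq and consulted by
	 * vgic_get_phys_line_level(); passing NULL falls back to reading
	 * IRQCHIP_STATE_PENDING from the host irqchip instead.
	 */
	return kvm_vgic_map_phys_irq(vcpu, host_irq, vintid,
				     kvm_arch_timer_get_input_level);
}
```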
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index efbcf8f96f9c..12c37b89f7a3 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -104,6 +104,11 @@ static inline bool irq_is_pending(struct vgic_irq *irq)
 	return irq->pending_latch || irq->line_level;
 }
 
+static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
+{
+	return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
+}
+
 /*
  * This struct provides an intermediate representation of the fields contained
  * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
@@ -140,6 +145,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 			   unsigned long flags);
 void vgic_kick_vcpus(struct kvm *kvm);