Diffstat (limited to 'virt/kvm')
-rw-r--r--  virt/kvm/Kconfig               13
-rw-r--r--  virt/kvm/arm/arch_timer.c      78
-rw-r--r--  virt/kvm/arm/vgic-v2-emul.c   856
-rw-r--r--  virt/kvm/arm/vgic-v2.c        259
-rw-r--r--  virt/kvm/arm/vgic-v3-emul.c  1030
-rw-r--r--  virt/kvm/arm/vgic-v3.c        287
-rw-r--r--  virt/kvm/arm/vgic.c          2216
-rw-r--r--  virt/kvm/arm/vgic.h           140
-rw-r--r--  virt/kvm/assigned-dev.c      1024
-rw-r--r--  virt/kvm/async_pf.c             4
-rw-r--r--  virt/kvm/coalesced_mmio.c       7
-rw-r--r--  virt/kvm/eventfd.c            138
-rw-r--r--  virt/kvm/ioapic.c             646
-rw-r--r--  virt/kvm/ioapic.h             102
-rw-r--r--  virt/kvm/iodev.h               70
-rw-r--r--  virt/kvm/iommu.c              359
-rw-r--r--  virt/kvm/irq_comm.c           373
-rw-r--r--  virt/kvm/irqchip.c            100
-rw-r--r--  virt/kvm/kvm_main.c           659
-rw-r--r--  virt/kvm/vfio.c                27
-rw-r--r--  virt/kvm/vfio.h                17
21 files changed, 4409 insertions, 3996 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 13f2d19793e3..e2c876d5a03b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -6,6 +6,9 @@ config HAVE_KVM
6config HAVE_KVM_IRQCHIP 6config HAVE_KVM_IRQCHIP
7 bool 7 bool
8 8
9config HAVE_KVM_IRQFD
10 bool
11
9config HAVE_KVM_IRQ_ROUTING 12config HAVE_KVM_IRQ_ROUTING
10 bool 13 bool
11 14
@@ -34,3 +37,13 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT
34 37
35config KVM_VFIO 38config KVM_VFIO
36 bool 39 bool
40
41config HAVE_KVM_ARCH_TLB_FLUSH_ALL
42 bool
43
44config KVM_GENERIC_DIRTYLOG_READ_PROTECT
45 bool
46
47config KVM_COMPAT
48 def_bool y
49 depends on COMPAT && !S390
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 22fa819a9b6a..98c95f2fcba4 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -61,12 +61,14 @@ static void timer_disarm(struct arch_timer_cpu *timer)
61 61
62static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) 62static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
63{ 63{
64 int ret;
64 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 65 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
65 66
66 timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; 67 timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
67 kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, 68 ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
68 timer->irq->irq, 69 timer->irq->irq,
69 timer->irq->level); 70 timer->irq->level);
71 WARN_ON(ret);
70} 72}
71 73
72static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 74static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
@@ -83,13 +85,22 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
83 return IRQ_HANDLED; 85 return IRQ_HANDLED;
84} 86}
85 87
88/*
89 * Work function for handling the backup timer that we schedule when a vcpu is
90 * no longer running, but had a timer programmed to fire in the future.
91 */
86static void kvm_timer_inject_irq_work(struct work_struct *work) 92static void kvm_timer_inject_irq_work(struct work_struct *work)
87{ 93{
88 struct kvm_vcpu *vcpu; 94 struct kvm_vcpu *vcpu;
89 95
90 vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); 96 vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
91 vcpu->arch.timer_cpu.armed = false; 97 vcpu->arch.timer_cpu.armed = false;
92 kvm_timer_inject_irq(vcpu); 98
99 /*
100 * If the vcpu is blocked we want to wake it up so that it will see
101 * the timer has expired when entering the guest.
102 */
103 kvm_vcpu_kick(vcpu);
93} 104}
94 105
95static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) 106static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
@@ -100,6 +111,21 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
100 return HRTIMER_NORESTART; 111 return HRTIMER_NORESTART;
101} 112}
102 113
114bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
115{
116 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
117 cycle_t cval, now;
118
119 if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
120 !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
121 return false;
122
123 cval = timer->cntv_cval;
124 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
125
126 return cval <= now;
127}
128
103/** 129/**
104 * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu 130 * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
105 * @vcpu: The vcpu pointer 131 * @vcpu: The vcpu pointer
@@ -117,6 +143,13 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
117 * populate the CPU timer again. 143 * populate the CPU timer again.
118 */ 144 */
119 timer_disarm(timer); 145 timer_disarm(timer);
146
147 /*
148 * If the timer expired while we were not scheduled, now is the time
149 * to inject it.
150 */
151 if (kvm_timer_should_fire(vcpu))
152 kvm_timer_inject_irq(vcpu);
120} 153}
121 154
122/** 155/**
@@ -132,16 +165,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
132 cycle_t cval, now; 165 cycle_t cval, now;
133 u64 ns; 166 u64 ns;
134 167
135 if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
136 !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
137 return;
138
139 cval = timer->cntv_cval;
140 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
141
142 BUG_ON(timer_is_armed(timer)); 168 BUG_ON(timer_is_armed(timer));
143 169
144 if (cval <= now) { 170 if (kvm_timer_should_fire(vcpu)) {
145 /* 171 /*
146 * Timer has already expired while we were not 172 * Timer has already expired while we were not
147 * looking. Inject the interrupt and carry on. 173 * looking. Inject the interrupt and carry on.
@@ -150,7 +176,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
150 return; 176 return;
151 } 177 }
152 178
153 ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); 179 cval = timer->cntv_cval;
180 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
181
182 ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
183 &timecounter->frac);
154 timer_arm(timer, ns); 184 timer_arm(timer, ns);
155} 185}
156 186
@@ -307,12 +337,24 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
307 timer_disarm(timer); 337 timer_disarm(timer);
308} 338}
309 339
310int kvm_timer_init(struct kvm *kvm) 340void kvm_timer_enable(struct kvm *kvm)
311{ 341{
312 if (timecounter && wqueue) { 342 if (kvm->arch.timer.enabled)
313 kvm->arch.timer.cntvoff = kvm_phys_timer_read(); 343 return;
344
345 /*
346 * There is a potential race here between VCPUs starting for the first
347 * time, which may be enabling the timer multiple times. That doesn't
348 * hurt though, because we're just setting a variable to the same
 349 * value that it already had. The important thing is that all
350 * VCPUs have the enabled variable set, before entering the guest, if
351 * the arch timers are enabled.
352 */
353 if (timecounter && wqueue)
314 kvm->arch.timer.enabled = 1; 354 kvm->arch.timer.enabled = 1;
315 } 355}
316 356
317 return 0; 357void kvm_timer_init(struct kvm *kvm)
358{
359 kvm->arch.timer.cntvoff = kvm_phys_timer_read();
318} 360}
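
The kvm_timer_should_fire() helper added above boils down to a predicate on three pieces of state: the timer output must not be masked, the timer must be enabled, and the compare value must have been reached in guest virtual time (physical counter minus the per-VM offset). A minimal standalone model of that check, using plain integers instead of the kernel's struct arch_timer_cpu and cycle_t types; the names and constants below mirror the kernel definitions but are assumptions of this sketch, not kernel API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Control bits modelled after ARCH_TIMER_CTRL_ENABLE and ARCH_TIMER_CTRL_IT_MASK. */
#define TIMER_CTRL_ENABLE	(1u << 0)
#define TIMER_CTRL_IT_MASK	(1u << 1)

/* Standalone version of the condition checked by kvm_timer_should_fire(). */
static bool timer_should_fire(uint32_t cntv_ctl, uint64_t cntv_cval,
			      uint64_t phys_count, uint64_t cntvoff)
{
	if ((cntv_ctl & TIMER_CTRL_IT_MASK) ||
	    !(cntv_ctl & TIMER_CTRL_ENABLE))
		return false;

	/* Guest virtual time is the physical counter minus the VM's offset. */
	return cntv_cval <= (phys_count - cntvoff);
}

int main(void)
{
	/* Enabled, not masked, compare value already in the past: fires. */
	printf("%d\n", timer_should_fire(TIMER_CTRL_ENABLE, 100, 250, 100));
	/* Output masked: does not fire even though the deadline has passed. */
	printf("%d\n", timer_should_fire(TIMER_CTRL_ENABLE | TIMER_CTRL_IT_MASK,
					 100, 250, 100));
	return 0;
}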
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
new file mode 100644
index 000000000000..13907970d11c
--- /dev/null
+++ b/virt/kvm/arm/vgic-v2-emul.c
@@ -0,0 +1,856 @@
1/*
2 * Contains GICv2 specific emulation code, was in vgic.c before.
3 *
4 * Copyright (C) 2012 ARM Ltd.
5 * Author: Marc Zyngier <marc.zyngier@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include <linux/cpu.h>
21#include <linux/kvm.h>
22#include <linux/kvm_host.h>
23#include <linux/interrupt.h>
24#include <linux/io.h>
25#include <linux/uaccess.h>
26
27#include <linux/irqchip/arm-gic.h>
28
29#include <asm/kvm_emulate.h>
30#include <asm/kvm_arm.h>
31#include <asm/kvm_mmu.h>
32
33#include "vgic.h"
34
35#define GICC_ARCH_VERSION_V2 0x2
36
37static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
38static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
39{
40 return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
41}
42
43static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
44 struct kvm_exit_mmio *mmio, phys_addr_t offset)
45{
46 u32 reg;
47 u32 word_offset = offset & 3;
48
49 switch (offset & ~3) {
50 case 0: /* GICD_CTLR */
51 reg = vcpu->kvm->arch.vgic.enabled;
52 vgic_reg_access(mmio, &reg, word_offset,
53 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
54 if (mmio->is_write) {
55 vcpu->kvm->arch.vgic.enabled = reg & 1;
56 vgic_update_state(vcpu->kvm);
57 return true;
58 }
59 break;
60
61 case 4: /* GICD_TYPER */
62 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
63 reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
64 vgic_reg_access(mmio, &reg, word_offset,
65 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
66 break;
67
68 case 8: /* GICD_IIDR */
69 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
70 vgic_reg_access(mmio, &reg, word_offset,
71 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
72 break;
73 }
74
75 return false;
76}
77
78static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
79 struct kvm_exit_mmio *mmio,
80 phys_addr_t offset)
81{
82 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
83 vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
84}
85
86static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
87 struct kvm_exit_mmio *mmio,
88 phys_addr_t offset)
89{
90 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
91 vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
92}
93
94static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
95 struct kvm_exit_mmio *mmio,
96 phys_addr_t offset)
97{
98 return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
99 vcpu->vcpu_id);
100}
101
102static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
103 struct kvm_exit_mmio *mmio,
104 phys_addr_t offset)
105{
106 return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
107 vcpu->vcpu_id);
108}
109
110static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
111 struct kvm_exit_mmio *mmio,
112 phys_addr_t offset)
113{
114 return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
115 vcpu->vcpu_id);
116}
117
118static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
119 struct kvm_exit_mmio *mmio,
120 phys_addr_t offset)
121{
122 return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
123 vcpu->vcpu_id);
124}
125
126static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
127 struct kvm_exit_mmio *mmio,
128 phys_addr_t offset)
129{
130 u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
131 vcpu->vcpu_id, offset);
132 vgic_reg_access(mmio, reg, offset,
133 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
134 return false;
135}
136
137#define GICD_ITARGETSR_SIZE 32
138#define GICD_CPUTARGETS_BITS 8
139#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
140static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
141{
142 struct vgic_dist *dist = &kvm->arch.vgic;
143 int i;
144 u32 val = 0;
145
146 irq -= VGIC_NR_PRIVATE_IRQS;
147
148 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
149 val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
150
151 return val;
152}
153
154static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
155{
156 struct vgic_dist *dist = &kvm->arch.vgic;
157 struct kvm_vcpu *vcpu;
158 int i, c;
159 unsigned long *bmap;
160 u32 target;
161
162 irq -= VGIC_NR_PRIVATE_IRQS;
163
164 /*
165 * Pick the LSB in each byte. This ensures we target exactly
166 * one vcpu per IRQ. If the byte is null, assume we target
167 * CPU0.
168 */
169 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
170 int shift = i * GICD_CPUTARGETS_BITS;
171
172 target = ffs((val >> shift) & 0xffU);
173 target = target ? (target - 1) : 0;
174 dist->irq_spi_cpu[irq + i] = target;
175 kvm_for_each_vcpu(c, vcpu, kvm) {
176 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
177 if (c == target)
178 set_bit(irq + i, bmap);
179 else
180 clear_bit(irq + i, bmap);
181 }
182 }
183}
184
185static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
186 struct kvm_exit_mmio *mmio,
187 phys_addr_t offset)
188{
189 u32 reg;
190
191 /* We treat the banked interrupts targets as read-only */
192 if (offset < 32) {
193 u32 roreg;
194
195 roreg = 1 << vcpu->vcpu_id;
196 roreg |= roreg << 8;
197 roreg |= roreg << 16;
198
199 vgic_reg_access(mmio, &roreg, offset,
200 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
201 return false;
202 }
203
204 reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
205 vgic_reg_access(mmio, &reg, offset,
206 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
207 if (mmio->is_write) {
208 vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
209 vgic_update_state(vcpu->kvm);
210 return true;
211 }
212
213 return false;
214}
215
216static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
217 struct kvm_exit_mmio *mmio, phys_addr_t offset)
218{
219 u32 *reg;
220
221 reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
222 vcpu->vcpu_id, offset >> 1);
223
224 return vgic_handle_cfg_reg(reg, mmio, offset);
225}
226
227static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
228 struct kvm_exit_mmio *mmio, phys_addr_t offset)
229{
230 u32 reg;
231
232 vgic_reg_access(mmio, &reg, offset,
233 ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
234 if (mmio->is_write) {
235 vgic_dispatch_sgi(vcpu, reg);
236 vgic_update_state(vcpu->kvm);
237 return true;
238 }
239
240 return false;
241}
242
243/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
244static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
245 struct kvm_exit_mmio *mmio,
246 phys_addr_t offset)
247{
248 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
249 int sgi;
250 int min_sgi = (offset & ~0x3);
251 int max_sgi = min_sgi + 3;
252 int vcpu_id = vcpu->vcpu_id;
253 u32 reg = 0;
254
255 /* Copy source SGIs from distributor side */
256 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
257 u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
258
259 reg |= ((u32)sources) << (8 * (sgi - min_sgi));
260 }
261
262 mmio_data_write(mmio, ~0, reg);
263 return false;
264}
265
266static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
267 struct kvm_exit_mmio *mmio,
268 phys_addr_t offset, bool set)
269{
270 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
271 int sgi;
272 int min_sgi = (offset & ~0x3);
273 int max_sgi = min_sgi + 3;
274 int vcpu_id = vcpu->vcpu_id;
275 u32 reg;
276 bool updated = false;
277
278 reg = mmio_data_read(mmio, ~0);
279
280 /* Clear pending SGIs on the distributor */
281 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
282 u8 mask = reg >> (8 * (sgi - min_sgi));
283 u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
284
285 if (set) {
286 if ((*src & mask) != mask)
287 updated = true;
288 *src |= mask;
289 } else {
290 if (*src & mask)
291 updated = true;
292 *src &= ~mask;
293 }
294 }
295
296 if (updated)
297 vgic_update_state(vcpu->kvm);
298
299 return updated;
300}
301
302static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
303 struct kvm_exit_mmio *mmio,
304 phys_addr_t offset)
305{
306 if (!mmio->is_write)
307 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
308 else
309 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
310}
311
312static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
313 struct kvm_exit_mmio *mmio,
314 phys_addr_t offset)
315{
316 if (!mmio->is_write)
317 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
318 else
319 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
320}
321
322static const struct vgic_io_range vgic_dist_ranges[] = {
323 {
324 .base = GIC_DIST_CTRL,
325 .len = 12,
326 .bits_per_irq = 0,
327 .handle_mmio = handle_mmio_misc,
328 },
329 {
330 .base = GIC_DIST_IGROUP,
331 .len = VGIC_MAX_IRQS / 8,
332 .bits_per_irq = 1,
333 .handle_mmio = handle_mmio_raz_wi,
334 },
335 {
336 .base = GIC_DIST_ENABLE_SET,
337 .len = VGIC_MAX_IRQS / 8,
338 .bits_per_irq = 1,
339 .handle_mmio = handle_mmio_set_enable_reg,
340 },
341 {
342 .base = GIC_DIST_ENABLE_CLEAR,
343 .len = VGIC_MAX_IRQS / 8,
344 .bits_per_irq = 1,
345 .handle_mmio = handle_mmio_clear_enable_reg,
346 },
347 {
348 .base = GIC_DIST_PENDING_SET,
349 .len = VGIC_MAX_IRQS / 8,
350 .bits_per_irq = 1,
351 .handle_mmio = handle_mmio_set_pending_reg,
352 },
353 {
354 .base = GIC_DIST_PENDING_CLEAR,
355 .len = VGIC_MAX_IRQS / 8,
356 .bits_per_irq = 1,
357 .handle_mmio = handle_mmio_clear_pending_reg,
358 },
359 {
360 .base = GIC_DIST_ACTIVE_SET,
361 .len = VGIC_MAX_IRQS / 8,
362 .bits_per_irq = 1,
363 .handle_mmio = handle_mmio_set_active_reg,
364 },
365 {
366 .base = GIC_DIST_ACTIVE_CLEAR,
367 .len = VGIC_MAX_IRQS / 8,
368 .bits_per_irq = 1,
369 .handle_mmio = handle_mmio_clear_active_reg,
370 },
371 {
372 .base = GIC_DIST_PRI,
373 .len = VGIC_MAX_IRQS,
374 .bits_per_irq = 8,
375 .handle_mmio = handle_mmio_priority_reg,
376 },
377 {
378 .base = GIC_DIST_TARGET,
379 .len = VGIC_MAX_IRQS,
380 .bits_per_irq = 8,
381 .handle_mmio = handle_mmio_target_reg,
382 },
383 {
384 .base = GIC_DIST_CONFIG,
385 .len = VGIC_MAX_IRQS / 4,
386 .bits_per_irq = 2,
387 .handle_mmio = handle_mmio_cfg_reg,
388 },
389 {
390 .base = GIC_DIST_SOFTINT,
391 .len = 4,
392 .handle_mmio = handle_mmio_sgi_reg,
393 },
394 {
395 .base = GIC_DIST_SGI_PENDING_CLEAR,
396 .len = VGIC_NR_SGIS,
397 .handle_mmio = handle_mmio_sgi_clear,
398 },
399 {
400 .base = GIC_DIST_SGI_PENDING_SET,
401 .len = VGIC_NR_SGIS,
402 .handle_mmio = handle_mmio_sgi_set,
403 },
404 {}
405};
406
407static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
408{
409 struct kvm *kvm = vcpu->kvm;
410 struct vgic_dist *dist = &kvm->arch.vgic;
411 int nrcpus = atomic_read(&kvm->online_vcpus);
412 u8 target_cpus;
413 int sgi, mode, c, vcpu_id;
414
415 vcpu_id = vcpu->vcpu_id;
416
417 sgi = reg & 0xf;
418 target_cpus = (reg >> 16) & 0xff;
419 mode = (reg >> 24) & 3;
420
421 switch (mode) {
422 case 0:
423 if (!target_cpus)
424 return;
425 break;
426
427 case 1:
428 target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
429 break;
430
431 case 2:
432 target_cpus = 1 << vcpu_id;
433 break;
434 }
435
436 kvm_for_each_vcpu(c, vcpu, kvm) {
437 if (target_cpus & 1) {
438 /* Flag the SGI as pending */
439 vgic_dist_irq_set_pending(vcpu, sgi);
440 *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
441 kvm_debug("SGI%d from CPU%d to CPU%d\n",
442 sgi, vcpu_id, c);
443 }
444
445 target_cpus >>= 1;
446 }
447}
448
449static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
450{
451 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
452 unsigned long sources;
453 int vcpu_id = vcpu->vcpu_id;
454 int c;
455
456 sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
457
458 for_each_set_bit(c, &sources, dist->nr_cpus) {
459 if (vgic_queue_irq(vcpu, c, irq))
460 clear_bit(c, &sources);
461 }
462
463 *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
464
465 /*
466 * If the sources bitmap has been cleared it means that we
467 * could queue all the SGIs onto link registers (see the
468 * clear_bit above), and therefore we are done with them in
469 * our emulated gic and can get rid of them.
470 */
471 if (!sources) {
472 vgic_dist_irq_clear_pending(vcpu, irq);
473 vgic_cpu_irq_clear(vcpu, irq);
474 return true;
475 }
476
477 return false;
478}
479
480/**
 481 * vgic_v2_map_resources - Configure global VGIC state before running any VCPUs
482 * @kvm: pointer to the kvm struct
483 *
484 * Map the virtual CPU interface into the VM before running any VCPUs. We
485 * can't do this at creation time, because user space must first set the
486 * virtual CPU interface address in the guest physical address space.
487 */
488static int vgic_v2_map_resources(struct kvm *kvm,
489 const struct vgic_params *params)
490{
491 struct vgic_dist *dist = &kvm->arch.vgic;
492 int ret = 0;
493
494 if (!irqchip_in_kernel(kvm))
495 return 0;
496
497 mutex_lock(&kvm->lock);
498
499 if (vgic_ready(kvm))
500 goto out;
501
502 if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
503 IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
504 kvm_err("Need to set vgic cpu and dist addresses first\n");
505 ret = -ENXIO;
506 goto out;
507 }
508
509 vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
510 KVM_VGIC_V2_DIST_SIZE,
511 vgic_dist_ranges, -1, &dist->dist_iodev);
512
513 /*
514 * Initialize the vgic if this hasn't already been done on demand by
515 * accessing the vgic state from userspace.
516 */
517 ret = vgic_init(kvm);
518 if (ret) {
519 kvm_err("Unable to allocate maps\n");
520 goto out_unregister;
521 }
522
523 ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
524 params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
525 true);
526 if (ret) {
527 kvm_err("Unable to remap VGIC CPU to VCPU\n");
528 goto out_unregister;
529 }
530
531 dist->ready = true;
532 goto out;
533
534out_unregister:
535 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
536
537out:
538 if (ret)
539 kvm_vgic_destroy(kvm);
540 mutex_unlock(&kvm->lock);
541 return ret;
542}
543
544static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
545{
546 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
547
548 *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
549}
550
551static int vgic_v2_init_model(struct kvm *kvm)
552{
553 int i;
554
555 for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
556 vgic_set_target_reg(kvm, 0, i);
557
558 return 0;
559}
560
561void vgic_v2_init_emulation(struct kvm *kvm)
562{
563 struct vgic_dist *dist = &kvm->arch.vgic;
564
565 dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
566 dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
567 dist->vm_ops.init_model = vgic_v2_init_model;
568 dist->vm_ops.map_resources = vgic_v2_map_resources;
569
570 kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
571}
572
573static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
574 struct kvm_exit_mmio *mmio, phys_addr_t offset)
575{
576 bool updated = false;
577 struct vgic_vmcr vmcr;
578 u32 *vmcr_field;
579 u32 reg;
580
581 vgic_get_vmcr(vcpu, &vmcr);
582
583 switch (offset & ~0x3) {
584 case GIC_CPU_CTRL:
585 vmcr_field = &vmcr.ctlr;
586 break;
587 case GIC_CPU_PRIMASK:
588 vmcr_field = &vmcr.pmr;
589 break;
590 case GIC_CPU_BINPOINT:
591 vmcr_field = &vmcr.bpr;
592 break;
593 case GIC_CPU_ALIAS_BINPOINT:
594 vmcr_field = &vmcr.abpr;
595 break;
596 default:
597 BUG();
598 }
599
600 if (!mmio->is_write) {
601 reg = *vmcr_field;
602 mmio_data_write(mmio, ~0, reg);
603 } else {
604 reg = mmio_data_read(mmio, ~0);
605 if (reg != *vmcr_field) {
606 *vmcr_field = reg;
607 vgic_set_vmcr(vcpu, &vmcr);
608 updated = true;
609 }
610 }
611 return updated;
612}
613
614static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
615 struct kvm_exit_mmio *mmio, phys_addr_t offset)
616{
617 return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
618}
619
620static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
621 struct kvm_exit_mmio *mmio,
622 phys_addr_t offset)
623{
624 u32 reg;
625
626 if (mmio->is_write)
627 return false;
628
629 /* GICC_IIDR */
630 reg = (PRODUCT_ID_KVM << 20) |
631 (GICC_ARCH_VERSION_V2 << 16) |
632 (IMPLEMENTER_ARM << 0);
633 mmio_data_write(mmio, ~0, reg);
634 return false;
635}
636
637/*
638 * CPU Interface Register accesses - these are not accessed by the VM, but by
639 * user space for saving and restoring VGIC state.
640 */
641static const struct vgic_io_range vgic_cpu_ranges[] = {
642 {
643 .base = GIC_CPU_CTRL,
644 .len = 12,
645 .handle_mmio = handle_cpu_mmio_misc,
646 },
647 {
648 .base = GIC_CPU_ALIAS_BINPOINT,
649 .len = 4,
650 .handle_mmio = handle_mmio_abpr,
651 },
652 {
653 .base = GIC_CPU_ACTIVEPRIO,
654 .len = 16,
655 .handle_mmio = handle_mmio_raz_wi,
656 },
657 {
658 .base = GIC_CPU_IDENT,
659 .len = 4,
660 .handle_mmio = handle_cpu_mmio_ident,
661 },
662};
663
664static int vgic_attr_regs_access(struct kvm_device *dev,
665 struct kvm_device_attr *attr,
666 u32 *reg, bool is_write)
667{
668 const struct vgic_io_range *r = NULL, *ranges;
669 phys_addr_t offset;
670 int ret, cpuid, c;
671 struct kvm_vcpu *vcpu, *tmp_vcpu;
672 struct vgic_dist *vgic;
673 struct kvm_exit_mmio mmio;
674 u32 data;
675
676 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
677 cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
678 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
679
680 mutex_lock(&dev->kvm->lock);
681
682 ret = vgic_init(dev->kvm);
683 if (ret)
684 goto out;
685
686 if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
687 ret = -EINVAL;
688 goto out;
689 }
690
691 vcpu = kvm_get_vcpu(dev->kvm, cpuid);
692 vgic = &dev->kvm->arch.vgic;
693
694 mmio.len = 4;
695 mmio.is_write = is_write;
696 mmio.data = &data;
697 if (is_write)
698 mmio_data_write(&mmio, ~0, *reg);
699 switch (attr->group) {
700 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
701 mmio.phys_addr = vgic->vgic_dist_base + offset;
702 ranges = vgic_dist_ranges;
703 break;
704 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
705 mmio.phys_addr = vgic->vgic_cpu_base + offset;
706 ranges = vgic_cpu_ranges;
707 break;
708 default:
709 BUG();
710 }
711 r = vgic_find_range(ranges, 4, offset);
712
713 if (unlikely(!r || !r->handle_mmio)) {
714 ret = -ENXIO;
715 goto out;
716 }
717
718
719 spin_lock(&vgic->lock);
720
721 /*
722 * Ensure that no other VCPU is running by checking the vcpu->cpu
 723 * field. If no other VCPUs are running we can safely access the VGIC
 724 * state, because even if another VCPU is run after this point, that
725 * VCPU will not touch the vgic state, because it will block on
726 * getting the vgic->lock in kvm_vgic_sync_hwstate().
727 */
728 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
729 if (unlikely(tmp_vcpu->cpu != -1)) {
730 ret = -EBUSY;
731 goto out_vgic_unlock;
732 }
733 }
734
735 /*
736 * Move all pending IRQs from the LRs on all VCPUs so the pending
737 * state can be properly represented in the register state accessible
738 * through this API.
739 */
740 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
741 vgic_unqueue_irqs(tmp_vcpu);
742
743 offset -= r->base;
744 r->handle_mmio(vcpu, &mmio, offset);
745
746 if (!is_write)
747 *reg = mmio_data_read(&mmio, ~0);
748
749 ret = 0;
750out_vgic_unlock:
751 spin_unlock(&vgic->lock);
752out:
753 mutex_unlock(&dev->kvm->lock);
754 return ret;
755}
756
757static int vgic_v2_create(struct kvm_device *dev, u32 type)
758{
759 return kvm_vgic_create(dev->kvm, type);
760}
761
762static void vgic_v2_destroy(struct kvm_device *dev)
763{
764 kfree(dev);
765}
766
767static int vgic_v2_set_attr(struct kvm_device *dev,
768 struct kvm_device_attr *attr)
769{
770 int ret;
771
772 ret = vgic_set_common_attr(dev, attr);
773 if (ret != -ENXIO)
774 return ret;
775
776 switch (attr->group) {
777 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
778 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
779 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
780 u32 reg;
781
782 if (get_user(reg, uaddr))
783 return -EFAULT;
784
785 return vgic_attr_regs_access(dev, attr, &reg, true);
786 }
787
788 }
789
790 return -ENXIO;
791}
792
793static int vgic_v2_get_attr(struct kvm_device *dev,
794 struct kvm_device_attr *attr)
795{
796 int ret;
797
798 ret = vgic_get_common_attr(dev, attr);
799 if (ret != -ENXIO)
800 return ret;
801
802 switch (attr->group) {
803 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
804 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
805 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
806 u32 reg = 0;
807
808 ret = vgic_attr_regs_access(dev, attr, &reg, false);
809 if (ret)
810 return ret;
811 return put_user(reg, uaddr);
812 }
813
814 }
815
816 return -ENXIO;
817}
818
819static int vgic_v2_has_attr(struct kvm_device *dev,
820 struct kvm_device_attr *attr)
821{
822 phys_addr_t offset;
823
824 switch (attr->group) {
825 case KVM_DEV_ARM_VGIC_GRP_ADDR:
826 switch (attr->attr) {
827 case KVM_VGIC_V2_ADDR_TYPE_DIST:
828 case KVM_VGIC_V2_ADDR_TYPE_CPU:
829 return 0;
830 }
831 break;
832 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
833 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
834 return vgic_has_attr_regs(vgic_dist_ranges, offset);
835 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
836 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
837 return vgic_has_attr_regs(vgic_cpu_ranges, offset);
838 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
839 return 0;
840 case KVM_DEV_ARM_VGIC_GRP_CTRL:
841 switch (attr->attr) {
842 case KVM_DEV_ARM_VGIC_CTRL_INIT:
843 return 0;
844 }
845 }
846 return -ENXIO;
847}
848
849struct kvm_device_ops kvm_arm_vgic_v2_ops = {
850 .name = "kvm-arm-vgic-v2",
851 .create = vgic_v2_create,
852 .destroy = vgic_v2_destroy,
853 .set_attr = vgic_v2_set_attr,
854 .get_attr = vgic_v2_get_attr,
855 .has_attr = vgic_v2_has_attr,
856};
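
vgic_get_target_reg() and vgic_set_target_reg() above pack one byte per SPI into each 32-bit GICD_ITARGETSR word; on a guest write, each byte is collapsed to a single target CPU by taking its lowest set bit, with an empty byte defaulting to CPU0. A small self-contained sketch of that byte-per-IRQ encoding; the function names are illustrative, only the layout mirrors the code above:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>

#define IRQS_PER_ITARGETSR	4	/* 32-bit register, 8 target bits per IRQ */

/* Pack per-IRQ target CPU numbers into one ITARGETSR-style word (one bit set per byte). */
static uint32_t pack_targets(const uint8_t cpu[IRQS_PER_ITARGETSR])
{
	uint32_t val = 0;
	int i;

	for (i = 0; i < IRQS_PER_ITARGETSR; i++)
		val |= 1u << (cpu[i] + i * 8);

	return val;
}

/* Unpack a written word, taking the lowest set bit of each byte (CPU0 if the byte is 0). */
static void unpack_targets(uint32_t val, uint8_t cpu[IRQS_PER_ITARGETSR])
{
	int i;

	for (i = 0; i < IRQS_PER_ITARGETSR; i++) {
		int target = ffs((val >> (i * 8)) & 0xff);

		cpu[i] = target ? target - 1 : 0;
	}
}

int main(void)
{
	uint8_t in[IRQS_PER_ITARGETSR] = { 0, 1, 2, 3 };
	uint8_t out[IRQS_PER_ITARGETSR];

	unpack_targets(pack_targets(in), out);
	printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
	return 0;
}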
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
new file mode 100644
index 000000000000..f9b9c7c51372
--- /dev/null
+++ b/virt/kvm/arm/vgic-v2.c
@@ -0,0 +1,259 @@
1/*
2 * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved.
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <linux/cpu.h>
19#include <linux/kvm.h>
20#include <linux/kvm_host.h>
21#include <linux/interrupt.h>
22#include <linux/io.h>
23#include <linux/of.h>
24#include <linux/of_address.h>
25#include <linux/of_irq.h>
26
27#include <linux/irqchip/arm-gic.h>
28
29#include <asm/kvm_emulate.h>
30#include <asm/kvm_arm.h>
31#include <asm/kvm_mmu.h>
32
33static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
34{
35 struct vgic_lr lr_desc;
36 u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr];
37
38 lr_desc.irq = val & GICH_LR_VIRTUALID;
39 if (lr_desc.irq <= 15)
40 lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
41 else
42 lr_desc.source = 0;
43 lr_desc.state = 0;
44
45 if (val & GICH_LR_PENDING_BIT)
46 lr_desc.state |= LR_STATE_PENDING;
47 if (val & GICH_LR_ACTIVE_BIT)
48 lr_desc.state |= LR_STATE_ACTIVE;
49 if (val & GICH_LR_EOI)
50 lr_desc.state |= LR_EOI_INT;
51
52 return lr_desc;
53}
54
55static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
56 struct vgic_lr lr_desc)
57{
58 u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq;
59
60 if (lr_desc.state & LR_STATE_PENDING)
61 lr_val |= GICH_LR_PENDING_BIT;
62 if (lr_desc.state & LR_STATE_ACTIVE)
63 lr_val |= GICH_LR_ACTIVE_BIT;
64 if (lr_desc.state & LR_EOI_INT)
65 lr_val |= GICH_LR_EOI;
66
67 vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
68}
69
70static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
71 struct vgic_lr lr_desc)
72{
73 if (!(lr_desc.state & LR_STATE_MASK))
74 vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
75 else
76 vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
77}
78
79static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
80{
81 return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr;
82}
83
84static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
85{
86 return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
87}
88
89static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
90{
91 vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
92}
93
94static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
95{
96 u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
97 u32 ret = 0;
98
99 if (misr & GICH_MISR_EOI)
100 ret |= INT_STATUS_EOI;
101 if (misr & GICH_MISR_U)
102 ret |= INT_STATUS_UNDERFLOW;
103
104 return ret;
105}
106
107static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu)
108{
109 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE;
110}
111
112static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu)
113{
114 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE;
115}
116
117static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
118{
119 u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
120
121 vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT;
122 vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
123 vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT;
124 vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT;
125}
126
127static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
128{
129 u32 vmcr;
130
131 vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
132 vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK;
133 vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK;
134 vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
135
136 vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
137}
138
139static void vgic_v2_enable(struct kvm_vcpu *vcpu)
140{
141 /*
142 * By forcing VMCR to zero, the GIC will restore the binary
143 * points to their reset values. Anything else resets to zero
144 * anyway.
145 */
146 vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
147
148 /* Get the show on the road... */
149 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
150}
151
152static const struct vgic_ops vgic_v2_ops = {
153 .get_lr = vgic_v2_get_lr,
154 .set_lr = vgic_v2_set_lr,
155 .sync_lr_elrsr = vgic_v2_sync_lr_elrsr,
156 .get_elrsr = vgic_v2_get_elrsr,
157 .get_eisr = vgic_v2_get_eisr,
158 .clear_eisr = vgic_v2_clear_eisr,
159 .get_interrupt_status = vgic_v2_get_interrupt_status,
160 .enable_underflow = vgic_v2_enable_underflow,
161 .disable_underflow = vgic_v2_disable_underflow,
162 .get_vmcr = vgic_v2_get_vmcr,
163 .set_vmcr = vgic_v2_set_vmcr,
164 .enable = vgic_v2_enable,
165};
166
167static struct vgic_params vgic_v2_params;
168
169/**
170 * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
171 * @node: pointer to the DT node
172 * @ops: address of a pointer to the GICv2 operations
173 * @params: address of a pointer to HW-specific parameters
174 *
175 * Returns 0 if a GICv2 has been found, with the low level operations
176 * in *ops and the HW parameters in *params. Returns an error code
177 * otherwise.
178 */
179int vgic_v2_probe(struct device_node *vgic_node,
180 const struct vgic_ops **ops,
181 const struct vgic_params **params)
182{
183 int ret;
184 struct resource vctrl_res;
185 struct resource vcpu_res;
186 struct vgic_params *vgic = &vgic_v2_params;
187
188 vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0);
189 if (!vgic->maint_irq) {
190 kvm_err("error getting vgic maintenance irq from DT\n");
191 ret = -ENXIO;
192 goto out;
193 }
194
195 ret = of_address_to_resource(vgic_node, 2, &vctrl_res);
196 if (ret) {
197 kvm_err("Cannot obtain GICH resource\n");
198 goto out;
199 }
200
201 vgic->vctrl_base = of_iomap(vgic_node, 2);
202 if (!vgic->vctrl_base) {
203 kvm_err("Cannot ioremap GICH\n");
204 ret = -ENOMEM;
205 goto out;
206 }
207
208 vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
209 vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
210
211 ret = create_hyp_io_mappings(vgic->vctrl_base,
212 vgic->vctrl_base + resource_size(&vctrl_res),
213 vctrl_res.start);
214 if (ret) {
215 kvm_err("Cannot map VCTRL into hyp\n");
216 goto out_unmap;
217 }
218
219 if (of_address_to_resource(vgic_node, 3, &vcpu_res)) {
220 kvm_err("Cannot obtain GICV resource\n");
221 ret = -ENXIO;
222 goto out_unmap;
223 }
224
225 if (!PAGE_ALIGNED(vcpu_res.start)) {
226 kvm_err("GICV physical address 0x%llx not page aligned\n",
227 (unsigned long long)vcpu_res.start);
228 ret = -ENXIO;
229 goto out_unmap;
230 }
231
232 if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
233 kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
234 (unsigned long long)resource_size(&vcpu_res),
235 PAGE_SIZE);
236 ret = -ENXIO;
237 goto out_unmap;
238 }
239
240 vgic->can_emulate_gicv2 = true;
241 kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
242
243 vgic->vcpu_base = vcpu_res.start;
244
245 kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
246 vctrl_res.start, vgic->maint_irq);
247
248 vgic->type = VGIC_V2;
249 vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
250 *ops = &vgic_v2_ops;
251 *params = vgic;
252 goto out;
253
254out_unmap:
255 iounmap(vgic->vctrl_base);
256out:
257 of_node_put(vgic_node);
258 return ret;
259}
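
vgic_v2_get_lr() and vgic_v2_set_lr() above convert between the generic struct vgic_lr and the raw GICv2 list register word: virtual interrupt ID in the low bits, SGI source CPU in the CPUID field, and pending/active/EOI state bits further up. A minimal standalone encode/decode of that layout; the bit positions are meant to mirror the GICH_LR_* definitions, and the struct and function names are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Bit layout modelled after the GICH_LR_* definitions (an assumption of this sketch). */
#define LR_VIRTUALID_MASK	0x3ffu
#define LR_CPUID_SHIFT		10
#define LR_EOI			(1u << 19)
#define LR_PENDING		(1u << 28)
#define LR_ACTIVE		(1u << 29)

struct lr_desc {
	uint16_t irq;
	uint8_t  source;	/* only meaningful for SGIs (irq <= 15) */
	uint8_t  pending, active, eoi;
};

static uint32_t lr_encode(struct lr_desc d)
{
	uint32_t val = ((uint32_t)d.source << LR_CPUID_SHIFT) | d.irq;

	if (d.pending)
		val |= LR_PENDING;
	if (d.active)
		val |= LR_ACTIVE;
	if (d.eoi)
		val |= LR_EOI;

	return val;
}

static struct lr_desc lr_decode(uint32_t val)
{
	struct lr_desc d = {
		.irq	 = val & LR_VIRTUALID_MASK,
		.pending = !!(val & LR_PENDING),
		.active	 = !!(val & LR_ACTIVE),
		.eoi	 = !!(val & LR_EOI),
	};

	/* Only SGIs carry a source CPU in the CPUID field. */
	d.source = d.irq <= 15 ? (val >> LR_CPUID_SHIFT) & 0x7 : 0;

	return d;
}

int main(void)
{
	struct lr_desc d = { .irq = 3, .source = 2, .pending = 1 };
	struct lr_desc back = lr_decode(lr_encode(d));

	printf("irq=%u source=%u pending=%u active=%u\n",
	       (unsigned)back.irq, (unsigned)back.source,
	       (unsigned)back.pending, (unsigned)back.active);
	return 0;
}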
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
new file mode 100644
index 000000000000..e9c3a7a83833
--- /dev/null
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -0,0 +1,1030 @@
1/*
2 * GICv3 distributor and redistributor emulation
3 *
4 * GICv3 emulation is currently only supported on a GICv3 host (because
5 * we rely on the hardware's CPU interface virtualization support), but
6 * supports both hardware with or without the optional GICv2 backwards
7 * compatibility features.
8 *
9 * Limitations of the emulation:
10 * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
11 * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
12 * - We do not support the message based interrupts (MBIs) triggered by
13 * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
14 * - We do not support the (optional) backwards compatibility feature.
15 * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
 16 * the compatibility feature, you can use a GICv2 in the guest, though.
17 * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
18 * - Priorities are not emulated (same as the GICv2 emulation). Linux
19 * as a guest is fine with this, because it does not use priorities.
20 * - We only support Group1 interrupts. Again Linux uses only those.
21 *
22 * Copyright (C) 2014 ARM Ltd.
23 * Author: Andre Przywara <andre.przywara@arm.com>
24 *
25 * This program is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License version 2 as
27 * published by the Free Software Foundation.
28 *
29 * This program is distributed in the hope that it will be useful,
30 * but WITHOUT ANY WARRANTY; without even the implied warranty of
31 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32 * GNU General Public License for more details.
33 *
34 * You should have received a copy of the GNU General Public License
35 * along with this program. If not, see <http://www.gnu.org/licenses/>.
36 */
37
38#include <linux/cpu.h>
39#include <linux/kvm.h>
40#include <linux/kvm_host.h>
41#include <linux/interrupt.h>
42
43#include <linux/irqchip/arm-gic-v3.h>
44#include <kvm/arm_vgic.h>
45
46#include <asm/kvm_emulate.h>
47#include <asm/kvm_arm.h>
48#include <asm/kvm_mmu.h>
49
50#include "vgic.h"
51
52static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
53 struct kvm_exit_mmio *mmio, phys_addr_t offset)
54{
55 u32 reg = 0xffffffff;
56
57 vgic_reg_access(mmio, &reg, offset,
58 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
59
60 return false;
61}
62
63static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
64 struct kvm_exit_mmio *mmio, phys_addr_t offset)
65{
66 u32 reg = 0;
67
68 /*
69 * Force ARE and DS to 1, the guest cannot change this.
70 * For the time being we only support Group1 interrupts.
71 */
72 if (vcpu->kvm->arch.vgic.enabled)
73 reg = GICD_CTLR_ENABLE_SS_G1;
74 reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
75
76 vgic_reg_access(mmio, &reg, offset,
77 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
78 if (mmio->is_write) {
79 if (reg & GICD_CTLR_ENABLE_SS_G0)
80 kvm_info("guest tried to enable unsupported Group0 interrupts\n");
81 vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
82 vgic_update_state(vcpu->kvm);
83 return true;
84 }
85 return false;
86}
87
88/*
89 * As this implementation does not provide compatibility
90 * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
91 * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
92 * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
93 */
94#define INTERRUPT_ID_BITS 10
95static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
96 struct kvm_exit_mmio *mmio, phys_addr_t offset)
97{
98 u32 reg;
99
100 reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
101
102 reg |= (INTERRUPT_ID_BITS - 1) << 19;
103
104 vgic_reg_access(mmio, &reg, offset,
105 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
106
107 return false;
108}
109
110static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
111 struct kvm_exit_mmio *mmio, phys_addr_t offset)
112{
113 u32 reg;
114
115 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
116 vgic_reg_access(mmio, &reg, offset,
117 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
118
119 return false;
120}
121
122static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
123 struct kvm_exit_mmio *mmio,
124 phys_addr_t offset)
125{
126 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
127 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
128 vcpu->vcpu_id,
129 ACCESS_WRITE_SETBIT);
130
131 vgic_reg_access(mmio, NULL, offset,
132 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
133 return false;
134}
135
136static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
137 struct kvm_exit_mmio *mmio,
138 phys_addr_t offset)
139{
140 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
141 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
142 vcpu->vcpu_id,
143 ACCESS_WRITE_CLEARBIT);
144
145 vgic_reg_access(mmio, NULL, offset,
146 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
147 return false;
148}
149
150static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
151 struct kvm_exit_mmio *mmio,
152 phys_addr_t offset)
153{
154 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
155 return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
156 vcpu->vcpu_id);
157
158 vgic_reg_access(mmio, NULL, offset,
159 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
160 return false;
161}
162
163static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
164 struct kvm_exit_mmio *mmio,
165 phys_addr_t offset)
166{
167 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
168 return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
169 vcpu->vcpu_id);
170
171 vgic_reg_access(mmio, NULL, offset,
172 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
173 return false;
174}
175
176static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
177 struct kvm_exit_mmio *mmio,
178 phys_addr_t offset)
179{
180 u32 *reg;
181
182 if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
183 vgic_reg_access(mmio, NULL, offset,
184 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
185 return false;
186 }
187
188 reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
189 vcpu->vcpu_id, offset);
190 vgic_reg_access(mmio, reg, offset,
191 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
192 return false;
193}
194
195static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
196 struct kvm_exit_mmio *mmio,
197 phys_addr_t offset)
198{
199 u32 *reg;
200
201 if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
202 vgic_reg_access(mmio, NULL, offset,
203 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
204 return false;
205 }
206
207 reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
208 vcpu->vcpu_id, offset >> 1);
209
210 return vgic_handle_cfg_reg(reg, mmio, offset);
211}
212
213/*
214 * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
215 * when we store the target MPIDR written by the guest.
216 */
217static u32 compress_mpidr(unsigned long mpidr)
218{
219 u32 ret;
220
221 ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
222 ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
223 ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
224 ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
225
226 return ret;
227}
228
229static unsigned long uncompress_mpidr(u32 value)
230{
231 unsigned long mpidr;
232
233 mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
234 mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
235 mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
236 mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
237
238 return mpidr;
239}
240
241/*
242 * Lookup the given MPIDR value to get the vcpu_id (if there is one)
243 * and store that in the irq_spi_cpu[] array.
 244 * This limits the number of VCPUs to 255 for now; extending the data
245 * type (or storing kvm_vcpu pointers) should lift the limit.
246 * Store the original MPIDR value in an extra array to support read-as-written.
247 * Unallocated MPIDRs are translated to a special value and caught
248 * before any array accesses.
249 */
250static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
251 struct kvm_exit_mmio *mmio,
252 phys_addr_t offset)
253{
254 struct kvm *kvm = vcpu->kvm;
255 struct vgic_dist *dist = &kvm->arch.vgic;
256 int spi;
257 u32 reg;
258 int vcpu_id;
259 unsigned long *bmap, mpidr;
260
261 /*
262 * The upper 32 bits of each 64 bit register are zero,
263 * as we don't support Aff3.
264 */
265 if ((offset & 4)) {
266 vgic_reg_access(mmio, NULL, offset,
267 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
268 return false;
269 }
270
271 /* This region only covers SPIs, so no handling of private IRQs here. */
272 spi = offset / 8;
273
274 /* get the stored MPIDR for this IRQ */
275 mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
276 reg = mpidr;
277
278 vgic_reg_access(mmio, &reg, offset,
279 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
280
281 if (!mmio->is_write)
282 return false;
283
284 /*
285 * Now clear the currently assigned vCPU from the map, making room
286 * for the new one to be written below
287 */
288 vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
289 if (likely(vcpu)) {
290 vcpu_id = vcpu->vcpu_id;
291 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
292 __clear_bit(spi, bmap);
293 }
294
295 dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
296 vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
297
298 /*
299 * The spec says that non-existent MPIDR values should not be
300 * forwarded to any existent (v)CPU, but should be able to become
301 * pending anyway. We simply keep the irq_spi_target[] array empty, so
302 * the interrupt will never be injected.
303 * irq_spi_cpu[irq] gets a magic value in this case.
304 */
305 if (likely(vcpu)) {
306 vcpu_id = vcpu->vcpu_id;
307 dist->irq_spi_cpu[spi] = vcpu_id;
308 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
309 __set_bit(spi, bmap);
310 } else {
311 dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
312 }
313
314 vgic_update_state(kvm);
315
316 return true;
317}
318
319/*
320 * We should be careful about promising too much when a guest reads
321 * this register. Don't claim to be like any hardware implementation,
322 * but just report the GIC as version 3 - which is what a Linux guest
323 * would check.
324 */
325static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
326 struct kvm_exit_mmio *mmio,
327 phys_addr_t offset)
328{
329 u32 reg = 0;
330
331 switch (offset + GICD_IDREGS) {
332 case GICD_PIDR2:
333 reg = 0x3b;
334 break;
335 }
336
337 vgic_reg_access(mmio, &reg, offset,
338 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
339
340 return false;
341}
342
343static const struct vgic_io_range vgic_v3_dist_ranges[] = {
344 {
345 .base = GICD_CTLR,
346 .len = 0x04,
347 .bits_per_irq = 0,
348 .handle_mmio = handle_mmio_ctlr,
349 },
350 {
351 .base = GICD_TYPER,
352 .len = 0x04,
353 .bits_per_irq = 0,
354 .handle_mmio = handle_mmio_typer,
355 },
356 {
357 .base = GICD_IIDR,
358 .len = 0x04,
359 .bits_per_irq = 0,
360 .handle_mmio = handle_mmio_iidr,
361 },
362 {
363 /* this register is optional, it is RAZ/WI if not implemented */
364 .base = GICD_STATUSR,
365 .len = 0x04,
366 .bits_per_irq = 0,
367 .handle_mmio = handle_mmio_raz_wi,
368 },
369 {
370 /* this write only register is WI when TYPER.MBIS=0 */
371 .base = GICD_SETSPI_NSR,
372 .len = 0x04,
373 .bits_per_irq = 0,
374 .handle_mmio = handle_mmio_raz_wi,
375 },
376 {
377 /* this write only register is WI when TYPER.MBIS=0 */
378 .base = GICD_CLRSPI_NSR,
379 .len = 0x04,
380 .bits_per_irq = 0,
381 .handle_mmio = handle_mmio_raz_wi,
382 },
383 {
384 /* this is RAZ/WI when DS=1 */
385 .base = GICD_SETSPI_SR,
386 .len = 0x04,
387 .bits_per_irq = 0,
388 .handle_mmio = handle_mmio_raz_wi,
389 },
390 {
391 /* this is RAZ/WI when DS=1 */
392 .base = GICD_CLRSPI_SR,
393 .len = 0x04,
394 .bits_per_irq = 0,
395 .handle_mmio = handle_mmio_raz_wi,
396 },
397 {
398 .base = GICD_IGROUPR,
399 .len = 0x80,
400 .bits_per_irq = 1,
401 .handle_mmio = handle_mmio_rao_wi,
402 },
403 {
404 .base = GICD_ISENABLER,
405 .len = 0x80,
406 .bits_per_irq = 1,
407 .handle_mmio = handle_mmio_set_enable_reg_dist,
408 },
409 {
410 .base = GICD_ICENABLER,
411 .len = 0x80,
412 .bits_per_irq = 1,
413 .handle_mmio = handle_mmio_clear_enable_reg_dist,
414 },
415 {
416 .base = GICD_ISPENDR,
417 .len = 0x80,
418 .bits_per_irq = 1,
419 .handle_mmio = handle_mmio_set_pending_reg_dist,
420 },
421 {
422 .base = GICD_ICPENDR,
423 .len = 0x80,
424 .bits_per_irq = 1,
425 .handle_mmio = handle_mmio_clear_pending_reg_dist,
426 },
427 {
428 .base = GICD_ISACTIVER,
429 .len = 0x80,
430 .bits_per_irq = 1,
431 .handle_mmio = handle_mmio_raz_wi,
432 },
433 {
434 .base = GICD_ICACTIVER,
435 .len = 0x80,
436 .bits_per_irq = 1,
437 .handle_mmio = handle_mmio_raz_wi,
438 },
439 {
440 .base = GICD_IPRIORITYR,
441 .len = 0x400,
442 .bits_per_irq = 8,
443 .handle_mmio = handle_mmio_priority_reg_dist,
444 },
445 {
446 /* TARGETSRn is RES0 when ARE=1 */
447 .base = GICD_ITARGETSR,
448 .len = 0x400,
449 .bits_per_irq = 8,
450 .handle_mmio = handle_mmio_raz_wi,
451 },
452 {
453 .base = GICD_ICFGR,
454 .len = 0x100,
455 .bits_per_irq = 2,
456 .handle_mmio = handle_mmio_cfg_reg_dist,
457 },
458 {
459 /* this is RAZ/WI when DS=1 */
460 .base = GICD_IGRPMODR,
461 .len = 0x80,
462 .bits_per_irq = 1,
463 .handle_mmio = handle_mmio_raz_wi,
464 },
465 {
466 /* this is RAZ/WI when DS=1 */
467 .base = GICD_NSACR,
468 .len = 0x100,
469 .bits_per_irq = 2,
470 .handle_mmio = handle_mmio_raz_wi,
471 },
472 {
473 /* this is RAZ/WI when ARE=1 */
474 .base = GICD_SGIR,
475 .len = 0x04,
476 .handle_mmio = handle_mmio_raz_wi,
477 },
478 {
479 /* this is RAZ/WI when ARE=1 */
480 .base = GICD_CPENDSGIR,
481 .len = 0x10,
482 .handle_mmio = handle_mmio_raz_wi,
483 },
484 {
485 /* this is RAZ/WI when ARE=1 */
486 .base = GICD_SPENDSGIR,
487 .len = 0x10,
488 .handle_mmio = handle_mmio_raz_wi,
489 },
490 {
491 .base = GICD_IROUTER + 0x100,
492 .len = 0x1ee0,
493 .bits_per_irq = 64,
494 .handle_mmio = handle_mmio_route_reg,
495 },
496 {
497 .base = GICD_IDREGS,
498 .len = 0x30,
499 .bits_per_irq = 0,
500 .handle_mmio = handle_mmio_idregs,
501 },
502 {},
503};
504
505static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
506 struct kvm_exit_mmio *mmio,
507 phys_addr_t offset)
508{
509 /* since we don't support LPIs, this register is zero for now */
510 vgic_reg_access(mmio, NULL, offset,
511 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
512 return false;
513}
514
515static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
516 struct kvm_exit_mmio *mmio,
517 phys_addr_t offset)
518{
519 u32 reg;
520 u64 mpidr;
521 struct kvm_vcpu *redist_vcpu = mmio->private;
522 int target_vcpu_id = redist_vcpu->vcpu_id;
523
524 /* the upper 32 bits contain the affinity value */
525 if ((offset & ~3) == 4) {
526 mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
527 reg = compress_mpidr(mpidr);
528
529 vgic_reg_access(mmio, &reg, offset,
530 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
531 return false;
532 }
533
534 reg = redist_vcpu->vcpu_id << 8;
535 if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
536 reg |= GICR_TYPER_LAST;
537 vgic_reg_access(mmio, &reg, offset,
538 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
539 return false;
540}
541
542static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
543 struct kvm_exit_mmio *mmio,
544 phys_addr_t offset)
545{
546 struct kvm_vcpu *redist_vcpu = mmio->private;
547
548 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
549 redist_vcpu->vcpu_id,
550 ACCESS_WRITE_SETBIT);
551}
552
553static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
554 struct kvm_exit_mmio *mmio,
555 phys_addr_t offset)
556{
557 struct kvm_vcpu *redist_vcpu = mmio->private;
558
559 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
560 redist_vcpu->vcpu_id,
561 ACCESS_WRITE_CLEARBIT);
562}
563
564static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
565 struct kvm_exit_mmio *mmio,
566 phys_addr_t offset)
567{
568 struct kvm_vcpu *redist_vcpu = mmio->private;
569
570 return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
571 redist_vcpu->vcpu_id);
572}
573
574static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
575 struct kvm_exit_mmio *mmio,
576 phys_addr_t offset)
577{
578 struct kvm_vcpu *redist_vcpu = mmio->private;
579
580 return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
581 redist_vcpu->vcpu_id);
582}
583
584static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
585 struct kvm_exit_mmio *mmio,
586 phys_addr_t offset)
587{
588 struct kvm_vcpu *redist_vcpu = mmio->private;
589 u32 *reg;
590
591 reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
592 redist_vcpu->vcpu_id, offset);
593 vgic_reg_access(mmio, reg, offset,
594 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
595 return false;
596}
597
598static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
599 struct kvm_exit_mmio *mmio,
600 phys_addr_t offset)
601{
602 struct kvm_vcpu *redist_vcpu = mmio->private;
603
604 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
605 redist_vcpu->vcpu_id, offset >> 1);
606
607 return vgic_handle_cfg_reg(reg, mmio, offset);
608}
609
610#define SGI_base(x) ((x) + SZ_64K)
611
612static const struct vgic_io_range vgic_redist_ranges[] = {
613 {
614 .base = GICR_CTLR,
615 .len = 0x04,
616 .bits_per_irq = 0,
617 .handle_mmio = handle_mmio_ctlr_redist,
618 },
619 {
620 .base = GICR_TYPER,
621 .len = 0x08,
622 .bits_per_irq = 0,
623 .handle_mmio = handle_mmio_typer_redist,
624 },
625 {
626 .base = GICR_IIDR,
627 .len = 0x04,
628 .bits_per_irq = 0,
629 .handle_mmio = handle_mmio_iidr,
630 },
631 {
632 .base = GICR_WAKER,
633 .len = 0x04,
634 .bits_per_irq = 0,
635 .handle_mmio = handle_mmio_raz_wi,
636 },
637 {
638 .base = GICR_IDREGS,
639 .len = 0x30,
640 .bits_per_irq = 0,
641 .handle_mmio = handle_mmio_idregs,
642 },
643 {
644 .base = SGI_base(GICR_IGROUPR0),
645 .len = 0x04,
646 .bits_per_irq = 1,
647 .handle_mmio = handle_mmio_rao_wi,
648 },
649 {
650 .base = SGI_base(GICR_ISENABLER0),
651 .len = 0x04,
652 .bits_per_irq = 1,
653 .handle_mmio = handle_mmio_set_enable_reg_redist,
654 },
655 {
656 .base = SGI_base(GICR_ICENABLER0),
657 .len = 0x04,
658 .bits_per_irq = 1,
659 .handle_mmio = handle_mmio_clear_enable_reg_redist,
660 },
661 {
662 .base = SGI_base(GICR_ISPENDR0),
663 .len = 0x04,
664 .bits_per_irq = 1,
665 .handle_mmio = handle_mmio_set_pending_reg_redist,
666 },
667 {
668 .base = SGI_base(GICR_ICPENDR0),
669 .len = 0x04,
670 .bits_per_irq = 1,
671 .handle_mmio = handle_mmio_clear_pending_reg_redist,
672 },
673 {
674 .base = SGI_base(GICR_ISACTIVER0),
675 .len = 0x04,
676 .bits_per_irq = 1,
677 .handle_mmio = handle_mmio_raz_wi,
678 },
679 {
680 .base = SGI_base(GICR_ICACTIVER0),
681 .len = 0x04,
682 .bits_per_irq = 1,
683 .handle_mmio = handle_mmio_raz_wi,
684 },
685 {
686 .base = SGI_base(GICR_IPRIORITYR0),
687 .len = 0x20,
688 .bits_per_irq = 8,
689 .handle_mmio = handle_mmio_priority_reg_redist,
690 },
691 {
692 .base = SGI_base(GICR_ICFGR0),
693 .len = 0x08,
694 .bits_per_irq = 2,
695 .handle_mmio = handle_mmio_cfg_reg_redist,
696 },
697 {
698 .base = SGI_base(GICR_IGRPMODR0),
699 .len = 0x04,
700 .bits_per_irq = 1,
701 .handle_mmio = handle_mmio_raz_wi,
702 },
703 {
704 .base = SGI_base(GICR_NSACR),
705 .len = 0x04,
706 .handle_mmio = handle_mmio_raz_wi,
707 },
708 {},
709};
710
711static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
712{
713 if (vgic_queue_irq(vcpu, 0, irq)) {
714 vgic_dist_irq_clear_pending(vcpu, irq);
715 vgic_cpu_irq_clear(vcpu, irq);
716 return true;
717 }
718
719 return false;
720}
721
722static int vgic_v3_map_resources(struct kvm *kvm,
723 const struct vgic_params *params)
724{
725 int ret = 0;
726 struct vgic_dist *dist = &kvm->arch.vgic;
727 gpa_t rdbase = dist->vgic_redist_base;
728 struct vgic_io_device *iodevs = NULL;
729 int i;
730
731 if (!irqchip_in_kernel(kvm))
732 return 0;
733
734 mutex_lock(&kvm->lock);
735
736 if (vgic_ready(kvm))
737 goto out;
738
739 if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
740 IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
741 kvm_err("Need to set vgic distributor addresses first\n");
742 ret = -ENXIO;
743 goto out;
744 }
745
746 /*
747 * For a VGICv3 we require userland to explicitly initialize
748 * the VGIC before we need to use it.
749 */
750 if (!vgic_initialized(kvm)) {
751 ret = -EBUSY;
752 goto out;
753 }
754
755 ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
756 GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
757 -1, &dist->dist_iodev);
758 if (ret)
759 goto out;
760
761 iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
762 if (!iodevs) {
763 ret = -ENOMEM;
764 goto out_unregister;
765 }
766
767 for (i = 0; i < dist->nr_cpus; i++) {
768 ret = vgic_register_kvm_io_dev(kvm, rdbase,
769 SZ_128K, vgic_redist_ranges,
770 i, &iodevs[i]);
771 if (ret)
772 goto out_unregister;
773 rdbase += GIC_V3_REDIST_SIZE;
774 }
775
776 dist->redist_iodevs = iodevs;
777 dist->ready = true;
778 goto out;
779
780out_unregister:
781 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
782 if (iodevs) {
783 for (i = 0; i < dist->nr_cpus; i++) {
784 if (iodevs[i].dev.ops)
785 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
786 &iodevs[i].dev);
787 }
788 }
789
790out:
791 if (ret)
792 kvm_vgic_destroy(kvm);
793 mutex_unlock(&kvm->lock);
794 return ret;
795}
796
797static int vgic_v3_init_model(struct kvm *kvm)
798{
799 int i;
800 u32 mpidr;
801 struct vgic_dist *dist = &kvm->arch.vgic;
802 int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
803
804 dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
805 GFP_KERNEL);
806
807 if (!dist->irq_spi_mpidr)
808 return -ENOMEM;
809
810 /* Initialize the target VCPUs for each IRQ to VCPU 0 */
811 mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
812 for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
813 dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
814 dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
815 vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
816 }
817
818 return 0;
819}
820
821/* GICv3 does not keep track of SGI sources anymore. */
822static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
823{
824}
825
826void vgic_v3_init_emulation(struct kvm *kvm)
827{
828 struct vgic_dist *dist = &kvm->arch.vgic;
829
830 dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
831 dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
832 dist->vm_ops.init_model = vgic_v3_init_model;
833 dist->vm_ops.map_resources = vgic_v3_map_resources;
834
835 kvm->arch.max_vcpus = KVM_MAX_VCPUS;
836}
837
838/*
839 * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
840 * generation register ICC_SGI1R_EL1) with a given VCPU.
841 * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
842 * return -1.
843 */
844static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
845{
846 unsigned long affinity;
847 int level0;
848
849 /*
850 * Split the current VCPU's MPIDR into affinity level 0 and the
851 * rest as this is what we have to compare against.
852 */
853 affinity = kvm_vcpu_get_mpidr_aff(vcpu);
854 level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
855 affinity &= ~MPIDR_LEVEL_MASK;
856
857 /* bail out if the upper three levels don't match */
858 if (sgi_aff != affinity)
859 return -1;
860
861 /* Is this VCPU's bit set in the mask? */
862 if (!(sgi_cpu_mask & BIT(level0)))
863 return -1;
864
865 return level0;
866}
867
868#define SGI_AFFINITY_LEVEL(reg, level) \
869 ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
870 >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
871
872/**
873 * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
874 * @vcpu: The VCPU requesting a SGI
875 * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
876 *
877 * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
878 * This will trap in sys_regs.c and call this function.
879 * The ICC_SGI1R_EL1 register contains the upper three affinity levels of the
880 * target processors as well as a bitmask of 16 Aff0 CPUs.
881 * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
882 * check for matching ones. If this bit is set, we signal all VCPUs except
883 * the calling one.
884 */
885void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
886{
887 struct kvm *kvm = vcpu->kvm;
888 struct kvm_vcpu *c_vcpu;
889 struct vgic_dist *dist = &kvm->arch.vgic;
890 u16 target_cpus;
891 u64 mpidr;
892 int sgi, c;
893 int vcpu_id = vcpu->vcpu_id;
894 bool broadcast;
895 int updated = 0;
896
897 sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
898 broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
899 target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
900 mpidr = SGI_AFFINITY_LEVEL(reg, 3);
901 mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
902 mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
903
904 /*
905 * We take the dist lock here, because we come from the sysregs
906 * code path and not from the MMIO one (which already takes the lock).
907 */
908 spin_lock(&dist->lock);
909
910 /*
911 * We iterate over all VCPUs to find the MPIDRs matching the request.
912 * If we have handled one CPU, we clear its bit to detect early
913 * if we are already finished. This avoids iterating through all
914 * VCPUs when most of the time we just signal a single VCPU.
915 */
916 kvm_for_each_vcpu(c, c_vcpu, kvm) {
917
918 /* Exit early if we have dealt with all requested CPUs */
919 if (!broadcast && target_cpus == 0)
920 break;
921
922 /* Don't signal the calling VCPU */
923 if (broadcast && c == vcpu_id)
924 continue;
925
926 if (!broadcast) {
927 int level0;
928
929 level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
930 if (level0 == -1)
931 continue;
932
933 /* remove this matching VCPU from the mask */
934 target_cpus &= ~BIT(level0);
935 }
936
937 /* Flag the SGI as pending */
938 vgic_dist_irq_set_pending(c_vcpu, sgi);
939 updated = 1;
940 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
941 }
942 if (updated)
943 vgic_update_state(vcpu->kvm);
944 spin_unlock(&dist->lock);
945 if (updated)
946 vgic_kick_vcpus(vcpu->kvm);
947}
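To illustrate the register layout described above, this is how a guest might compose an ICC_SGI1R_EL1 value for a targeted (non-broadcast) SGI. Editor's sketch, not part of this patch; the constants are the ones vgic_v3_dispatch_sgi() already decodes, the function name is made up:

/* Send SGI 5 to Aff0 CPUs 0 and 2 in the cluster Aff3.Aff2.Aff1 = 0.0.1 */
static u64 example_icc_sgi1r_value(void)
{
	u64 reg = 0;

	reg |= (u64)5 << ICC_SGI1R_SGI_ID_SHIFT;	/* SGI number */
	reg |= (u64)0x5 << ICC_SGI1R_TARGET_LIST_SHIFT;	/* Aff0 bits 0 and 2 */
	reg |= (u64)1 << ICC_SGI1R_AFFINITY_1_SHIFT;	/* Aff1 = 1 */
	/* Aff2/Aff3 stay 0; routing mode bit clear => use the target list */

	return reg;
}

vgic_v3_dispatch_sgi() would decode this into sgi == 5, target_cpus == 0x5 and an mpidr with Aff1 == 1, then flag SGI 5 as pending on the two matching VCPUs.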
948
949static int vgic_v3_create(struct kvm_device *dev, u32 type)
950{
951 return kvm_vgic_create(dev->kvm, type);
952}
953
954static void vgic_v3_destroy(struct kvm_device *dev)
955{
956 kfree(dev);
957}
958
959static int vgic_v3_set_attr(struct kvm_device *dev,
960 struct kvm_device_attr *attr)
961{
962 int ret;
963
964 ret = vgic_set_common_attr(dev, attr);
965 if (ret != -ENXIO)
966 return ret;
967
968 switch (attr->group) {
969 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
970 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
971 return -ENXIO;
972 }
973
974 return -ENXIO;
975}
976
977static int vgic_v3_get_attr(struct kvm_device *dev,
978 struct kvm_device_attr *attr)
979{
980 int ret;
981
982 ret = vgic_get_common_attr(dev, attr);
983 if (ret != -ENXIO)
984 return ret;
985
986 switch (attr->group) {
987 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
988 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
989 return -ENXIO;
990 }
991
992 return -ENXIO;
993}
994
995static int vgic_v3_has_attr(struct kvm_device *dev,
996 struct kvm_device_attr *attr)
997{
998 switch (attr->group) {
999 case KVM_DEV_ARM_VGIC_GRP_ADDR:
1000 switch (attr->attr) {
1001 case KVM_VGIC_V2_ADDR_TYPE_DIST:
1002 case KVM_VGIC_V2_ADDR_TYPE_CPU:
1003 return -ENXIO;
1004 case KVM_VGIC_V3_ADDR_TYPE_DIST:
1005 case KVM_VGIC_V3_ADDR_TYPE_REDIST:
1006 return 0;
1007 }
1008 break;
1009 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
1010 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
1011 return -ENXIO;
1012 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
1013 return 0;
1014 case KVM_DEV_ARM_VGIC_GRP_CTRL:
1015 switch (attr->attr) {
1016 case KVM_DEV_ARM_VGIC_CTRL_INIT:
1017 return 0;
1018 }
1019 }
1020 return -ENXIO;
1021}
1022
1023struct kvm_device_ops kvm_arm_vgic_v3_ops = {
1024 .name = "kvm-arm-vgic-v3",
1025 .create = vgic_v3_create,
1026 .destroy = vgic_v3_destroy,
1027 .set_attr = vgic_v3_set_attr,
1028 .get_attr = vgic_v3_get_attr,
1029 .has_attr = vgic_v3_has_attr,
1030};
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
new file mode 100644
index 000000000000..dff06021e748
--- /dev/null
+++ b/virt/kvm/arm/vgic-v3.c
@@ -0,0 +1,287 @@
1/*
2 * Copyright (C) 2013 ARM Limited, All Rights Reserved.
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <linux/cpu.h>
19#include <linux/kvm.h>
20#include <linux/kvm_host.h>
21#include <linux/interrupt.h>
22#include <linux/io.h>
23#include <linux/of.h>
24#include <linux/of_address.h>
25#include <linux/of_irq.h>
26
27#include <linux/irqchip/arm-gic-v3.h>
28
29#include <asm/kvm_emulate.h>
30#include <asm/kvm_arm.h>
31#include <asm/kvm_mmu.h>
32
33/* These are for GICv2 emulation only */
34#define GICH_LR_VIRTUALID (0x3ffUL << 0)
35#define GICH_LR_PHYSID_CPUID_SHIFT (10)
36#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
37#define ICH_LR_VIRTUALID_MASK (BIT_ULL(32) - 1)
38
39/*
40 * LRs are stored in reverse order in memory. Make sure we index them
41 * correctly.
42 */
43#define LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr)
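A small sanity-check sketch of the reverse mapping (editor's illustration, not part of this patch; it only assumes VGIC_V3_MAX_LRS remains a compile-time constant):

static inline void lr_index_example(void)
{
	/* Hardware LR 0 is stored last in vgic_lr[], the highest LR first. */
	BUILD_BUG_ON(LR_INDEX(0) != VGIC_V3_MAX_LRS - 1);
	BUILD_BUG_ON(LR_INDEX(VGIC_V3_MAX_LRS - 1) != 0);
}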
44
45static u32 ich_vtr_el2;
46
47static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
48{
49 struct vgic_lr lr_desc;
50 u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)];
51
52 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
53 lr_desc.irq = val & ICH_LR_VIRTUALID_MASK;
54 else
55 lr_desc.irq = val & GICH_LR_VIRTUALID;
56
57 lr_desc.source = 0;
58 if (lr_desc.irq <= 15 &&
59 vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
60 lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
61
62 lr_desc.state = 0;
63
64 if (val & ICH_LR_PENDING_BIT)
65 lr_desc.state |= LR_STATE_PENDING;
66 if (val & ICH_LR_ACTIVE_BIT)
67 lr_desc.state |= LR_STATE_ACTIVE;
68 if (val & ICH_LR_EOI)
69 lr_desc.state |= LR_EOI_INT;
70
71 return lr_desc;
72}
73
74static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
75 struct vgic_lr lr_desc)
76{
77 u64 lr_val;
78
79 lr_val = lr_desc.irq;
80
81 /*
82 * Currently all guest IRQs are Group1, as Group0 would result
83 * in a FIQ in the guest, which it wouldn't expect.
84 * Eventually we want to make this configurable, so we may revisit
85 * this in the future.
86 */
87 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
88 lr_val |= ICH_LR_GROUP;
89 else
90 lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
91
92 if (lr_desc.state & LR_STATE_PENDING)
93 lr_val |= ICH_LR_PENDING_BIT;
94 if (lr_desc.state & LR_STATE_ACTIVE)
95 lr_val |= ICH_LR_ACTIVE_BIT;
96 if (lr_desc.state & LR_EOI_INT)
97 lr_val |= ICH_LR_EOI;
98
99 vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
100}
101
102static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
103 struct vgic_lr lr_desc)
104{
105 if (!(lr_desc.state & LR_STATE_MASK))
106 vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
107 else
108 vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
109}
110
111static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
112{
113 return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr;
114}
115
116static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
117{
118 return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
119}
120
121static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
122{
123 vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
124}
125
126static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
127{
128 u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
129 u32 ret = 0;
130
131 if (misr & ICH_MISR_EOI)
132 ret |= INT_STATUS_EOI;
133 if (misr & ICH_MISR_U)
134 ret |= INT_STATUS_UNDERFLOW;
135
136 return ret;
137}
138
139static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
140{
141 u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
142
143 vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
144 vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
145 vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
146 vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
147}
148
149static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu)
150{
151 vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE;
152}
153
154static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu)
155{
156 vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE;
157}
158
159static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
160{
161 u32 vmcr;
162
163 vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
164 vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
165 vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
166 vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
167
168 vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
169}
170
171static void vgic_v3_enable(struct kvm_vcpu *vcpu)
172{
173 struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
174
175 /*
176 * Forcing VMCR to zero makes the GIC restore the binary
177 * points to their reset values. Anything else resets to zero
178 * anyway.
179 */
180 vgic_v3->vgic_vmcr = 0;
181
182 /*
183 * If we are emulating a GICv3, we do it in a non-GICv2-compatible
184 * way, so we force SRE to 1 to demonstrate this to the guest.
185 * This goes with the spec allowing the value to be RAO/WI.
186 */
187 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
188 vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
189 else
190 vgic_v3->vgic_sre = 0;
191
192 /* Get the show on the road... */
193 vgic_v3->vgic_hcr = ICH_HCR_EN;
194}
195
196static const struct vgic_ops vgic_v3_ops = {
197 .get_lr = vgic_v3_get_lr,
198 .set_lr = vgic_v3_set_lr,
199 .sync_lr_elrsr = vgic_v3_sync_lr_elrsr,
200 .get_elrsr = vgic_v3_get_elrsr,
201 .get_eisr = vgic_v3_get_eisr,
202 .clear_eisr = vgic_v3_clear_eisr,
203 .get_interrupt_status = vgic_v3_get_interrupt_status,
204 .enable_underflow = vgic_v3_enable_underflow,
205 .disable_underflow = vgic_v3_disable_underflow,
206 .get_vmcr = vgic_v3_get_vmcr,
207 .set_vmcr = vgic_v3_set_vmcr,
208 .enable = vgic_v3_enable,
209};
210
211static struct vgic_params vgic_v3_params;
212
213/**
214 * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
215 * @node: pointer to the DT node
216 * @ops: address of a pointer to the GICv3 operations
217 * @params: address of a pointer to HW-specific parameters
218 *
219 * Returns 0 if a GICv3 has been found, with the low level operations
220 * in *ops and the HW parameters in *params. Returns an error code
221 * otherwise.
222 */
223int vgic_v3_probe(struct device_node *vgic_node,
224 const struct vgic_ops **ops,
225 const struct vgic_params **params)
226{
227 int ret = 0;
228 u32 gicv_idx;
229 struct resource vcpu_res;
230 struct vgic_params *vgic = &vgic_v3_params;
231
232 vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0);
233 if (!vgic->maint_irq) {
234 kvm_err("error getting vgic maintenance irq from DT\n");
235 ret = -ENXIO;
236 goto out;
237 }
238
239 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
240
241 /*
242 * The ListRegs field is 5 bits, but there is an architectural
243 * maximum of 16 list registers. Just ignore bit 4...
244 */
245 vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
246 vgic->can_emulate_gicv2 = false;
247
248 if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx))
249 gicv_idx = 1;
250
251 gicv_idx += 3; /* Also skip GICD, GICC, GICH */
252 if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) {
253 kvm_info("GICv3: no GICV resource entry\n");
254 vgic->vcpu_base = 0;
255 } else if (!PAGE_ALIGNED(vcpu_res.start)) {
256 pr_warn("GICV physical address 0x%llx not page aligned\n",
257 (unsigned long long)vcpu_res.start);
258 vgic->vcpu_base = 0;
259 } else if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
260 pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
261 (unsigned long long)resource_size(&vcpu_res),
262 PAGE_SIZE);
263 vgic->vcpu_base = 0;
264 } else {
265 vgic->vcpu_base = vcpu_res.start;
266 vgic->can_emulate_gicv2 = true;
267 kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
268 KVM_DEV_TYPE_ARM_VGIC_V2);
269 }
270 if (vgic->vcpu_base == 0)
271 kvm_info("disabling GICv2 emulation\n");
272 kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
273
274 vgic->vctrl_base = NULL;
275 vgic->type = VGIC_V3;
276 vgic->max_gic_vcpus = KVM_MAX_VCPUS;
277
278 kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
279 vcpu_res.start, vgic->maint_irq);
280
281 *ops = &vgic_v3_ops;
282 *params = vgic;
283
284out:
285 of_node_put(vgic_node);
286 return ret;
287}
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 476d3bf540a8..8d550ff14700 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -31,26 +31,30 @@
31#include <asm/kvm_emulate.h> 31#include <asm/kvm_emulate.h>
32#include <asm/kvm_arm.h> 32#include <asm/kvm_arm.h>
33#include <asm/kvm_mmu.h> 33#include <asm/kvm_mmu.h>
34#include <trace/events/kvm.h>
35#include <asm/kvm.h>
36#include <kvm/iodev.h>
34 37
35/* 38/*
36 * How the whole thing works (courtesy of Christoffer Dall): 39 * How the whole thing works (courtesy of Christoffer Dall):
37 * 40 *
38 * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if 41 * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
39 * something is pending 42 * something is pending on the CPU interface.
40 * - VGIC pending interrupts are stored on the vgic.irq_state vgic 43 * - Interrupts that are pending on the distributor are stored on the
41 * bitmap (this bitmap is updated by both user land ioctls and guest 44 * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
42 * mmio ops, and other in-kernel peripherals such as the 45 * ioctls and guest mmio ops, and other in-kernel peripherals such as the
43 * arch. timers) and indicate the 'wire' state. 46 * arch. timers).
44 * - Every time the bitmap changes, the irq_pending_on_cpu oracle is 47 * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
45 * recalculated 48 * recalculated
46 * - To calculate the oracle, we need info for each cpu from 49 * - To calculate the oracle, we need info for each cpu from
47 * compute_pending_for_cpu, which considers: 50 * compute_pending_for_cpu, which considers:
48 * - PPI: dist->irq_state & dist->irq_enable 51 * - PPI: dist->irq_pending & dist->irq_enable
49 * - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target 52 * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
50 * - irq_spi_target is a 'formatted' version of the GICD_ICFGR 53 * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
51 * registers, stored on each vcpu. We only keep one bit of 54 * registers, stored on each vcpu. We only keep one bit of
52 * information per interrupt, making sure that only one vcpu can 55 * information per interrupt, making sure that only one vcpu can
53 * accept the interrupt. 56 * accept the interrupt.
57 * - If any of the above state changes, we must recalculate the oracle.
54 * - The same is true when injecting an interrupt, except that we only 58 * - The same is true when injecting an interrupt, except that we only
55 * consider a single interrupt at a time. The irq_spi_cpu array 59 * consider a single interrupt at a time. The irq_spi_cpu array
56 * contains the target CPU for each SPI. 60 * contains the target CPU for each SPI.
@@ -60,75 +64,126 @@
60 * the 'line' again. This is achieved as such: 64 * the 'line' again. This is achieved as such:
61 * 65 *
62 * - When a level interrupt is moved onto a vcpu, the corresponding 66 * - When a level interrupt is moved onto a vcpu, the corresponding
63 * bit in irq_active is set. As long as this bit is set, the line 67 * bit in irq_queued is set. As long as this bit is set, the line
64 * will be ignored for further interrupts. The interrupt is injected 68 * will be ignored for further interrupts. The interrupt is injected
65 * into the vcpu with the GICH_LR_EOI bit set (generate a 69 * into the vcpu with the GICH_LR_EOI bit set (generate a
66 * maintenance interrupt on EOI). 70 * maintenance interrupt on EOI).
67 * - When the interrupt is EOIed, the maintenance interrupt fires, 71 * - When the interrupt is EOIed, the maintenance interrupt fires,
68 * and clears the corresponding bit in irq_active. This allow the 72 * and clears the corresponding bit in irq_queued. This allows the
69 * interrupt line to be sampled again. 73 * interrupt line to be sampled again.
74 * - Note that level-triggered interrupts can also be set to pending from
75 * writes to GICD_ISPENDRn, and lowering the external input line does not
76 * cause the interrupt to become inactive in such a situation.
77 * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
78 * inactive as long as the external input line is held high.
70 */ 79 */
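As a reading aid for the rules above, here is a naive per-IRQ version of the oracle (editor's sketch only, not part of the diff; the real compute_pending_for_cpu() operates on whole bitmaps rather than looping, and the function name here is made up):

static bool example_compute_pending_for_cpu(struct kvm_vcpu *vcpu)
{
	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
	int vcpu_id = vcpu->vcpu_id;
	int irq;

	for (irq = 0; irq < dist->nr_irqs; irq++) {
		/* Both private and shared interrupts must be pending ... */
		if (!vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu_id, irq))
			continue;
		/* ... and enabled ... */
		if (!vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu_id, irq))
			continue;
		/* ... and SPIs must additionally target this VCPU. */
		if (irq >= VGIC_NR_PRIVATE_IRQS &&
		    !vgic_bitmap_get_irq_val(&dist->irq_spi_target[vcpu_id],
					     vcpu_id, irq))
			continue;

		return true;
	}

	return false;
}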
71 80
72#define VGIC_ADDR_UNDEF (-1) 81#include "vgic.h"
73#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
74 82
75#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ 83static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
76#define IMPLEMENTER_ARM 0x43b 84static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
77#define GICC_ARCH_VERSION_V2 0x2 85static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
86static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
78 87
79/* Physical address of vgic virtual cpu interface */ 88static const struct vgic_ops *vgic_ops;
80static phys_addr_t vgic_vcpu_base; 89static const struct vgic_params *vgic;
81 90
82/* Virtual control interface base address */ 91static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
83static void __iomem *vgic_vctrl_base; 92{
93 vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
94}
84 95
85static struct device_node *vgic_node; 96static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
97{
98 return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
99}
86 100
87#define ACCESS_READ_VALUE (1 << 0) 101int kvm_vgic_map_resources(struct kvm *kvm)
88#define ACCESS_READ_RAZ (0 << 0) 102{
89#define ACCESS_READ_MASK(x) ((x) & (1 << 0)) 103 return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
90#define ACCESS_WRITE_IGNORED (0 << 1) 104}
91#define ACCESS_WRITE_SETBIT (1 << 1)
92#define ACCESS_WRITE_CLEARBIT (2 << 1)
93#define ACCESS_WRITE_VALUE (3 << 1)
94#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
95 105
96static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); 106/*
97static void vgic_update_state(struct kvm *kvm); 107 * struct vgic_bitmap contains a bitmap made of unsigned longs, but
98static void vgic_kick_vcpus(struct kvm *kvm); 108 * extracts u32s out of them.
99static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); 109 *
100static u32 vgic_nr_lr; 110 * This does not work on 64-bit BE systems, because the bitmap access
111 * will store two consecutive 32-bit words with the higher-addressed
112 * register's bits at the lower index and the lower-addressed register's
113 * bits at the higher index.
114 *
115 * Therefore, swizzle the register index when accessing the 32-bit word
116 * registers to access the right register's value.
117 */
118#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64
119#define REG_OFFSET_SWIZZLE 1
120#else
121#define REG_OFFSET_SWIZZLE 0
122#endif
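A short sketch of what the swizzle compensates for (editor's illustration, not part of the diff; example_irq_word() is a made-up helper): on a 64-bit big-endian host the bits for IRQs 0-31 of a bitmap land in the second, higher-addressed u32 of the first unsigned long, so the 32-bit word index has to be flipped to reach them.

static u32 *example_irq_word(unsigned long *bitmap, int word_index)
{
	/*
	 * On 64-bit BE the two u32 halves of each unsigned long are laid
	 * out in the opposite order from a plain u32 array, hence the XOR
	 * with REG_OFFSET_SWIZZLE (1 there, 0 everywhere else).
	 */
	return (u32 *)bitmap + (word_index ^ REG_OFFSET_SWIZZLE);
}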
123
124static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
125{
126 int nr_longs;
127
128 nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
129
130 b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
131 if (!b->private)
132 return -ENOMEM;
133
134 b->shared = b->private + nr_cpus;
135
136 return 0;
137}
138
139static void vgic_free_bitmap(struct vgic_bitmap *b)
140{
141 kfree(b->private);
142 b->private = NULL;
143 b->shared = NULL;
144}
101 145
102static unsigned int vgic_maint_irq; 146/*
147 * Call this function to convert a u64 value to an unsigned long * bitmask
148 * in a way that works on both 32-bit and 64-bit LE and BE platforms.
149 *
150 * Warning: Calling this function may modify *val.
151 */
152static unsigned long *u64_to_bitmask(u64 *val)
153{
154#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
155 *val = (*val >> 32) | (*val << 32);
156#endif
157 return (unsigned long *)val;
158}
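A typical use, sketched by the editor (not part of the diff; process_lr() is a made-up placeholder, the other names come from this file): walk the set bits of a 64-bit status value with the generic bitmap helpers on any host. Note the local copy, since u64_to_bitmask() may modify its argument.

static void example_walk_eisr(struct kvm_vcpu *vcpu)
{
	u64 eisr = vgic_ops->get_eisr(vcpu);	/* work on a local copy */
	int lr;

	for_each_set_bit(lr, u64_to_bitmask(&eisr), vgic->nr_lr)
		process_lr(vcpu, lr);		/* hypothetical per-LR work */
}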
103 159
104static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, 160u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
105 int cpuid, u32 offset)
106{ 161{
107 offset >>= 2; 162 offset >>= 2;
108 if (!offset) 163 if (!offset)
109 return x->percpu[cpuid].reg; 164 return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
110 else 165 else
111 return x->shared.reg + offset - 1; 166 return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
112} 167}
113 168
114static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, 169static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
115 int cpuid, int irq) 170 int cpuid, int irq)
116{ 171{
117 if (irq < VGIC_NR_PRIVATE_IRQS) 172 if (irq < VGIC_NR_PRIVATE_IRQS)
118 return test_bit(irq, x->percpu[cpuid].reg_ul); 173 return test_bit(irq, x->private + cpuid);
119 174
120 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); 175 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
121} 176}
122 177
123static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, 178void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
124 int irq, int val) 179 int irq, int val)
125{ 180{
126 unsigned long *reg; 181 unsigned long *reg;
127 182
128 if (irq < VGIC_NR_PRIVATE_IRQS) { 183 if (irq < VGIC_NR_PRIVATE_IRQS) {
129 reg = x->percpu[cpuid].reg_ul; 184 reg = x->private + cpuid;
130 } else { 185 } else {
131 reg = x->shared.reg_ul; 186 reg = x->shared;
132 irq -= VGIC_NR_PRIVATE_IRQS; 187 irq -= VGIC_NR_PRIVATE_IRQS;
133 } 188 }
134 189
@@ -140,24 +195,49 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
140 195
141static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) 196static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
142{ 197{
143 if (unlikely(cpuid >= VGIC_MAX_CPUS)) 198 return x->private + cpuid;
144 return NULL;
145 return x->percpu[cpuid].reg_ul;
146} 199}
147 200
148static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) 201unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
149{ 202{
150 return x->shared.reg_ul; 203 return x->shared;
151} 204}
152 205
153static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) 206static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
154{ 207{
155 offset >>= 2; 208 int size;
156 BUG_ON(offset > (VGIC_NR_IRQS / 4)); 209
157 if (offset < 8) 210 size = nr_cpus * VGIC_NR_PRIVATE_IRQS;
158 return x->percpu[cpuid] + offset; 211 size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
159 else 212
160 return x->shared + offset - 8; 213 x->private = kzalloc(size, GFP_KERNEL);
214 if (!x->private)
215 return -ENOMEM;
216
217 x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
218 return 0;
219}
220
221static void vgic_free_bytemap(struct vgic_bytemap *b)
222{
223 kfree(b->private);
224 b->private = NULL;
225 b->shared = NULL;
226}
227
228u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
229{
230 u32 *reg;
231
232 if (offset < VGIC_NR_PRIVATE_IRQS) {
233 reg = x->private;
234 offset += cpuid * VGIC_NR_PRIVATE_IRQS;
235 } else {
236 reg = x->shared;
237 offset -= VGIC_NR_PRIVATE_IRQS;
238 }
239
240 return reg + (offset / sizeof(u32));
161} 241}
162 242
163#define VGIC_CFG_LEVEL 0 243#define VGIC_CFG_LEVEL 0
@@ -179,6 +259,13 @@ static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
179 return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); 259 return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
180} 260}
181 261
262static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
263{
264 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
265
266 return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
267}
268
182static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) 269static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
183{ 270{
184 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 271 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -186,6 +273,20 @@ static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
186 return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); 273 return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
187} 274}
188 275
276static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
277{
278 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
279
280 vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
281}
282
283static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
284{
285 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
286
287 vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
288}
289
189static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) 290static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
190{ 291{
191 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 292 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -200,25 +301,60 @@ static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
200 vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); 301 vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
201} 302}
202 303
304static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
305{
306 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
307
308 return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
309}
310
311static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
312{
313 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
314
315 vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
316}
317
318static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
319{
320 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
321
322 vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
323}
324
325static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
326{
327 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
328
329 return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
330}
331
332static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
333{
334 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
335
336 vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
337}
338
203static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) 339static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
204{ 340{
205 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 341 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
206 342
207 return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); 343 return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
208} 344}
209 345
210static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq) 346void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
211{ 347{
212 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 348 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
213 349
214 vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); 350 vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
215} 351}
216 352
217static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) 353void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
218{ 354{
219 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 355 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
220 356
221 vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); 357 vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
222} 358}
223 359
224static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) 360static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
@@ -230,7 +366,7 @@ static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
230 vcpu->arch.vgic_cpu.pending_shared); 366 vcpu->arch.vgic_cpu.pending_shared);
231} 367}
232 368
233static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) 369void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
234{ 370{
235 if (irq < VGIC_NR_PRIVATE_IRQS) 371 if (irq < VGIC_NR_PRIVATE_IRQS)
236 clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); 372 clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
@@ -239,14 +375,9 @@ static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
239 vcpu->arch.vgic_cpu.pending_shared); 375 vcpu->arch.vgic_cpu.pending_shared);
240} 376}
241 377
242static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) 378static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
243{
244 return *((u32 *)mmio->data) & mask;
245}
246
247static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
248{ 379{
249 *((u32 *)mmio->data) = value & mask; 380 return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
250} 381}
251 382
252/** 383/**
@@ -260,8 +391,8 @@ static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
260 * modes defined for vgic register access 391 * modes defined for vgic register access
261 * (read,raz,write-ignored,setbit,clearbit,write) 392 * (read,raz,write-ignored,setbit,clearbit,write)
262 */ 393 */
263static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, 394void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
264 phys_addr_t offset, int mode) 395 phys_addr_t offset, int mode)
265{ 396{
266 int word_offset = (offset & 3) * 8; 397 int word_offset = (offset & 3) * 8;
267 u32 mask = (1UL << (mmio->len * 8)) - 1; 398 u32 mask = (1UL << (mmio->len * 8)) - 1;
@@ -310,197 +441,141 @@ static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
310 } 441 }
311} 442}
312 443
313static bool handle_mmio_misc(struct kvm_vcpu *vcpu, 444bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
314 struct kvm_exit_mmio *mmio, phys_addr_t offset) 445 phys_addr_t offset)
315{
316 u32 reg;
317 u32 word_offset = offset & 3;
318
319 switch (offset & ~3) {
320 case 0: /* GICD_CTLR */
321 reg = vcpu->kvm->arch.vgic.enabled;
322 vgic_reg_access(mmio, &reg, word_offset,
323 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
324 if (mmio->is_write) {
325 vcpu->kvm->arch.vgic.enabled = reg & 1;
326 vgic_update_state(vcpu->kvm);
327 return true;
328 }
329 break;
330
331 case 4: /* GICD_TYPER */
332 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
333 reg |= (VGIC_NR_IRQS >> 5) - 1;
334 vgic_reg_access(mmio, &reg, word_offset,
335 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
336 break;
337
338 case 8: /* GICD_IIDR */
339 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
340 vgic_reg_access(mmio, &reg, word_offset,
341 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
342 break;
343 }
344
345 return false;
346}
347
348static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu,
349 struct kvm_exit_mmio *mmio, phys_addr_t offset)
350{ 446{
351 vgic_reg_access(mmio, NULL, offset, 447 vgic_reg_access(mmio, NULL, offset,
352 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); 448 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
353 return false; 449 return false;
354} 450}
355 451
356static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, 452bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
357 struct kvm_exit_mmio *mmio, 453 phys_addr_t offset, int vcpu_id, int access)
358 phys_addr_t offset)
359{ 454{
360 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, 455 u32 *reg;
361 vcpu->vcpu_id, offset); 456 int mode = ACCESS_READ_VALUE | access;
362 vgic_reg_access(mmio, reg, offset, 457 struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
363 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
364 if (mmio->is_write) {
365 vgic_update_state(vcpu->kvm);
366 return true;
367 }
368
369 return false;
370}
371 458
372static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, 459 reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
373 struct kvm_exit_mmio *mmio, 460 vgic_reg_access(mmio, reg, offset, mode);
374 phys_addr_t offset)
375{
376 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled,
377 vcpu->vcpu_id, offset);
378 vgic_reg_access(mmio, reg, offset,
379 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
380 if (mmio->is_write) { 461 if (mmio->is_write) {
381 if (offset < 4) /* Force SGI enabled */ 462 if (access & ACCESS_WRITE_CLEARBIT) {
382 *reg |= 0xffff; 463 if (offset < 4) /* Force SGI enabled */
383 vgic_retire_disabled_irqs(vcpu); 464 *reg |= 0xffff;
384 vgic_update_state(vcpu->kvm); 465 vgic_retire_disabled_irqs(target_vcpu);
466 }
467 vgic_update_state(kvm);
385 return true; 468 return true;
386 } 469 }
387 470
388 return false; 471 return false;
389} 472}
390 473
391static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, 474bool vgic_handle_set_pending_reg(struct kvm *kvm,
392 struct kvm_exit_mmio *mmio, 475 struct kvm_exit_mmio *mmio,
393 phys_addr_t offset) 476 phys_addr_t offset, int vcpu_id)
394{ 477{
395 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, 478 u32 *reg, orig;
396 vcpu->vcpu_id, offset); 479 u32 level_mask;
397 vgic_reg_access(mmio, reg, offset, 480 int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
398 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); 481 struct vgic_dist *dist = &kvm->arch.vgic;
399 if (mmio->is_write) {
400 vgic_update_state(vcpu->kvm);
401 return true;
402 }
403 482
404 return false; 483 reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
405} 484 level_mask = (~(*reg));
485
486 /* Mark both level and edge triggered irqs as pending */
487 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
488 orig = *reg;
489 vgic_reg_access(mmio, reg, offset, mode);
406 490
407static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
408 struct kvm_exit_mmio *mmio,
409 phys_addr_t offset)
410{
411 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state,
412 vcpu->vcpu_id, offset);
413 vgic_reg_access(mmio, reg, offset,
414 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
415 if (mmio->is_write) { 491 if (mmio->is_write) {
416 vgic_update_state(vcpu->kvm); 492 /* Set the soft-pending flag only for level-triggered irqs */
493 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
494 vcpu_id, offset);
495 vgic_reg_access(mmio, reg, offset, mode);
496 *reg &= level_mask;
497
498 /* Ignore writes to SGIs */
499 if (offset < 2) {
500 *reg &= ~0xffff;
501 *reg |= orig & 0xffff;
502 }
503
504 vgic_update_state(kvm);
417 return true; 505 return true;
418 } 506 }
419 507
420 return false; 508 return false;
421} 509}
422 510
423static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, 511bool vgic_handle_clear_pending_reg(struct kvm *kvm,
424 struct kvm_exit_mmio *mmio, 512 struct kvm_exit_mmio *mmio,
425 phys_addr_t offset) 513 phys_addr_t offset, int vcpu_id)
426{
427 u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
428 vcpu->vcpu_id, offset);
429 vgic_reg_access(mmio, reg, offset,
430 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
431 return false;
432}
433
434#define GICD_ITARGETSR_SIZE 32
435#define GICD_CPUTARGETS_BITS 8
436#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
437static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
438{ 514{
515 u32 *level_active;
516 u32 *reg, orig;
517 int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
439 struct vgic_dist *dist = &kvm->arch.vgic; 518 struct vgic_dist *dist = &kvm->arch.vgic;
440 int i;
441 u32 val = 0;
442 519
443 irq -= VGIC_NR_PRIVATE_IRQS; 520 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
521 orig = *reg;
522 vgic_reg_access(mmio, reg, offset, mode);
523 if (mmio->is_write) {
524 /* Re-set level triggered level-active interrupts */
525 level_active = vgic_bitmap_get_reg(&dist->irq_level,
526 vcpu_id, offset);
527 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
528 *reg |= *level_active;
529
530 /* Ignore writes to SGIs */
531 if (offset < 2) {
532 *reg &= ~0xffff;
533 *reg |= orig & 0xffff;
534 }
444 535
445 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) 536 /* Clear soft-pending flags */
446 val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8); 537 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
538 vcpu_id, offset);
539 vgic_reg_access(mmio, reg, offset, mode);
447 540
448 return val; 541 vgic_update_state(kvm);
542 return true;
543 }
544 return false;
449} 545}
450 546
451static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) 547bool vgic_handle_set_active_reg(struct kvm *kvm,
548 struct kvm_exit_mmio *mmio,
549 phys_addr_t offset, int vcpu_id)
452{ 550{
551 u32 *reg;
453 struct vgic_dist *dist = &kvm->arch.vgic; 552 struct vgic_dist *dist = &kvm->arch.vgic;
454 struct kvm_vcpu *vcpu;
455 int i, c;
456 unsigned long *bmap;
457 u32 target;
458 553
459 irq -= VGIC_NR_PRIVATE_IRQS; 554 reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
555 vgic_reg_access(mmio, reg, offset,
556 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
460 557
461 /* 558 if (mmio->is_write) {
462 * Pick the LSB in each byte. This ensures we target exactly 559 vgic_update_state(kvm);
463 * one vcpu per IRQ. If the byte is null, assume we target 560 return true;
464 * CPU0.
465 */
466 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
467 int shift = i * GICD_CPUTARGETS_BITS;
468 target = ffs((val >> shift) & 0xffU);
469 target = target ? (target - 1) : 0;
470 dist->irq_spi_cpu[irq + i] = target;
471 kvm_for_each_vcpu(c, vcpu, kvm) {
472 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
473 if (c == target)
474 set_bit(irq + i, bmap);
475 else
476 clear_bit(irq + i, bmap);
477 }
478 } 561 }
562
563 return false;
479} 564}
480 565
481static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, 566bool vgic_handle_clear_active_reg(struct kvm *kvm,
482 struct kvm_exit_mmio *mmio, 567 struct kvm_exit_mmio *mmio,
483 phys_addr_t offset) 568 phys_addr_t offset, int vcpu_id)
484{ 569{
485 u32 reg; 570 u32 *reg;
486 571 struct vgic_dist *dist = &kvm->arch.vgic;
487 /* We treat the banked interrupts targets as read-only */
488 if (offset < 32) {
489 u32 roreg = 1 << vcpu->vcpu_id;
490 roreg |= roreg << 8;
491 roreg |= roreg << 16;
492 572
493 vgic_reg_access(mmio, &roreg, offset, 573 reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
494 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); 574 vgic_reg_access(mmio, reg, offset,
495 return false; 575 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
496 }
497 576
498 reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
499 vgic_reg_access(mmio, &reg, offset,
500 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
501 if (mmio->is_write) { 577 if (mmio->is_write) {
502 vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); 578 vgic_update_state(kvm);
503 vgic_update_state(vcpu->kvm);
504 return true; 579 return true;
505 } 580 }
506 581
@@ -542,14 +617,10 @@ static u16 vgic_cfg_compress(u32 val)
542 * LSB is always 0. As such, we only keep the upper bit, and use the 617 * LSB is always 0. As such, we only keep the upper bit, and use the
543 * two above functions to compress/expand the bits 618 * two above functions to compress/expand the bits
544 */ 619 */
545static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, 620bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
546 struct kvm_exit_mmio *mmio, phys_addr_t offset) 621 phys_addr_t offset)
547{ 622{
548 u32 val; 623 u32 val;
549 u32 *reg;
550
551 reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
552 vcpu->vcpu_id, offset >> 1);
553 624
554 if (offset & 4) 625 if (offset & 4)
555 val = *reg >> 16; 626 val = *reg >> 16;
@@ -578,57 +649,21 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
578 return false; 649 return false;
579} 650}
580 651
581static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
582 struct kvm_exit_mmio *mmio, phys_addr_t offset)
583{
584 u32 reg;
585 vgic_reg_access(mmio, &reg, offset,
586 ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
587 if (mmio->is_write) {
588 vgic_dispatch_sgi(vcpu, reg);
589 vgic_update_state(vcpu->kvm);
590 return true;
591 }
592
593 return false;
594}
595
596#define LR_CPUID(lr) \
597 (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT)
598#define LR_IRQID(lr) \
599 ((lr) & GICH_LR_VIRTUALID)
600
601static void vgic_retire_lr(int lr_nr, int irq, struct vgic_cpu *vgic_cpu)
602{
603 clear_bit(lr_nr, vgic_cpu->lr_used);
604 vgic_cpu->vgic_lr[lr_nr] &= ~GICH_LR_STATE;
605 vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
606}
607
608/** 652/**
609 * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor 653 * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
610 * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs 654 * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
611 * 655 *
612 * Move any pending IRQs that have already been assigned to LRs back to the 656 * Move any IRQs that have already been assigned to LRs back to the
613 * emulated distributor state so that the complete emulated state can be read 657 * emulated distributor state so that the complete emulated state can be read
614 * from the main emulation structures without investigating the LRs. 658 * from the main emulation structures without investigating the LRs.
615 *
616 * Note that IRQs in the active state in the LRs get their pending state moved
617 * to the distributor but the active state stays in the LRs, because we don't
618 * track the active state on the distributor side.
619 */ 659 */
620static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) 660void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
621{ 661{
622 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
623 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 662 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
624 int vcpu_id = vcpu->vcpu_id; 663 int i;
625 int i, irq, source_cpu;
626 u32 *lr;
627 664
628 for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { 665 for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) {
629 lr = &vgic_cpu->vgic_lr[i]; 666 struct vgic_lr lr = vgic_get_lr(vcpu, i);
630 irq = LR_IRQID(*lr);
631 source_cpu = LR_CPUID(*lr);
632 667
633 /* 668 /*
634 * There are three options for the state bits: 669 * There are three options for the state bits:
@@ -636,12 +671,22 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
636 * 01: pending 671 * 01: pending
637 * 10: active 672 * 10: active
638 * 11: pending and active 673 * 11: pending and active
639 *
640 * If the LR holds only an active interrupt (not pending) then
641 * just leave it alone.
642 */ 674 */
643 if ((*lr & GICH_LR_STATE) == GICH_LR_ACTIVE_BIT) 675 BUG_ON(!(lr.state & LR_STATE_MASK));
644 continue; 676
677 /* Reestablish SGI source for pending and active IRQs */
678 if (lr.irq < VGIC_NR_SGIS)
679 add_sgi_source(vcpu, lr.irq, lr.source);
680
681 /*
682 * If the LR holds an active (10) or a pending and active (11)
683 * interrupt then move the active state to the
684 * distributor tracking bit.
685 */
686 if (lr.state & LR_STATE_ACTIVE) {
687 vgic_irq_set_active(vcpu, lr.irq);
688 lr.state &= ~LR_STATE_ACTIVE;
689 }
645 690
646 /* 691 /*
647 * Reestablish the pending state on the distributor and the 692 * Reestablish the pending state on the distributor and the
@@ -649,293 +694,257 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
649 * is fine, then we are only setting a few bits that were 694 * is fine, then we are only setting a few bits that were
650 * already set. 695 * already set.
651 */ 696 */
652 vgic_dist_irq_set(vcpu, irq); 697 if (lr.state & LR_STATE_PENDING) {
653 if (irq < VGIC_NR_SGIS) 698 vgic_dist_irq_set_pending(vcpu, lr.irq);
654 dist->irq_sgi_sources[vcpu_id][irq] |= 1 << source_cpu; 699 lr.state &= ~LR_STATE_PENDING;
655 *lr &= ~GICH_LR_PENDING_BIT; 700 }
701
702 vgic_set_lr(vcpu, i, lr);
656 703
657 /* 704 /*
658 * If there's no state left on the LR (it could still be 705 * Mark the LR as free for other use.
659 * active), then the LR does not hold any useful info and can
660 * be marked as free for other use.
661 */ 706 */
662 if (!(*lr & GICH_LR_STATE)) 707 BUG_ON(lr.state & LR_STATE_MASK);
663 vgic_retire_lr(i, irq, vgic_cpu); 708 vgic_retire_lr(i, lr.irq, vcpu);
709 vgic_irq_clear_queued(vcpu, lr.irq);
664 710
665 /* Finally update the VGIC state. */ 711 /* Finally update the VGIC state. */
666 vgic_update_state(vcpu->kvm); 712 vgic_update_state(vcpu->kvm);
667 } 713 }
668} 714}
669 715
670/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */ 716const
671static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, 717struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
672 struct kvm_exit_mmio *mmio, 718 int len, gpa_t offset)
673 phys_addr_t offset)
674{ 719{
675 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 720 while (ranges->len) {
676 int sgi; 721 if (offset >= ranges->base &&
677 int min_sgi = (offset & ~0x3) * 4; 722 (offset + len) <= (ranges->base + ranges->len))
678 int max_sgi = min_sgi + 3; 723 return ranges;
679 int vcpu_id = vcpu->vcpu_id; 724 ranges++;
680 u32 reg = 0;
681
682 /* Copy source SGIs from distributor side */
683 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
684 int shift = 8 * (sgi - min_sgi);
685 reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift;
686 } 725 }
687 726
688 mmio_data_write(mmio, ~0, reg); 727 return NULL;
689 return false;
690} 728}
691 729
692static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, 730static bool vgic_validate_access(const struct vgic_dist *dist,
693 struct kvm_exit_mmio *mmio, 731 const struct vgic_io_range *range,
694 phys_addr_t offset, bool set) 732 unsigned long offset)
695{ 733{
696 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 734 int irq;
697 int sgi;
698 int min_sgi = (offset & ~0x3) * 4;
699 int max_sgi = min_sgi + 3;
700 int vcpu_id = vcpu->vcpu_id;
701 u32 reg;
702 bool updated = false;
703
704 reg = mmio_data_read(mmio, ~0);
705
706 /* Clear pending SGIs on the distributor */
707 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
708 u8 mask = reg >> (8 * (sgi - min_sgi));
709 if (set) {
710 if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask)
711 updated = true;
712 dist->irq_sgi_sources[vcpu_id][sgi] |= mask;
713 } else {
714 if (dist->irq_sgi_sources[vcpu_id][sgi] & mask)
715 updated = true;
716 dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask;
717 }
718 }
719
720 if (updated)
721 vgic_update_state(vcpu->kvm);
722 735
723 return updated; 736 if (!range->bits_per_irq)
724} 737 return true; /* Not an irq-based access */
725 738
726static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu, 739 irq = offset * 8 / range->bits_per_irq;
727 struct kvm_exit_mmio *mmio, 740 if (irq >= dist->nr_irqs)
728 phys_addr_t offset) 741 return false;
729{
730 if (!mmio->is_write)
731 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
732 else
733 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
734}
735 742
736static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, 743 return true;
737 struct kvm_exit_mmio *mmio,
738 phys_addr_t offset)
739{
740 if (!mmio->is_write)
741 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
742 else
743 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
744} 744}
745 745
746/* 746/*
747 * I would have liked to use the kvm_bus_io_*() API instead, but it 747 * Call the respective handler function for the given range.
748 * cannot cope with banked registers (only the VM pointer is passed 748 * We split up any 64 bit accesses into two consecutive 32 bit
749 * around, and we need the vcpu). One of these days, someone please 749 * handler calls and merge the result afterwards.
750 * fix it! 750 * We do this in a little endian fashion regardless of the host's
751 * or guest's endianness, because the GIC is always LE and the rest of
752 * the code (vgic_reg_access) also puts it in a LE fashion already.
753 * At this point we have already identified the handle function, so
754 * range points to that one entry and offset is relative to this.
751 */ 755 */
752struct mmio_range { 756static bool call_range_handler(struct kvm_vcpu *vcpu,
753 phys_addr_t base; 757 struct kvm_exit_mmio *mmio,
754 unsigned long len; 758 unsigned long offset,
755 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, 759 const struct vgic_io_range *range)
756 phys_addr_t offset); 760{
757}; 761 struct kvm_exit_mmio mmio32;
762 bool ret;
758 763
759static const struct mmio_range vgic_dist_ranges[] = { 764 if (likely(mmio->len <= 4))
760 { 765 return range->handle_mmio(vcpu, mmio, offset);
761 .base = GIC_DIST_CTRL,
762 .len = 12,
763 .handle_mmio = handle_mmio_misc,
764 },
765 {
766 .base = GIC_DIST_IGROUP,
767 .len = VGIC_NR_IRQS / 8,
768 .handle_mmio = handle_mmio_raz_wi,
769 },
770 {
771 .base = GIC_DIST_ENABLE_SET,
772 .len = VGIC_NR_IRQS / 8,
773 .handle_mmio = handle_mmio_set_enable_reg,
774 },
775 {
776 .base = GIC_DIST_ENABLE_CLEAR,
777 .len = VGIC_NR_IRQS / 8,
778 .handle_mmio = handle_mmio_clear_enable_reg,
779 },
780 {
781 .base = GIC_DIST_PENDING_SET,
782 .len = VGIC_NR_IRQS / 8,
783 .handle_mmio = handle_mmio_set_pending_reg,
784 },
785 {
786 .base = GIC_DIST_PENDING_CLEAR,
787 .len = VGIC_NR_IRQS / 8,
788 .handle_mmio = handle_mmio_clear_pending_reg,
789 },
790 {
791 .base = GIC_DIST_ACTIVE_SET,
792 .len = VGIC_NR_IRQS / 8,
793 .handle_mmio = handle_mmio_raz_wi,
794 },
795 {
796 .base = GIC_DIST_ACTIVE_CLEAR,
797 .len = VGIC_NR_IRQS / 8,
798 .handle_mmio = handle_mmio_raz_wi,
799 },
800 {
801 .base = GIC_DIST_PRI,
802 .len = VGIC_NR_IRQS,
803 .handle_mmio = handle_mmio_priority_reg,
804 },
805 {
806 .base = GIC_DIST_TARGET,
807 .len = VGIC_NR_IRQS,
808 .handle_mmio = handle_mmio_target_reg,
809 },
810 {
811 .base = GIC_DIST_CONFIG,
812 .len = VGIC_NR_IRQS / 4,
813 .handle_mmio = handle_mmio_cfg_reg,
814 },
815 {
816 .base = GIC_DIST_SOFTINT,
817 .len = 4,
818 .handle_mmio = handle_mmio_sgi_reg,
819 },
820 {
821 .base = GIC_DIST_SGI_PENDING_CLEAR,
822 .len = VGIC_NR_SGIS,
823 .handle_mmio = handle_mmio_sgi_clear,
824 },
825 {
826 .base = GIC_DIST_SGI_PENDING_SET,
827 .len = VGIC_NR_SGIS,
828 .handle_mmio = handle_mmio_sgi_set,
829 },
830 {}
831};
832 766
833static const 767 /*
834struct mmio_range *find_matching_range(const struct mmio_range *ranges, 768 * Any access bigger than 4 bytes (that we currently handle in KVM)
835 struct kvm_exit_mmio *mmio, 769 * is actually 8 bytes long, caused by a 64-bit access
836 phys_addr_t offset) 770 */
837{
838 const struct mmio_range *r = ranges;
839 771
840 while (r->len) { 772 mmio32.len = 4;
841 if (offset >= r->base && 773 mmio32.is_write = mmio->is_write;
842 (offset + mmio->len) <= (r->base + r->len)) 774 mmio32.private = mmio->private;
843 return r;
844 r++;
845 }
846 775
847 return NULL; 776 mmio32.phys_addr = mmio->phys_addr + 4;
777 mmio32.data = &((u32 *)mmio->data)[1];
778 ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
779
780 mmio32.phys_addr = mmio->phys_addr;
781 mmio32.data = &((u32 *)mmio->data)[0];
782 ret |= range->handle_mmio(vcpu, &mmio32, offset);
783
784 return ret;
848} 785}
849 786
850/** 787/**
851 * vgic_handle_mmio - handle an in-kernel MMIO access 788 * vgic_handle_mmio_access - handle an in-kernel MMIO access
789 * This is called by the read/write KVM IO device wrappers below.
852 * @vcpu: pointer to the vcpu performing the access 790 * @vcpu: pointer to the vcpu performing the access
853 * @run: pointer to the kvm_run structure 791 * @this: pointer to the KVM IO device in charge
854 * @mmio: pointer to the data describing the access 792 * @addr: guest physical address of the access
793 * @len: size of the access
794 * @val: pointer to the data region
795 * @is_write: read or write access
855 * 796 *
856 * returns true if the MMIO access has been performed in kernel space, 797 * returns true if the MMIO access could be performed
857 * and false if it needs to be emulated in user space.
858 */ 798 */
859bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, 799static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
860 struct kvm_exit_mmio *mmio) 800 struct kvm_io_device *this, gpa_t addr,
801 int len, void *val, bool is_write)
861{ 802{
862 const struct mmio_range *range;
863 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 803 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
864 unsigned long base = dist->vgic_dist_base; 804 struct vgic_io_device *iodev = container_of(this,
805 struct vgic_io_device, dev);
806 struct kvm_run *run = vcpu->run;
807 const struct vgic_io_range *range;
808 struct kvm_exit_mmio mmio;
865 bool updated_state; 809 bool updated_state;
866 unsigned long offset; 810 gpa_t offset;
867
868 if (!irqchip_in_kernel(vcpu->kvm) ||
869 mmio->phys_addr < base ||
870 (mmio->phys_addr + mmio->len) > (base + KVM_VGIC_V2_DIST_SIZE))
871 return false;
872 811
873 /* We don't support ldrd / strd or ldm / stm to the emulated vgic */ 812 offset = addr - iodev->addr;
874 if (mmio->len > 4) { 813 range = vgic_find_range(iodev->reg_ranges, len, offset);
875 kvm_inject_dabt(vcpu, mmio->phys_addr); 814 if (unlikely(!range || !range->handle_mmio)) {
876 return true; 815 pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
816 return -ENXIO;
877 } 817 }
878 818
879 offset = mmio->phys_addr - base; 819 mmio.phys_addr = addr;
880 range = find_matching_range(vgic_dist_ranges, mmio, offset); 820 mmio.len = len;
881 if (unlikely(!range || !range->handle_mmio)) { 821 mmio.is_write = is_write;
882 pr_warn("Unhandled access %d %08llx %d\n", 822 mmio.data = val;
883 mmio->is_write, mmio->phys_addr, mmio->len); 823 mmio.private = iodev->redist_vcpu;
884 return false; 824
825 spin_lock(&dist->lock);
826 offset -= range->base;
827 if (vgic_validate_access(dist, range, offset)) {
828 updated_state = call_range_handler(vcpu, &mmio, offset, range);
829 } else {
830 if (!is_write)
831 memset(val, 0, len);
832 updated_state = false;
885 } 833 }
834 spin_unlock(&dist->lock);
835 run->mmio.is_write = is_write;
836 run->mmio.len = len;
837 run->mmio.phys_addr = addr;
838 memcpy(run->mmio.data, val, len);
886 839
887 spin_lock(&vcpu->kvm->arch.vgic.lock);
888 offset = mmio->phys_addr - range->base - base;
889 updated_state = range->handle_mmio(vcpu, mmio, offset);
890 spin_unlock(&vcpu->kvm->arch.vgic.lock);
891 kvm_prepare_mmio(run, mmio);
892 kvm_handle_mmio_return(vcpu, run); 840 kvm_handle_mmio_return(vcpu, run);
893 841
894 if (updated_state) 842 if (updated_state)
895 vgic_kick_vcpus(vcpu->kvm); 843 vgic_kick_vcpus(vcpu->kvm);
896 844
897 return true; 845 return 0;
898} 846}
899 847
900static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) 848static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
849 struct kvm_io_device *this,
850 gpa_t addr, int len, void *val)
901{ 851{
902 struct kvm *kvm = vcpu->kvm; 852 return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
903 struct vgic_dist *dist = &kvm->arch.vgic; 853}
904 int nrcpus = atomic_read(&kvm->online_vcpus);
905 u8 target_cpus;
906 int sgi, mode, c, vcpu_id;
907 854
908 vcpu_id = vcpu->vcpu_id; 855static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
856 struct kvm_io_device *this,
857 gpa_t addr, int len, const void *val)
858{
859 return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
860 true);
861}
909 862
910 sgi = reg & 0xf; 863struct kvm_io_device_ops vgic_io_ops = {
911 target_cpus = (reg >> 16) & 0xff; 864 .read = vgic_handle_mmio_read,
912 mode = (reg >> 24) & 3; 865 .write = vgic_handle_mmio_write,
866};
913 867
914 switch (mode) { 868/**
915 case 0: 869 * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
916 if (!target_cpus) 870 * @kvm: The VM structure pointer
917 return; 871 * @base: The (guest) base address for the register frame
918 break; 872 * @len: Length of the register frame window
873 * @ranges: Describing the handler functions for each register
874 * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
875 * @iodev: Points to memory to be passed on to the handler
876 *
877 * @iodev stores the parameters of this function to be usable by the handler
 878 * and by the dispatcher function (since the KVM I/O bus framework lacks
879 * an opaque parameter). Initialization is done in this function, but the
880 * reference should be valid and unique for the whole VGIC lifetime.
881 * If the register frame is not mapped for a specific VCPU, pass -1 to
882 * @redist_vcpu_id.
883 */
884int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
885 const struct vgic_io_range *ranges,
886 int redist_vcpu_id,
887 struct vgic_io_device *iodev)
888{
889 struct kvm_vcpu *vcpu = NULL;
890 int ret;
919 891
920 case 1: 892 if (redist_vcpu_id >= 0)
921 target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff; 893 vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
922 break;
923 894
924 case 2: 895 iodev->addr = base;
925 target_cpus = 1 << vcpu_id; 896 iodev->len = len;
926 break; 897 iodev->reg_ranges = ranges;
927 } 898 iodev->redist_vcpu = vcpu;
928 899
929 kvm_for_each_vcpu(c, vcpu, kvm) { 900 kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
930 if (target_cpus & 1) {
931 /* Flag the SGI as pending */
932 vgic_dist_irq_set(vcpu, sgi);
933 dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id;
934 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
935 }
936 901
937 target_cpus >>= 1; 902 mutex_lock(&kvm->slots_lock);
938 } 903
904 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
905 &iodev->dev);
906 mutex_unlock(&kvm->slots_lock);
907
908 /* Mark the iodev as invalid if registration fails. */
909 if (ret)
910 iodev->dev.ops = NULL;
911
912 return ret;
913}
914
915static int vgic_nr_shared_irqs(struct vgic_dist *dist)
916{
917 return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
918}
919
920static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
921{
922 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
923 unsigned long *active, *enabled, *act_percpu, *act_shared;
924 unsigned long active_private, active_shared;
925 int nr_shared = vgic_nr_shared_irqs(dist);
926 int vcpu_id;
927
928 vcpu_id = vcpu->vcpu_id;
929 act_percpu = vcpu->arch.vgic_cpu.active_percpu;
930 act_shared = vcpu->arch.vgic_cpu.active_shared;
931
932 active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
933 enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
934 bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
935
936 active = vgic_bitmap_get_shared_map(&dist->irq_active);
937 enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
938 bitmap_and(act_shared, active, enabled, nr_shared);
939 bitmap_and(act_shared, act_shared,
940 vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
941 nr_shared);
942
943 active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
944 active_shared = find_first_bit(act_shared, nr_shared);
945
946 return (active_private < VGIC_NR_PRIVATE_IRQS ||
947 active_shared < nr_shared);
939} 948}
940 949
941static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) 950static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
@@ -943,54 +952,129 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
943 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 952 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
944 unsigned long *pending, *enabled, *pend_percpu, *pend_shared; 953 unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
945 unsigned long pending_private, pending_shared; 954 unsigned long pending_private, pending_shared;
955 int nr_shared = vgic_nr_shared_irqs(dist);
946 int vcpu_id; 956 int vcpu_id;
947 957
948 vcpu_id = vcpu->vcpu_id; 958 vcpu_id = vcpu->vcpu_id;
949 pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; 959 pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
950 pend_shared = vcpu->arch.vgic_cpu.pending_shared; 960 pend_shared = vcpu->arch.vgic_cpu.pending_shared;
951 961
952 pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); 962 pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
953 enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); 963 enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
954 bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); 964 bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
955 965
956 pending = vgic_bitmap_get_shared_map(&dist->irq_state); 966 pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
957 enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); 967 enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
958 bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); 968 bitmap_and(pend_shared, pending, enabled, nr_shared);
959 bitmap_and(pend_shared, pend_shared, 969 bitmap_and(pend_shared, pend_shared,
960 vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), 970 vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
961 VGIC_NR_SHARED_IRQS); 971 nr_shared);
962 972
963 pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); 973 pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
964 pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); 974 pending_shared = find_first_bit(pend_shared, nr_shared);
965 return (pending_private < VGIC_NR_PRIVATE_IRQS || 975 return (pending_private < VGIC_NR_PRIVATE_IRQS ||
966 pending_shared < VGIC_NR_SHARED_IRQS); 976 pending_shared < vgic_nr_shared_irqs(dist));
967} 977}
968 978
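The pending computation above reduces to bitmap ANDs: an interrupt is deliverable when it is pending, enabled and, for shared interrupts, targeted at this VCPU. A standalone sketch of that reduction using a single 64-bit word per bitmap (the kernel keeps separate per-CPU and shared bitmaps sized by dist->nr_irqs; the struct and names here are illustrative):

#include <stdint.h>
#include <stdbool.h>

#define NR_PRIVATE_IRQS 32   /* 16 SGIs + 16 PPIs */

struct cpu_irq_state {
    uint64_t pending;        /* per-IRQ pending bits */
    uint64_t enabled;        /* per-IRQ enable bits */
    uint64_t spi_target_me;  /* shared IRQs routed to this VCPU */
};

/* True when at least one private or shared interrupt can be delivered. */
static bool compute_pending(const struct cpu_irq_state *s)
{
    uint64_t private_mask = (1ULL << NR_PRIVATE_IRQS) - 1;
    uint64_t pend_private = s->pending & s->enabled & private_mask;
    uint64_t pend_shared  = s->pending & s->enabled &
                            s->spi_target_me & ~private_mask;

    return pend_private || pend_shared;
}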
969/* 979/*
970 * Update the interrupt state and determine which CPUs have pending 980 * Update the interrupt state and determine which CPUs have pending
971 * interrupts. Must be called with distributor lock held. 981 * or active interrupts. Must be called with distributor lock held.
972 */ 982 */
973static void vgic_update_state(struct kvm *kvm) 983void vgic_update_state(struct kvm *kvm)
974{ 984{
975 struct vgic_dist *dist = &kvm->arch.vgic; 985 struct vgic_dist *dist = &kvm->arch.vgic;
976 struct kvm_vcpu *vcpu; 986 struct kvm_vcpu *vcpu;
977 int c; 987 int c;
978 988
979 if (!dist->enabled) { 989 if (!dist->enabled) {
980 set_bit(0, &dist->irq_pending_on_cpu); 990 set_bit(0, dist->irq_pending_on_cpu);
981 return; 991 return;
982 } 992 }
983 993
984 kvm_for_each_vcpu(c, vcpu, kvm) { 994 kvm_for_each_vcpu(c, vcpu, kvm) {
985 if (compute_pending_for_cpu(vcpu)) { 995 if (compute_pending_for_cpu(vcpu))
986 pr_debug("CPU%d has pending interrupts\n", c); 996 set_bit(c, dist->irq_pending_on_cpu);
987 set_bit(c, &dist->irq_pending_on_cpu); 997
988 } 998 if (compute_active_for_cpu(vcpu))
999 set_bit(c, dist->irq_active_on_cpu);
1000 else
1001 clear_bit(c, dist->irq_active_on_cpu);
989 } 1002 }
990} 1003}
991 1004
992#define MK_LR_PEND(src, irq) \ 1005static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr)
993 (GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq)) 1006{
1007 return vgic_ops->get_lr(vcpu, lr);
1008}
1009
1010static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
1011 struct vgic_lr vlr)
1012{
1013 vgic_ops->set_lr(vcpu, lr, vlr);
1014}
1015
1016static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
1017 struct vgic_lr vlr)
1018{
1019 vgic_ops->sync_lr_elrsr(vcpu, lr, vlr);
1020}
1021
1022static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
1023{
1024 return vgic_ops->get_elrsr(vcpu);
1025}
1026
1027static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
1028{
1029 return vgic_ops->get_eisr(vcpu);
1030}
1031
1032static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
1033{
1034 vgic_ops->clear_eisr(vcpu);
1035}
1036
1037static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
1038{
1039 return vgic_ops->get_interrupt_status(vcpu);
1040}
1041
1042static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu)
1043{
1044 vgic_ops->enable_underflow(vcpu);
1045}
1046
1047static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
1048{
1049 vgic_ops->disable_underflow(vcpu);
1050}
1051
1052void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
1053{
1054 vgic_ops->get_vmcr(vcpu, vmcr);
1055}
1056
1057void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
1058{
1059 vgic_ops->set_vmcr(vcpu, vmcr);
1060}
1061
1062static inline void vgic_enable(struct kvm_vcpu *vcpu)
1063{
1064 vgic_ops->enable(vcpu);
1065}
1066
1067static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu)
1068{
1069 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1070 struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
1071
1072 vlr.state = 0;
1073 vgic_set_lr(vcpu, lr_nr, vlr);
1074 clear_bit(lr_nr, vgic_cpu->lr_used);
1075 vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
1076 vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
1077}
994 1078
995/* 1079/*
996 * An interrupt may have been disabled after being made pending on the 1080 * An interrupt may have been disabled after being made pending on the
@@ -1006,104 +1090,98 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
1006 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1090 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1007 int lr; 1091 int lr;
1008 1092
1009 for_each_set_bit(lr, vgic_cpu->lr_used, vgic_cpu->nr_lr) { 1093 for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) {
1010 int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; 1094 struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
1011 1095
1012 if (!vgic_irq_is_enabled(vcpu, irq)) { 1096 if (!vgic_irq_is_enabled(vcpu, vlr.irq)) {
1013 vgic_retire_lr(lr, irq, vgic_cpu); 1097 vgic_retire_lr(lr, vlr.irq, vcpu);
1014 if (vgic_irq_is_active(vcpu, irq)) 1098 if (vgic_irq_is_queued(vcpu, vlr.irq))
1015 vgic_irq_clear_active(vcpu, irq); 1099 vgic_irq_clear_queued(vcpu, vlr.irq);
1016 } 1100 }
1017 } 1101 }
1018} 1102}
1019 1103
1104static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
1105 int lr_nr, struct vgic_lr vlr)
1106{
1107 if (vgic_irq_is_active(vcpu, irq)) {
1108 vlr.state |= LR_STATE_ACTIVE;
1109 kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
1110 vgic_irq_clear_active(vcpu, irq);
1111 vgic_update_state(vcpu->kvm);
1112 } else if (vgic_dist_irq_is_pending(vcpu, irq)) {
1113 vlr.state |= LR_STATE_PENDING;
1114 kvm_debug("Set pending: 0x%x\n", vlr.state);
1115 }
1116
1117 if (!vgic_irq_is_edge(vcpu, irq))
1118 vlr.state |= LR_EOI_INT;
1119
1120 vgic_set_lr(vcpu, lr_nr, vlr);
1121 vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
1122}
1123
1020/* 1124/*
1021 * Queue an interrupt to a CPU virtual interface. Return true on success, 1125 * Queue an interrupt to a CPU virtual interface. Return true on success,
1022 * or false if it wasn't possible to queue it. 1126 * or false if it wasn't possible to queue it.
1127 * sgi_source must be zero for any non-SGI interrupts.
1023 */ 1128 */
1024static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) 1129bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
1025{ 1130{
1026 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1131 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1132 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1133 struct vgic_lr vlr;
1027 int lr; 1134 int lr;
1028 1135
1029 /* Sanitize the input... */ 1136 /* Sanitize the input... */
1030 BUG_ON(sgi_source_id & ~7); 1137 BUG_ON(sgi_source_id & ~7);
1031 BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); 1138 BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
1032 BUG_ON(irq >= VGIC_NR_IRQS); 1139 BUG_ON(irq >= dist->nr_irqs);
1033 1140
1034 kvm_debug("Queue IRQ%d\n", irq); 1141 kvm_debug("Queue IRQ%d\n", irq);
1035 1142
1036 lr = vgic_cpu->vgic_irq_lr_map[irq]; 1143 lr = vgic_cpu->vgic_irq_lr_map[irq];
1037 1144
1038 /* Do we have an active interrupt for the same CPUID? */ 1145 /* Do we have an active interrupt for the same CPUID? */
1039 if (lr != LR_EMPTY && 1146 if (lr != LR_EMPTY) {
1040 (LR_CPUID(vgic_cpu->vgic_lr[lr]) == sgi_source_id)) { 1147 vlr = vgic_get_lr(vcpu, lr);
1041 kvm_debug("LR%d piggyback for IRQ%d %x\n", 1148 if (vlr.source == sgi_source_id) {
1042 lr, irq, vgic_cpu->vgic_lr[lr]); 1149 kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
1043 BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); 1150 BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
1044 vgic_cpu->vgic_lr[lr] |= GICH_LR_PENDING_BIT; 1151 vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
1045 return true; 1152 return true;
1153 }
1046 } 1154 }
1047 1155
1048 /* Try to use another LR for this interrupt */ 1156 /* Try to use another LR for this interrupt */
1049 lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, 1157 lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used,
1050 vgic_cpu->nr_lr); 1158 vgic->nr_lr);
1051 if (lr >= vgic_cpu->nr_lr) 1159 if (lr >= vgic->nr_lr)
1052 return false; 1160 return false;
1053 1161
1054 kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); 1162 kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
1055 vgic_cpu->vgic_lr[lr] = MK_LR_PEND(sgi_source_id, irq);
1056 vgic_cpu->vgic_irq_lr_map[irq] = lr; 1163 vgic_cpu->vgic_irq_lr_map[irq] = lr;
1057 set_bit(lr, vgic_cpu->lr_used); 1164 set_bit(lr, vgic_cpu->lr_used);
1058 1165
1059 if (!vgic_irq_is_edge(vcpu, irq)) 1166 vlr.irq = irq;
1060 vgic_cpu->vgic_lr[lr] |= GICH_LR_EOI; 1167 vlr.source = sgi_source_id;
1168 vlr.state = 0;
1169 vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
1061 1170
1062 return true; 1171 return true;
1063} 1172}
1064 1173
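vgic_queue_irq() above either piggybacks on the list register already mapped to the interrupt or allocates the first free one, giving up on overflow. A compact standalone sketch of that allocation policy, with a 64-bit word standing in for the lr_used bitmap (names and sizes are illustrative; the real code also tracks SGI source IDs and LR state):

#include <stdint.h>

#define LR_EMPTY 0xff
#define MAX_IRQS 1024
#define MAX_LRS  64

struct lr_alloc {
    uint8_t  irq_to_lr[MAX_IRQS]; /* per-IRQ LR mapping, LR_EMPTY if none */
    uint64_t lr_used;             /* bit n set when LR n is in use */
    int      nr_lr;               /* number of LRs the hardware provides */
};

/* Return the LR to use for @irq, or -1 if every LR is busy (overflow). */
static int pick_lr(struct lr_alloc *a, int irq)
{
    int lr = a->irq_to_lr[irq];

    if (lr != LR_EMPTY)
        return lr;                        /* piggyback on the existing LR */

    for (lr = 0; lr < a->nr_lr && lr < MAX_LRS; lr++) {
        if (!(a->lr_used & (1ULL << lr))) {
            a->lr_used |= 1ULL << lr;     /* claim a fresh LR */
            a->irq_to_lr[irq] = lr;
            return lr;
        }
    }
    return -1;
}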
1065static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
1066{
1067 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1068 unsigned long sources;
1069 int vcpu_id = vcpu->vcpu_id;
1070 int c;
1071
1072 sources = dist->irq_sgi_sources[vcpu_id][irq];
1073
1074 for_each_set_bit(c, &sources, VGIC_MAX_CPUS) {
1075 if (vgic_queue_irq(vcpu, c, irq))
1076 clear_bit(c, &sources);
1077 }
1078
1079 dist->irq_sgi_sources[vcpu_id][irq] = sources;
1080
1081 /*
1082 * If the sources bitmap has been cleared it means that we
1083 * could queue all the SGIs onto link registers (see the
1084 * clear_bit above), and therefore we are done with them in
1085 * our emulated gic and can get rid of them.
1086 */
1087 if (!sources) {
1088 vgic_dist_irq_clear(vcpu, irq);
1089 vgic_cpu_irq_clear(vcpu, irq);
1090 return true;
1091 }
1092
1093 return false;
1094}
1095
1096static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) 1174static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
1097{ 1175{
1098 if (vgic_irq_is_active(vcpu, irq)) 1176 if (!vgic_can_sample_irq(vcpu, irq))
1099 return true; /* level interrupt, already queued */ 1177 return true; /* level interrupt, already queued */
1100 1178
1101 if (vgic_queue_irq(vcpu, 0, irq)) { 1179 if (vgic_queue_irq(vcpu, 0, irq)) {
1102 if (vgic_irq_is_edge(vcpu, irq)) { 1180 if (vgic_irq_is_edge(vcpu, irq)) {
1103 vgic_dist_irq_clear(vcpu, irq); 1181 vgic_dist_irq_clear_pending(vcpu, irq);
1104 vgic_cpu_irq_clear(vcpu, irq); 1182 vgic_cpu_irq_clear(vcpu, irq);
1105 } else { 1183 } else {
1106 vgic_irq_set_active(vcpu, irq); 1184 vgic_irq_set_queued(vcpu, irq);
1107 } 1185 }
1108 1186
1109 return true; 1187 return true;
@@ -1120,130 +1198,180 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
1120{ 1198{
1121 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1199 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1122 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1200 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1201 unsigned long *pa_percpu, *pa_shared;
1123 int i, vcpu_id; 1202 int i, vcpu_id;
1124 int overflow = 0; 1203 int overflow = 0;
1204 int nr_shared = vgic_nr_shared_irqs(dist);
1125 1205
1126 vcpu_id = vcpu->vcpu_id; 1206 vcpu_id = vcpu->vcpu_id;
1127 1207
1208 pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
1209 pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
1210
1211 bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
1212 VGIC_NR_PRIVATE_IRQS);
1213 bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
1214 nr_shared);
1128 /* 1215 /*
1129 * We may not have any pending interrupt, or the interrupts 1216 * We may not have any pending interrupt, or the interrupts
1130 * may have been serviced from another vcpu. In all cases, 1217 * may have been serviced from another vcpu. In all cases,
1131 * move along. 1218 * move along.
1132 */ 1219 */
1133 if (!kvm_vgic_vcpu_pending_irq(vcpu)) { 1220 if (!kvm_vgic_vcpu_pending_irq(vcpu) && !kvm_vgic_vcpu_active_irq(vcpu))
1134 pr_debug("CPU%d has no pending interrupt\n", vcpu_id);
1135 goto epilog; 1221 goto epilog;
1136 }
1137 1222
1138 /* SGIs */ 1223 /* SGIs */
1139 for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) { 1224 for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
1140 if (!vgic_queue_sgi(vcpu, i)) 1225 if (!queue_sgi(vcpu, i))
1141 overflow = 1; 1226 overflow = 1;
1142 } 1227 }
1143 1228
1144 /* PPIs */ 1229 /* PPIs */
1145 for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) { 1230 for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
1146 if (!vgic_queue_hwirq(vcpu, i)) 1231 if (!vgic_queue_hwirq(vcpu, i))
1147 overflow = 1; 1232 overflow = 1;
1148 } 1233 }
1149 1234
1150 /* SPIs */ 1235 /* SPIs */
1151 for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { 1236 for_each_set_bit(i, pa_shared, nr_shared) {
1152 if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) 1237 if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
1153 overflow = 1; 1238 overflow = 1;
1154 } 1239 }
1155 1240
1241
1242
1243
1156epilog: 1244epilog:
1157 if (overflow) { 1245 if (overflow) {
1158 vgic_cpu->vgic_hcr |= GICH_HCR_UIE; 1246 vgic_enable_underflow(vcpu);
1159 } else { 1247 } else {
1160 vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; 1248 vgic_disable_underflow(vcpu);
1161 /* 1249 /*
1162 * We're about to run this VCPU, and we've consumed 1250 * We're about to run this VCPU, and we've consumed
1163 * everything the distributor had in store for 1251 * everything the distributor had in store for
1164 * us. Claim we don't have anything pending. We'll 1252 * us. Claim we don't have anything pending. We'll
1165 * adjust that if needed while exiting. 1253 * adjust that if needed while exiting.
1166 */ 1254 */
1167 clear_bit(vcpu_id, &dist->irq_pending_on_cpu); 1255 clear_bit(vcpu_id, dist->irq_pending_on_cpu);
1168 } 1256 }
1169} 1257}
1170 1258
1171static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) 1259static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
1172{ 1260{
1173 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1261 u32 status = vgic_get_interrupt_status(vcpu);
1262 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1174 bool level_pending = false; 1263 bool level_pending = false;
1264 struct kvm *kvm = vcpu->kvm;
1175 1265
1176 kvm_debug("MISR = %08x\n", vgic_cpu->vgic_misr); 1266 kvm_debug("STATUS = %08x\n", status);
1177 1267
1178 if (vgic_cpu->vgic_misr & GICH_MISR_EOI) { 1268 if (status & INT_STATUS_EOI) {
1179 /* 1269 /*
1180 * Some level interrupts have been EOIed. Clear their 1270 * Some level interrupts have been EOIed. Clear their
1181 * active bit. 1271 * active bit.
1182 */ 1272 */
1183 int lr, irq; 1273 u64 eisr = vgic_get_eisr(vcpu);
1274 unsigned long *eisr_ptr = u64_to_bitmask(&eisr);
1275 int lr;
1276
1277 for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
1278 struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
1279 WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
1280
1281 spin_lock(&dist->lock);
1282 vgic_irq_clear_queued(vcpu, vlr.irq);
1283 WARN_ON(vlr.state & LR_STATE_MASK);
1284 vlr.state = 0;
1285 vgic_set_lr(vcpu, lr, vlr);
1184 1286
1185 for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_eisr, 1287 /*
1186 vgic_cpu->nr_lr) { 1288 * If the IRQ was EOIed it was also ACKed and we
1187 irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; 1289 * therefore assume we can clear the soft pending
 1290 * state (should it have been set) for this interrupt.
1291 *
1292 * Note: if the IRQ soft pending state was set after
1293 * the IRQ was acked, it actually shouldn't be
1294 * cleared, but we have no way of knowing that unless
1295 * we start trapping ACKs when the soft-pending state
1296 * is set.
1297 */
1298 vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
1299
1300 /*
1301 * kvm_notify_acked_irq calls kvm_set_irq()
1302 * to reset the IRQ level. Need to release the
1303 * lock for kvm_set_irq to grab it.
1304 */
1305 spin_unlock(&dist->lock);
1188 1306
1189 vgic_irq_clear_active(vcpu, irq); 1307 kvm_notify_acked_irq(kvm, 0,
1190 vgic_cpu->vgic_lr[lr] &= ~GICH_LR_EOI; 1308 vlr.irq - VGIC_NR_PRIVATE_IRQS);
1309 spin_lock(&dist->lock);
1191 1310
1192 /* Any additional pending interrupt? */ 1311 /* Any additional pending interrupt? */
1193 if (vgic_dist_irq_is_pending(vcpu, irq)) { 1312 if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
1194 vgic_cpu_irq_set(vcpu, irq); 1313 vgic_cpu_irq_set(vcpu, vlr.irq);
1195 level_pending = true; 1314 level_pending = true;
1196 } else { 1315 } else {
1197 vgic_cpu_irq_clear(vcpu, irq); 1316 vgic_dist_irq_clear_pending(vcpu, vlr.irq);
1317 vgic_cpu_irq_clear(vcpu, vlr.irq);
1198 } 1318 }
1199 1319
1320 spin_unlock(&dist->lock);
1321
1200 /* 1322 /*
1201 * Despite being EOIed, the LR may not have 1323 * Despite being EOIed, the LR may not have
1202 * been marked as empty. 1324 * been marked as empty.
1203 */ 1325 */
1204 set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr); 1326 vgic_sync_lr_elrsr(vcpu, lr, vlr);
1205 vgic_cpu->vgic_lr[lr] &= ~GICH_LR_ACTIVE_BIT;
1206 } 1327 }
1207 } 1328 }
1208 1329
1209 if (vgic_cpu->vgic_misr & GICH_MISR_U) 1330 if (status & INT_STATUS_UNDERFLOW)
1210 vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; 1331 vgic_disable_underflow(vcpu);
1332
1333 /*
1334 * In the next iterations of the vcpu loop, if we sync the vgic state
1335 * after flushing it, but before entering the guest (this happens for
1336 * pending signals and vmid rollovers), then make sure we don't pick
1337 * up any old maintenance interrupts here.
1338 */
1339 vgic_clear_eisr(vcpu);
1211 1340
1212 return level_pending; 1341 return level_pending;
1213} 1342}
1214 1343
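The maintenance path above now reads EISR/ELRSR as a u64 and walks it with u64_to_bitmask()/for_each_set_bit(). The same scan over a 64-bit status word can be sketched standalone as follows (assumes GCC/Clang builtins; purely illustrative):

#include <stdint.h>
#include <stdio.h>

/* Visit every set bit of a 64-bit status word (an EISR-like mask),
 * lowest bit first. */
static void for_each_set_lr(uint64_t status, int nr_lr)
{
    while (status) {
        int lr = __builtin_ctzll(status);  /* index of the lowest set bit */

        if (lr >= nr_lr)
            break;
        printf("LR%d was EOIed\n", lr);
        status &= status - 1;              /* clear that bit and continue */
    }
}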
1215/* 1344/* Sync back the VGIC state after a guest run */
1216 * Sync back the VGIC state after a guest run. The distributor lock is
1217 * needed so we don't get preempted in the middle of the state processing.
1218 */
1219static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) 1345static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1220{ 1346{
1221 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1347 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1222 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1348 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1349 u64 elrsr;
1350 unsigned long *elrsr_ptr;
1223 int lr, pending; 1351 int lr, pending;
1224 bool level_pending; 1352 bool level_pending;
1225 1353
1226 level_pending = vgic_process_maintenance(vcpu); 1354 level_pending = vgic_process_maintenance(vcpu);
1355 elrsr = vgic_get_elrsr(vcpu);
1356 elrsr_ptr = u64_to_bitmask(&elrsr);
1227 1357
1228 /* Clear mappings for empty LRs */ 1358 /* Clear mappings for empty LRs */
1229 for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr, 1359 for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
1230 vgic_cpu->nr_lr) { 1360 struct vgic_lr vlr;
1231 int irq;
1232 1361
1233 if (!test_and_clear_bit(lr, vgic_cpu->lr_used)) 1362 if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
1234 continue; 1363 continue;
1235 1364
1236 irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; 1365 vlr = vgic_get_lr(vcpu, lr);
1237 1366
1238 BUG_ON(irq >= VGIC_NR_IRQS); 1367 BUG_ON(vlr.irq >= dist->nr_irqs);
1239 vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; 1368 vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
1240 } 1369 }
1241 1370
1242 /* Check if we still have something up our sleeve... */ 1371 /* Check if we still have something up our sleeve... */
1243 pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elrsr, 1372 pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
1244 vgic_cpu->nr_lr); 1373 if (level_pending || pending < vgic->nr_lr)
1245 if (level_pending || pending < vgic_cpu->nr_lr) 1374 set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
1246 set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu);
1247} 1375}
1248 1376
1249void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) 1377void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
@@ -1260,14 +1388,10 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
1260 1388
1261void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) 1389void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1262{ 1390{
1263 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1264
1265 if (!irqchip_in_kernel(vcpu->kvm)) 1391 if (!irqchip_in_kernel(vcpu->kvm))
1266 return; 1392 return;
1267 1393
1268 spin_lock(&dist->lock);
1269 __kvm_vgic_sync_hwstate(vcpu); 1394 __kvm_vgic_sync_hwstate(vcpu);
1270 spin_unlock(&dist->lock);
1271} 1395}
1272 1396
1273int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) 1397int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
@@ -1277,10 +1401,21 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
1277 if (!irqchip_in_kernel(vcpu->kvm)) 1401 if (!irqchip_in_kernel(vcpu->kvm))
1278 return 0; 1402 return 0;
1279 1403
1280 return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); 1404 return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
1281} 1405}
1282 1406
1283static void vgic_kick_vcpus(struct kvm *kvm) 1407int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu)
1408{
1409 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1410
1411 if (!irqchip_in_kernel(vcpu->kvm))
1412 return 0;
1413
1414 return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
1415}
1416
1417
1418void vgic_kick_vcpus(struct kvm *kvm)
1284{ 1419{
1285 struct kvm_vcpu *vcpu; 1420 struct kvm_vcpu *vcpu;
1286 int c; 1421 int c;
@@ -1297,34 +1432,36 @@ static void vgic_kick_vcpus(struct kvm *kvm)
1297 1432
1298static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) 1433static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
1299{ 1434{
1300 int is_edge = vgic_irq_is_edge(vcpu, irq); 1435 int edge_triggered = vgic_irq_is_edge(vcpu, irq);
1301 int state = vgic_dist_irq_is_pending(vcpu, irq);
1302 1436
1303 /* 1437 /*
1304 * Only inject an interrupt if: 1438 * Only inject an interrupt if:
1305 * - edge triggered and we have a rising edge 1439 * - edge triggered and we have a rising edge
1306 * - level triggered and we change level 1440 * - level triggered and we change level
1307 */ 1441 */
1308 if (is_edge) 1442 if (edge_triggered) {
1443 int state = vgic_dist_irq_is_pending(vcpu, irq);
1309 return level > state; 1444 return level > state;
1310 else 1445 } else {
1446 int state = vgic_dist_irq_get_level(vcpu, irq);
1311 return level != state; 1447 return level != state;
1448 }
1312} 1449}
1313 1450
1314static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, 1451static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
1315 unsigned int irq_num, bool level) 1452 unsigned int irq_num, bool level)
1316{ 1453{
1317 struct vgic_dist *dist = &kvm->arch.vgic; 1454 struct vgic_dist *dist = &kvm->arch.vgic;
1318 struct kvm_vcpu *vcpu; 1455 struct kvm_vcpu *vcpu;
1319 int is_edge, is_level; 1456 int edge_triggered, level_triggered;
1320 int enabled; 1457 int enabled;
1321 bool ret = true; 1458 bool ret = true, can_inject = true;
1322 1459
1323 spin_lock(&dist->lock); 1460 spin_lock(&dist->lock);
1324 1461
1325 vcpu = kvm_get_vcpu(kvm, cpuid); 1462 vcpu = kvm_get_vcpu(kvm, cpuid);
1326 is_edge = vgic_irq_is_edge(vcpu, irq_num); 1463 edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
1327 is_level = !is_edge; 1464 level_triggered = !edge_triggered;
1328 1465
1329 if (!vgic_validate_injection(vcpu, irq_num, level)) { 1466 if (!vgic_validate_injection(vcpu, irq_num, level)) {
1330 ret = false; 1467 ret = false;
@@ -1333,24 +1470,39 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid,
1333 1470
1334 if (irq_num >= VGIC_NR_PRIVATE_IRQS) { 1471 if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
1335 cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; 1472 cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
1473 if (cpuid == VCPU_NOT_ALLOCATED) {
1474 /* Pretend we use CPU0, and prevent injection */
1475 cpuid = 0;
1476 can_inject = false;
1477 }
1336 vcpu = kvm_get_vcpu(kvm, cpuid); 1478 vcpu = kvm_get_vcpu(kvm, cpuid);
1337 } 1479 }
1338 1480
1339 kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); 1481 kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
1340 1482
1341 if (level) 1483 if (level) {
1342 vgic_dist_irq_set(vcpu, irq_num); 1484 if (level_triggered)
1343 else 1485 vgic_dist_irq_set_level(vcpu, irq_num);
1344 vgic_dist_irq_clear(vcpu, irq_num); 1486 vgic_dist_irq_set_pending(vcpu, irq_num);
1487 } else {
1488 if (level_triggered) {
1489 vgic_dist_irq_clear_level(vcpu, irq_num);
1490 if (!vgic_dist_irq_soft_pend(vcpu, irq_num))
1491 vgic_dist_irq_clear_pending(vcpu, irq_num);
1492 }
1493
1494 ret = false;
1495 goto out;
1496 }
1345 1497
1346 enabled = vgic_irq_is_enabled(vcpu, irq_num); 1498 enabled = vgic_irq_is_enabled(vcpu, irq_num);
1347 1499
1348 if (!enabled) { 1500 if (!enabled || !can_inject) {
1349 ret = false; 1501 ret = false;
1350 goto out; 1502 goto out;
1351 } 1503 }
1352 1504
1353 if (is_level && vgic_irq_is_active(vcpu, irq_num)) { 1505 if (!vgic_can_sample_irq(vcpu, irq_num)) {
1354 /* 1506 /*
1355 * Level interrupt in progress, will be picked up 1507 * Level interrupt in progress, will be picked up
1356 * when EOIed. 1508 * when EOIed.
@@ -1361,13 +1513,13 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid,
1361 1513
1362 if (level) { 1514 if (level) {
1363 vgic_cpu_irq_set(vcpu, irq_num); 1515 vgic_cpu_irq_set(vcpu, irq_num);
1364 set_bit(cpuid, &dist->irq_pending_on_cpu); 1516 set_bit(cpuid, dist->irq_pending_on_cpu);
1365 } 1517 }
1366 1518
1367out: 1519out:
1368 spin_unlock(&dist->lock); 1520 spin_unlock(&dist->lock);
1369 1521
1370 return ret; 1522 return ret ? cpuid : -EINVAL;
1371} 1523}
1372 1524
1373/** 1525/**
@@ -1387,10 +1539,36 @@ out:
1387int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, 1539int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
1388 bool level) 1540 bool level)
1389{ 1541{
1390 if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) 1542 int ret = 0;
1391 vgic_kick_vcpus(kvm); 1543 int vcpu_id;
1392 1544
1393 return 0; 1545 if (unlikely(!vgic_initialized(kvm))) {
1546 /*
1547 * We only provide the automatic initialization of the VGIC
1548 * for the legacy case of a GICv2. Any other type must
 1549 * be explicitly initialized once set up with the respective
1550 * KVM device call.
1551 */
1552 if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
1553 ret = -EBUSY;
1554 goto out;
1555 }
1556 mutex_lock(&kvm->lock);
1557 ret = vgic_init(kvm);
1558 mutex_unlock(&kvm->lock);
1559
1560 if (ret)
1561 goto out;
1562 }
1563
1564 vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
1565 if (vcpu_id >= 0) {
1566 /* kick the specified vcpu */
1567 kvm_vcpu_kick(kvm_get_vcpu(kvm, vcpu_id));
1568 }
1569
1570out:
1571 return ret;
1394} 1572}
1395 1573
1396static irqreturn_t vgic_maintenance_handler(int irq, void *data) 1574static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1404,227 +1582,239 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
1404 return IRQ_HANDLED; 1582 return IRQ_HANDLED;
1405} 1583}
1406 1584
1407/** 1585void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
1408 * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state
1409 * @vcpu: pointer to the vcpu struct
1410 *
1411 * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to
1412 * this vcpu and enable the VGIC for this VCPU
1413 */
1414int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
1415{ 1586{
1416 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1587 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1417 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1418 int i;
1419 1588
1420 if (vcpu->vcpu_id >= VGIC_MAX_CPUS) 1589 kfree(vgic_cpu->pending_shared);
1421 return -EBUSY; 1590 kfree(vgic_cpu->active_shared);
1591 kfree(vgic_cpu->pend_act_shared);
1592 kfree(vgic_cpu->vgic_irq_lr_map);
1593 vgic_cpu->pending_shared = NULL;
1594 vgic_cpu->active_shared = NULL;
1595 vgic_cpu->pend_act_shared = NULL;
1596 vgic_cpu->vgic_irq_lr_map = NULL;
1597}
1598
1599static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
1600{
1601 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1422 1602
1423 for (i = 0; i < VGIC_NR_IRQS; i++) { 1603 int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8;
1424 if (i < VGIC_NR_PPIS) 1604 vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
1425 vgic_bitmap_set_irq_val(&dist->irq_enabled, 1605 vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
1426 vcpu->vcpu_id, i, 1); 1606 vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
1427 if (i < VGIC_NR_PRIVATE_IRQS) 1607 vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);
1428 vgic_bitmap_set_irq_val(&dist->irq_cfg,
1429 vcpu->vcpu_id, i, VGIC_CFG_EDGE);
1430 1608
1431 vgic_cpu->vgic_irq_lr_map[i] = LR_EMPTY; 1609 if (!vgic_cpu->pending_shared
1610 || !vgic_cpu->active_shared
1611 || !vgic_cpu->pend_act_shared
1612 || !vgic_cpu->vgic_irq_lr_map) {
1613 kvm_vgic_vcpu_destroy(vcpu);
1614 return -ENOMEM;
1432 } 1615 }
1433 1616
1617 memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs);
1618
1434 /* 1619 /*
1435 * By forcing VMCR to zero, the GIC will restore the binary 1620 * Store the number of LRs per vcpu, so we don't have to go
1436 * points to their reset values. Anything else resets to zero 1621 * all the way to the distributor structure to find out. Only
1437 * anyway. 1622 * assembly code should use this one.
1438 */ 1623 */
1439 vgic_cpu->vgic_vmcr = 0; 1624 vgic_cpu->nr_lr = vgic->nr_lr;
1440
1441 vgic_cpu->nr_lr = vgic_nr_lr;
1442 vgic_cpu->vgic_hcr = GICH_HCR_EN; /* Get the show on the road... */
1443 1625
1444 return 0; 1626 return 0;
1445} 1627}
1446 1628
1447static void vgic_init_maintenance_interrupt(void *info) 1629/**
1630 * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
1631 *
 1632 * The host's GIC naturally limits the maximum number of VCPUs a guest
1633 * can use.
1634 */
1635int kvm_vgic_get_max_vcpus(void)
1448{ 1636{
1449 enable_percpu_irq(vgic_maint_irq, 0); 1637 return vgic->max_gic_vcpus;
1450} 1638}
1451 1639
1452static int vgic_cpu_notify(struct notifier_block *self, 1640void kvm_vgic_destroy(struct kvm *kvm)
1453 unsigned long action, void *cpu)
1454{ 1641{
1455 switch (action) { 1642 struct vgic_dist *dist = &kvm->arch.vgic;
1456 case CPU_STARTING: 1643 struct kvm_vcpu *vcpu;
1457 case CPU_STARTING_FROZEN: 1644 int i;
1458 vgic_init_maintenance_interrupt(NULL);
1459 break;
1460 case CPU_DYING:
1461 case CPU_DYING_FROZEN:
1462 disable_percpu_irq(vgic_maint_irq);
1463 break;
1464 }
1465 1645
1466 return NOTIFY_OK; 1646 kvm_for_each_vcpu(i, vcpu, kvm)
1647 kvm_vgic_vcpu_destroy(vcpu);
1648
1649 vgic_free_bitmap(&dist->irq_enabled);
1650 vgic_free_bitmap(&dist->irq_level);
1651 vgic_free_bitmap(&dist->irq_pending);
1652 vgic_free_bitmap(&dist->irq_soft_pend);
1653 vgic_free_bitmap(&dist->irq_queued);
1654 vgic_free_bitmap(&dist->irq_cfg);
1655 vgic_free_bytemap(&dist->irq_priority);
1656 if (dist->irq_spi_target) {
1657 for (i = 0; i < dist->nr_cpus; i++)
1658 vgic_free_bitmap(&dist->irq_spi_target[i]);
1659 }
1660 kfree(dist->irq_sgi_sources);
1661 kfree(dist->irq_spi_cpu);
1662 kfree(dist->irq_spi_mpidr);
1663 kfree(dist->irq_spi_target);
1664 kfree(dist->irq_pending_on_cpu);
1665 kfree(dist->irq_active_on_cpu);
1666 dist->irq_sgi_sources = NULL;
1667 dist->irq_spi_cpu = NULL;
1668 dist->irq_spi_target = NULL;
1669 dist->irq_pending_on_cpu = NULL;
1670 dist->irq_active_on_cpu = NULL;
1671 dist->nr_cpus = 0;
1467} 1672}
1468 1673
1469static struct notifier_block vgic_cpu_nb = { 1674/*
1470 .notifier_call = vgic_cpu_notify, 1675 * Allocate and initialize the various data structures. Must be called
1471}; 1676 * with kvm->lock held!
1472 1677 */
1473int kvm_vgic_hyp_init(void) 1678int vgic_init(struct kvm *kvm)
1474{ 1679{
1475 int ret; 1680 struct vgic_dist *dist = &kvm->arch.vgic;
1476 struct resource vctrl_res; 1681 struct kvm_vcpu *vcpu;
1477 struct resource vcpu_res; 1682 int nr_cpus, nr_irqs;
1683 int ret, i, vcpu_id;
1478 1684
1479 vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); 1685 if (vgic_initialized(kvm))
1480 if (!vgic_node) { 1686 return 0;
1481 kvm_err("error: no compatible vgic node in DT\n"); 1687
1688 nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
1689 if (!nr_cpus) /* No vcpus? Can't be good... */
1482 return -ENODEV; 1690 return -ENODEV;
1483 }
1484 1691
1485 vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); 1692 /*
1486 if (!vgic_maint_irq) { 1693 * If nobody configured the number of interrupts, fall back
1487 kvm_err("error getting vgic maintenance irq from DT\n"); 1694 * to the legacy default.
1488 ret = -ENXIO; 1695 */
1489 goto out; 1696 if (!dist->nr_irqs)
1490 } 1697 dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
1491 1698
1492 ret = request_percpu_irq(vgic_maint_irq, vgic_maintenance_handler, 1699 nr_irqs = dist->nr_irqs;
1493 "vgic", kvm_get_running_vcpus());
1494 if (ret) {
1495 kvm_err("Cannot register interrupt %d\n", vgic_maint_irq);
1496 goto out;
1497 }
1498 1700
1499 ret = __register_cpu_notifier(&vgic_cpu_nb); 1701 ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
1500 if (ret) { 1702 ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
1501 kvm_err("Cannot register vgic CPU notifier\n"); 1703 ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
1502 goto out_free_irq; 1704 ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
1503 } 1705 ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
1706 ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
1707 ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
1708 ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
1504 1709
1505 ret = of_address_to_resource(vgic_node, 2, &vctrl_res); 1710 if (ret)
1506 if (ret) { 1711 goto out;
1507 kvm_err("Cannot obtain VCTRL resource\n");
1508 goto out_free_irq;
1509 }
1510 1712
1511 vgic_vctrl_base = of_iomap(vgic_node, 2); 1713 dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
1512 if (!vgic_vctrl_base) { 1714 dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
1513 kvm_err("Cannot ioremap VCTRL\n"); 1715 dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
1716 GFP_KERNEL);
1717 dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
1718 GFP_KERNEL);
1719 dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
1720 GFP_KERNEL);
1721 if (!dist->irq_sgi_sources ||
1722 !dist->irq_spi_cpu ||
1723 !dist->irq_spi_target ||
1724 !dist->irq_pending_on_cpu ||
1725 !dist->irq_active_on_cpu) {
1514 ret = -ENOMEM; 1726 ret = -ENOMEM;
1515 goto out_free_irq; 1727 goto out;
1516 }
1517
1518 vgic_nr_lr = readl_relaxed(vgic_vctrl_base + GICH_VTR);
1519 vgic_nr_lr = (vgic_nr_lr & 0x3f) + 1;
1520
1521 ret = create_hyp_io_mappings(vgic_vctrl_base,
1522 vgic_vctrl_base + resource_size(&vctrl_res),
1523 vctrl_res.start);
1524 if (ret) {
1525 kvm_err("Cannot map VCTRL into hyp\n");
1526 goto out_unmap;
1527 } 1728 }
1528 1729
1529 if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { 1730 for (i = 0; i < nr_cpus; i++)
1530 kvm_err("Cannot obtain VCPU resource\n"); 1731 ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
1531 ret = -ENXIO; 1732 nr_cpus, nr_irqs);
1532 goto out_unmap;
1533 }
1534 1733
1535 if (!PAGE_ALIGNED(vcpu_res.start)) { 1734 if (ret)
1536 kvm_err("GICV physical address 0x%llx not page aligned\n", 1735 goto out;
1537 (unsigned long long)vcpu_res.start);
1538 ret = -ENXIO;
1539 goto out_unmap;
1540 }
1541 1736
1542 if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { 1737 ret = kvm->arch.vgic.vm_ops.init_model(kvm);
1543 kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", 1738 if (ret)
1544 (unsigned long long)resource_size(&vcpu_res), 1739 goto out;
1545 PAGE_SIZE);
1546 ret = -ENXIO;
1547 goto out_unmap;
1548 }
1549 1740
1550 vgic_vcpu_base = vcpu_res.start; 1741 kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
1742 ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
1743 if (ret) {
1744 kvm_err("VGIC: Failed to allocate vcpu memory\n");
1745 break;
1746 }
1551 1747
1552 kvm_info("%s@%llx IRQ%d\n", vgic_node->name, 1748 for (i = 0; i < dist->nr_irqs; i++) {
1553 vctrl_res.start, vgic_maint_irq); 1749 if (i < VGIC_NR_PPIS)
1554 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); 1750 vgic_bitmap_set_irq_val(&dist->irq_enabled,
1751 vcpu->vcpu_id, i, 1);
1752 if (i < VGIC_NR_PRIVATE_IRQS)
1753 vgic_bitmap_set_irq_val(&dist->irq_cfg,
1754 vcpu->vcpu_id, i,
1755 VGIC_CFG_EDGE);
1756 }
1555 1757
1556 goto out; 1758 vgic_enable(vcpu);
1759 }
1557 1760
1558out_unmap:
1559 iounmap(vgic_vctrl_base);
1560out_free_irq:
1561 free_percpu_irq(vgic_maint_irq, kvm_get_running_vcpus());
1562out: 1761out:
1563 of_node_put(vgic_node); 1762 if (ret)
1763 kvm_vgic_destroy(kvm);
1764
1564 return ret; 1765 return ret;
1565} 1766}
1566 1767
1567/** 1768static int init_vgic_model(struct kvm *kvm, int type)
1568 * kvm_vgic_init - Initialize global VGIC state before running any VCPUs
1569 * @kvm: pointer to the kvm struct
1570 *
1571 * Map the virtual CPU interface into the VM before running any VCPUs. We
1572 * can't do this at creation time, because user space must first set the
1573 * virtual CPU interface address in the guest physical address space. Also
1574 * initialize the ITARGETSRn regs to 0 on the emulated distributor.
1575 */
1576int kvm_vgic_init(struct kvm *kvm)
1577{ 1769{
1578 int ret = 0, i; 1770 switch (type) {
1579 1771 case KVM_DEV_TYPE_ARM_VGIC_V2:
1580 if (!irqchip_in_kernel(kvm)) 1772 vgic_v2_init_emulation(kvm);
1581 return 0; 1773 break;
1582 1774#ifdef CONFIG_ARM_GIC_V3
1583 mutex_lock(&kvm->lock); 1775 case KVM_DEV_TYPE_ARM_VGIC_V3:
1584 1776 vgic_v3_init_emulation(kvm);
1585 if (vgic_initialized(kvm)) 1777 break;
1586 goto out; 1778#endif
1587 1779 default:
1588 if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) || 1780 return -ENODEV;
1589 IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) {
1590 kvm_err("Need to set vgic cpu and dist addresses first\n");
1591 ret = -ENXIO;
1592 goto out;
1593 }
1594
1595 ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base,
1596 vgic_vcpu_base, KVM_VGIC_V2_CPU_SIZE);
1597 if (ret) {
1598 kvm_err("Unable to remap VGIC CPU to VCPU\n");
1599 goto out;
1600 } 1781 }
1601 1782
1602 for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) 1783 if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
1603 vgic_set_target_reg(kvm, 0, i); 1784 return -E2BIG;
1604 1785
1605 kvm->arch.vgic.ready = true; 1786 return 0;
1606out:
1607 mutex_unlock(&kvm->lock);
1608 return ret;
1609} 1787}
1610 1788
1611int kvm_vgic_create(struct kvm *kvm) 1789int kvm_vgic_create(struct kvm *kvm, u32 type)
1612{ 1790{
1613 int i, vcpu_lock_idx = -1, ret = 0; 1791 int i, vcpu_lock_idx = -1, ret;
1614 struct kvm_vcpu *vcpu; 1792 struct kvm_vcpu *vcpu;
1615 1793
1616 mutex_lock(&kvm->lock); 1794 mutex_lock(&kvm->lock);
1617 1795
1618 if (kvm->arch.vgic.vctrl_base) { 1796 if (irqchip_in_kernel(kvm)) {
1619 ret = -EEXIST; 1797 ret = -EEXIST;
1620 goto out; 1798 goto out;
1621 } 1799 }
1622 1800
1623 /* 1801 /*
1802 * This function is also called by the KVM_CREATE_IRQCHIP handler,
 1803 * which has not yet had a chance to check the availability of the GICv2
1804 * emulation. So check this here again. KVM_CREATE_DEVICE does
1805 * the proper checks already.
1806 */
1807 if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) {
1808 ret = -ENODEV;
1809 goto out;
1810 }
1811
1812 /*
1624 * Any time a vcpu is run, vcpu_load is called which tries to grab the 1813 * Any time a vcpu is run, vcpu_load is called which tries to grab the
1625 * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure 1814 * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
1626 * that no other VCPUs are run while we create the vgic. 1815 * that no other VCPUs are run while we create the vgic.
1627 */ 1816 */
1817 ret = -EBUSY;
1628 kvm_for_each_vcpu(i, vcpu, kvm) { 1818 kvm_for_each_vcpu(i, vcpu, kvm) {
1629 if (!mutex_trylock(&vcpu->mutex)) 1819 if (!mutex_trylock(&vcpu->mutex))
1630 goto out_unlock; 1820 goto out_unlock;
@@ -1632,16 +1822,22 @@ int kvm_vgic_create(struct kvm *kvm)
1632 } 1822 }
1633 1823
1634 kvm_for_each_vcpu(i, vcpu, kvm) { 1824 kvm_for_each_vcpu(i, vcpu, kvm) {
1635 if (vcpu->arch.has_run_once) { 1825 if (vcpu->arch.has_run_once)
1636 ret = -EBUSY;
1637 goto out_unlock; 1826 goto out_unlock;
1638 }
1639 } 1827 }
1828 ret = 0;
1829
1830 ret = init_vgic_model(kvm, type);
1831 if (ret)
1832 goto out_unlock;
1640 1833
1641 spin_lock_init(&kvm->arch.vgic.lock); 1834 spin_lock_init(&kvm->arch.vgic.lock);
1642 kvm->arch.vgic.vctrl_base = vgic_vctrl_base; 1835 kvm->arch.vgic.in_kernel = true;
1836 kvm->arch.vgic.vgic_model = type;
1837 kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
1643 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; 1838 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
1644 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; 1839 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
1840 kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
1645 1841
1646out_unlock: 1842out_unlock:
1647 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { 1843 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
@@ -1654,7 +1850,7 @@ out:
1654 return ret; 1850 return ret;
1655} 1851}
1656 1852
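kvm_vgic_create() above try-locks every VCPU mutex and backs out on the first failure, so the VGIC is never created while a VCPU could be running. The same lock-all-or-bail pattern, sketched generically with pthreads (userspace illustration only, not kernel code):

#include <pthread.h>
#include <stdbool.h>

/* Try to take every lock; on failure release the ones already held.
 * Returns true only when all locks were acquired. */
static bool trylock_all(pthread_mutex_t *locks, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (pthread_mutex_trylock(&locks[i]) != 0) {
            while (--i >= 0)
                pthread_mutex_unlock(&locks[i]);
            return false;
        }
    }
    return true;
}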
1657static bool vgic_ioaddr_overlap(struct kvm *kvm) 1853static int vgic_ioaddr_overlap(struct kvm *kvm)
1658{ 1854{
1659 phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; 1855 phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
1660 phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; 1856 phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
@@ -1694,7 +1890,7 @@ static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
1694/** 1890/**
1695 * kvm_vgic_addr - set or get vgic VM base addresses 1891 * kvm_vgic_addr - set or get vgic VM base addresses
1696 * @kvm: pointer to the vm struct 1892 * @kvm: pointer to the vm struct
1697 * @type: the VGIC addr type, one of KVM_VGIC_V2_ADDR_TYPE_XXX 1893 * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
1698 * @addr: pointer to address value 1894 * @addr: pointer to address value
1699 * @write: if true set the address in the VM address space, if false read the 1895 * @write: if true set the address in the VM address space, if false read the
1700 * address 1896 * address
@@ -1708,211 +1904,64 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
1708{ 1904{
1709 int r = 0; 1905 int r = 0;
1710 struct vgic_dist *vgic = &kvm->arch.vgic; 1906 struct vgic_dist *vgic = &kvm->arch.vgic;
1907 int type_needed;
1908 phys_addr_t *addr_ptr, block_size;
1909 phys_addr_t alignment;
1711 1910
1712 mutex_lock(&kvm->lock); 1911 mutex_lock(&kvm->lock);
1713 switch (type) { 1912 switch (type) {
1714 case KVM_VGIC_V2_ADDR_TYPE_DIST: 1913 case KVM_VGIC_V2_ADDR_TYPE_DIST:
1715 if (write) { 1914 type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
1716 r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, 1915 addr_ptr = &vgic->vgic_dist_base;
1717 *addr, KVM_VGIC_V2_DIST_SIZE); 1916 block_size = KVM_VGIC_V2_DIST_SIZE;
1718 } else { 1917 alignment = SZ_4K;
1719 *addr = vgic->vgic_dist_base;
1720 }
1721 break; 1918 break;
1722 case KVM_VGIC_V2_ADDR_TYPE_CPU: 1919 case KVM_VGIC_V2_ADDR_TYPE_CPU:
1723 if (write) { 1920 type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
1724 r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, 1921 addr_ptr = &vgic->vgic_cpu_base;
1725 *addr, KVM_VGIC_V2_CPU_SIZE); 1922 block_size = KVM_VGIC_V2_CPU_SIZE;
1726 } else { 1923 alignment = SZ_4K;
1727 *addr = vgic->vgic_cpu_base;
1728 }
1729 break;
1730 default:
1731 r = -ENODEV;
1732 }
1733
1734 mutex_unlock(&kvm->lock);
1735 return r;
1736}
1737
1738static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
1739 struct kvm_exit_mmio *mmio, phys_addr_t offset)
1740{
1741 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1742 u32 reg, mask = 0, shift = 0;
1743 bool updated = false;
1744
1745 switch (offset & ~0x3) {
1746 case GIC_CPU_CTRL:
1747 mask = GICH_VMCR_CTRL_MASK;
1748 shift = GICH_VMCR_CTRL_SHIFT;
1749 break; 1924 break;
1750 case GIC_CPU_PRIMASK: 1925#ifdef CONFIG_ARM_GIC_V3
1751 mask = GICH_VMCR_PRIMASK_MASK; 1926 case KVM_VGIC_V3_ADDR_TYPE_DIST:
1752 shift = GICH_VMCR_PRIMASK_SHIFT; 1927 type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
1928 addr_ptr = &vgic->vgic_dist_base;
1929 block_size = KVM_VGIC_V3_DIST_SIZE;
1930 alignment = SZ_64K;
1753 break; 1931 break;
1754 case GIC_CPU_BINPOINT: 1932 case KVM_VGIC_V3_ADDR_TYPE_REDIST:
1755 mask = GICH_VMCR_BINPOINT_MASK; 1933 type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
1756 shift = GICH_VMCR_BINPOINT_SHIFT; 1934 addr_ptr = &vgic->vgic_redist_base;
1757 break; 1935 block_size = KVM_VGIC_V3_REDIST_SIZE;
1758 case GIC_CPU_ALIAS_BINPOINT: 1936 alignment = SZ_64K;
1759 mask = GICH_VMCR_ALIAS_BINPOINT_MASK;
1760 shift = GICH_VMCR_ALIAS_BINPOINT_SHIFT;
1761 break;
1762 }
1763
1764 if (!mmio->is_write) {
1765 reg = (vgic_cpu->vgic_vmcr & mask) >> shift;
1766 mmio_data_write(mmio, ~0, reg);
1767 } else {
1768 reg = mmio_data_read(mmio, ~0);
1769 reg = (reg << shift) & mask;
1770 if (reg != (vgic_cpu->vgic_vmcr & mask))
1771 updated = true;
1772 vgic_cpu->vgic_vmcr &= ~mask;
1773 vgic_cpu->vgic_vmcr |= reg;
1774 }
1775 return updated;
1776}
1777
1778static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
1779 struct kvm_exit_mmio *mmio, phys_addr_t offset)
1780{
1781 return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
1782}
1783
1784static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
1785 struct kvm_exit_mmio *mmio,
1786 phys_addr_t offset)
1787{
1788 u32 reg;
1789
1790 if (mmio->is_write)
1791 return false;
1792
1793 /* GICC_IIDR */
1794 reg = (PRODUCT_ID_KVM << 20) |
1795 (GICC_ARCH_VERSION_V2 << 16) |
1796 (IMPLEMENTER_ARM << 0);
1797 mmio_data_write(mmio, ~0, reg);
1798 return false;
1799}
1800
1801/*
1802 * CPU Interface Register accesses - these are not accessed by the VM, but by
1803 * user space for saving and restoring VGIC state.
1804 */
1805static const struct mmio_range vgic_cpu_ranges[] = {
1806 {
1807 .base = GIC_CPU_CTRL,
1808 .len = 12,
1809 .handle_mmio = handle_cpu_mmio_misc,
1810 },
1811 {
1812 .base = GIC_CPU_ALIAS_BINPOINT,
1813 .len = 4,
1814 .handle_mmio = handle_mmio_abpr,
1815 },
1816 {
1817 .base = GIC_CPU_ACTIVEPRIO,
1818 .len = 16,
1819 .handle_mmio = handle_mmio_raz_wi,
1820 },
1821 {
1822 .base = GIC_CPU_IDENT,
1823 .len = 4,
1824 .handle_mmio = handle_cpu_mmio_ident,
1825 },
1826};
1827
1828static int vgic_attr_regs_access(struct kvm_device *dev,
1829 struct kvm_device_attr *attr,
1830 u32 *reg, bool is_write)
1831{
1832 const struct mmio_range *r = NULL, *ranges;
1833 phys_addr_t offset;
1834 int ret, cpuid, c;
1835 struct kvm_vcpu *vcpu, *tmp_vcpu;
1836 struct vgic_dist *vgic;
1837 struct kvm_exit_mmio mmio;
1838
1839 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
1840 cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
1841 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
1842
1843 mutex_lock(&dev->kvm->lock);
1844
1845 if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
1846 ret = -EINVAL;
1847 goto out;
1848 }
1849
1850 vcpu = kvm_get_vcpu(dev->kvm, cpuid);
1851 vgic = &dev->kvm->arch.vgic;
1852
1853 mmio.len = 4;
1854 mmio.is_write = is_write;
1855 if (is_write)
1856 mmio_data_write(&mmio, ~0, *reg);
1857 switch (attr->group) {
1858 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
1859 mmio.phys_addr = vgic->vgic_dist_base + offset;
1860 ranges = vgic_dist_ranges;
1861 break;
1862 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
1863 mmio.phys_addr = vgic->vgic_cpu_base + offset;
1864 ranges = vgic_cpu_ranges;
1865 break; 1937 break;
1938#endif
1866 default: 1939 default:
1867 BUG(); 1940 r = -ENODEV;
1941 goto out;
1868 } 1942 }
1869 r = find_matching_range(ranges, &mmio, offset);
1870 1943
1871 if (unlikely(!r || !r->handle_mmio)) { 1944 if (vgic->vgic_model != type_needed) {
1872 ret = -ENXIO; 1945 r = -ENODEV;
1873 goto out; 1946 goto out;
1874 } 1947 }
1875 1948
1876 1949 if (write) {
1877 spin_lock(&vgic->lock); 1950 if (!IS_ALIGNED(*addr, alignment))
1878 1951 r = -EINVAL;
1879 /* 1952 else
1880 * Ensure that no other VCPU is running by checking the vcpu->cpu 1953 r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
1881 * field. If no other VPCUs are running we can safely access the VGIC 1954 block_size);
1882 * state, because even if another VPU is run after this point, that 1955 } else {
1883 * VCPU will not touch the vgic state, because it will block on 1956 *addr = *addr_ptr;
1884 * getting the vgic->lock in kvm_vgic_sync_hwstate().
1885 */
1886 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
1887 if (unlikely(tmp_vcpu->cpu != -1)) {
1888 ret = -EBUSY;
1889 goto out_vgic_unlock;
1890 }
1891 } 1957 }
1892 1958
1893 /*
1894 * Move all pending IRQs from the LRs on all VCPUs so the pending
1895 * state can be properly represented in the register state accessible
1896 * through this API.
1897 */
1898 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
1899 vgic_unqueue_irqs(tmp_vcpu);
1900
1901 offset -= r->base;
1902 r->handle_mmio(vcpu, &mmio, offset);
1903
1904 if (!is_write)
1905 *reg = mmio_data_read(&mmio, ~0);
1906
1907 ret = 0;
1908out_vgic_unlock:
1909 spin_unlock(&vgic->lock);
1910out: 1959out:
1911 mutex_unlock(&dev->kvm->lock); 1960 mutex_unlock(&kvm->lock);
1912 return ret; 1961 return r;
1913} 1962}
1914 1963
1915static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1964int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1916{ 1965{
1917 int r; 1966 int r;
1918 1967
@@ -1928,24 +1977,50 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1928 r = kvm_vgic_addr(dev->kvm, type, &addr, true); 1977 r = kvm_vgic_addr(dev->kvm, type, &addr, true);
1929 return (r == -ENODEV) ? -ENXIO : r; 1978 return (r == -ENODEV) ? -ENXIO : r;
1930 } 1979 }
1931 1980 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
1932 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
1933 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
1934 u32 __user *uaddr = (u32 __user *)(long)attr->addr; 1981 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
1935 u32 reg; 1982 u32 val;
1983 int ret = 0;
1936 1984
1937 if (get_user(reg, uaddr)) 1985 if (get_user(val, uaddr))
1938 return -EFAULT; 1986 return -EFAULT;
1939 1987
1940 return vgic_attr_regs_access(dev, attr, &reg, true); 1988 /*
1941 } 1989 * We require:
1990 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
1991 * - at most 1024 interrupts
1992 * - a multiple of 32 interrupts
1993 */
1994 if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
1995 val > VGIC_MAX_IRQS ||
1996 (val & 31))
1997 return -EINVAL;
1998
1999 mutex_lock(&dev->kvm->lock);
2000
2001 if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
2002 ret = -EBUSY;
2003 else
2004 dev->kvm->arch.vgic.nr_irqs = val;
2005
2006 mutex_unlock(&dev->kvm->lock);
1942 2007
2008 return ret;
2009 }
2010 case KVM_DEV_ARM_VGIC_GRP_CTRL: {
2011 switch (attr->attr) {
2012 case KVM_DEV_ARM_VGIC_CTRL_INIT:
2013 r = vgic_init(dev->kvm);
2014 return r;
2015 }
2016 break;
2017 }
1943 } 2018 }
1944 2019
1945 return -ENXIO; 2020 return -ENXIO;
1946} 2021}
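The KVM_DEV_ARM_VGIC_GRP_NR_IRQS handling above enforces the sizing rules (a multiple of 32, at least 64 counting the 32 private interrupts, at most 1024) and rejects the write once the VGIC is ready or already sized. A hypothetical user-space sketch under the same assumptions as the earlier example (ARM host, vgic_fd obtained via KVM_CREATE_DEVICE):

        #include <stdint.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Size the distributor before the VGIC is initialised and vcpus run. */
        static int vgic_set_nr_irqs(int vgic_fd, uint32_t nr_irqs)
        {
                struct kvm_device_attr attr = {
                        .group = KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
                        .addr  = (uint64_t)(unsigned long)&nr_irqs,
                };

                /* nr_irqs must be a multiple of 32, >= 64 and <= 1024 (see above). */
                return ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &attr);
        }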
1947 2022
1948static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 2023int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1949{ 2024{
1950 int r = -ENXIO; 2025 int r = -ENXIO;
1951 2026
@@ -1963,16 +2038,10 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1963 return -EFAULT; 2038 return -EFAULT;
1964 break; 2039 break;
1965 } 2040 }
1966 2041 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
1967 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
1968 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
1969 u32 __user *uaddr = (u32 __user *)(long)attr->addr; 2042 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
1970 u32 reg = 0;
1971 2043
1972 r = vgic_attr_regs_access(dev, attr, &reg, false); 2044 r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
1973 if (r)
1974 return r;
1975 r = put_user(reg, uaddr);
1976 break; 2045 break;
1977 } 2046 }
1978 2047
@@ -1981,55 +2050,124 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1981 return r; 2050 return r;
1982} 2051}
1983 2052
1984static int vgic_has_attr_regs(const struct mmio_range *ranges, 2053int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
1985 phys_addr_t offset)
1986{ 2054{
1987 struct kvm_exit_mmio dev_attr_mmio; 2055 if (vgic_find_range(ranges, 4, offset))
1988
1989 dev_attr_mmio.len = 4;
1990 if (find_matching_range(ranges, &dev_attr_mmio, offset))
1991 return 0; 2056 return 0;
1992 else 2057 else
1993 return -ENXIO; 2058 return -ENXIO;
1994} 2059}
1995 2060
1996static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 2061static void vgic_init_maintenance_interrupt(void *info)
1997{ 2062{
1998 phys_addr_t offset; 2063 enable_percpu_irq(vgic->maint_irq, 0);
2064}
1999 2065
2000 switch (attr->group) { 2066static int vgic_cpu_notify(struct notifier_block *self,
2001 case KVM_DEV_ARM_VGIC_GRP_ADDR: 2067 unsigned long action, void *cpu)
2002 switch (attr->attr) { 2068{
2003 case KVM_VGIC_V2_ADDR_TYPE_DIST: 2069 switch (action) {
2004 case KVM_VGIC_V2_ADDR_TYPE_CPU: 2070 case CPU_STARTING:
2005 return 0; 2071 case CPU_STARTING_FROZEN:
2006 } 2072 vgic_init_maintenance_interrupt(NULL);
2073 break;
2074 case CPU_DYING:
2075 case CPU_DYING_FROZEN:
2076 disable_percpu_irq(vgic->maint_irq);
2007 break; 2077 break;
2008 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
2009 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
2010 return vgic_has_attr_regs(vgic_dist_ranges, offset);
2011 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
2012 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
2013 return vgic_has_attr_regs(vgic_cpu_ranges, offset);
2014 } 2078 }
2015 return -ENXIO; 2079
2080 return NOTIFY_OK;
2081}
2082
2083static struct notifier_block vgic_cpu_nb = {
2084 .notifier_call = vgic_cpu_notify,
2085};
2086
2087static const struct of_device_id vgic_ids[] = {
2088 { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, },
2089 { .compatible = "arm,cortex-a7-gic", .data = vgic_v2_probe, },
2090 { .compatible = "arm,gic-400", .data = vgic_v2_probe, },
2091 { .compatible = "arm,gic-v3", .data = vgic_v3_probe, },
2092 {},
2093};
2094
2095int kvm_vgic_hyp_init(void)
2096{
2097 const struct of_device_id *matched_id;
2098 const int (*vgic_probe)(struct device_node *, const struct vgic_ops **,
2099 const struct vgic_params **);
2100 struct device_node *vgic_node;
2101 int ret;
2102
2103 vgic_node = of_find_matching_node_and_match(NULL,
2104 vgic_ids, &matched_id);
2105 if (!vgic_node) {
2106 kvm_err("error: no compatible GIC node found\n");
2107 return -ENODEV;
2108 }
2109
2110 vgic_probe = matched_id->data;
2111 ret = vgic_probe(vgic_node, &vgic_ops, &vgic);
2112 if (ret)
2113 return ret;
2114
2115 ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
2116 "vgic", kvm_get_running_vcpus());
2117 if (ret) {
2118 kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
2119 return ret;
2120 }
2121
2122 ret = __register_cpu_notifier(&vgic_cpu_nb);
2123 if (ret) {
2124 kvm_err("Cannot register vgic CPU notifier\n");
2125 goto out_free_irq;
2126 }
2127
2128 /* Callback into arch code for setup */
2129 vgic_arch_setup(vgic);
2130
2131 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
2132
2133 return 0;
2134
2135out_free_irq:
2136 free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
2137 return ret;
2016} 2138}
2017 2139
2018static void vgic_destroy(struct kvm_device *dev) 2140int kvm_irq_map_gsi(struct kvm *kvm,
2141 struct kvm_kernel_irq_routing_entry *entries,
2142 int gsi)
2019{ 2143{
2020 kfree(dev); 2144 return gsi;
2021} 2145}
2022 2146
2023static int vgic_create(struct kvm_device *dev, u32 type) 2147int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
2024{ 2148{
2025 return kvm_vgic_create(dev->kvm); 2149 return pin;
2026} 2150}
2027 2151
2028struct kvm_device_ops kvm_arm_vgic_v2_ops = { 2152int kvm_set_irq(struct kvm *kvm, int irq_source_id,
2029 .name = "kvm-arm-vgic", 2153 u32 irq, int level, bool line_status)
2030 .create = vgic_create, 2154{
2031 .destroy = vgic_destroy, 2155 unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
2032 .set_attr = vgic_set_attr, 2156
2033 .get_attr = vgic_get_attr, 2157 trace_kvm_set_irq(irq, level, irq_source_id);
2034 .has_attr = vgic_has_attr, 2158
2035}; 2159 BUG_ON(!vgic_initialized(kvm));
2160
2161 if (spi > kvm->arch.vgic.nr_irqs)
2162 return -EINVAL;
2163 return kvm_vgic_inject_irq(kvm, 0, spi, level);
2164
2165}
2166
2167/* MSI not implemented yet */
2168int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
2169 struct kvm *kvm, int irq_source_id,
2170 int level, bool line_status)
2171{
2172 return 0;
2173}
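kvm_set_irq() above offsets the routing GSI by VGIC_NR_PRIVATE_IRQS, so with the identity GSI map a gsi of N targets SPI N + 32; the 16 SGIs and 16 PPIs are not reachable this way. A hypothetical user-space sketch, assuming vm_fd is the VM file descriptor and efd an eventfd to be routed through KVM_IRQFD:

        #include <stdint.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Wire an eventfd to a shared peripheral interrupt (SPI >= 32). */
        static int wire_spi_to_eventfd(int vm_fd, int efd, uint32_t spi)
        {
                struct kvm_irqfd irqfd = {
                        .fd  = (uint32_t)efd,
                        .gsi = spi - 32, /* gsi N maps to SPI N + VGIC_NR_PRIVATE_IRQS */
                };

                return ioctl(vm_fd, KVM_IRQFD, &irqfd);
        }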
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
new file mode 100644
index 000000000000..0df74cbb6200
--- /dev/null
+++ b/virt/kvm/arm/vgic.h
@@ -0,0 +1,140 @@
1/*
2 * Copyright (C) 2012-2014 ARM Ltd.
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * Derived from virt/kvm/arm/vgic.c
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#ifndef __KVM_VGIC_H__
21#define __KVM_VGIC_H__
22
23#include <kvm/iodev.h>
24
25#define VGIC_ADDR_UNDEF (-1)
26#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
27
28#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
29#define IMPLEMENTER_ARM 0x43b
30
31#define ACCESS_READ_VALUE (1 << 0)
32#define ACCESS_READ_RAZ (0 << 0)
33#define ACCESS_READ_MASK(x) ((x) & (1 << 0))
34#define ACCESS_WRITE_IGNORED (0 << 1)
35#define ACCESS_WRITE_SETBIT (1 << 1)
36#define ACCESS_WRITE_CLEARBIT (2 << 1)
37#define ACCESS_WRITE_VALUE (3 << 1)
38#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
39
40#define VCPU_NOT_ALLOCATED ((u8)-1)
41
42unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
43
44void vgic_update_state(struct kvm *kvm);
45int vgic_init_common_maps(struct kvm *kvm);
46
47u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
48u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
49
50void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
51void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
52void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
53void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
54 int irq, int val);
55
56void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
57void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
58
59bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
60void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
61
62struct kvm_exit_mmio {
63 phys_addr_t phys_addr;
64 void *data;
65 u32 len;
66 bool is_write;
67 void *private;
68};
69
70void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
71 phys_addr_t offset, int mode);
72bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
73 phys_addr_t offset);
74
75static inline
76u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
77{
78 return le32_to_cpu(*((u32 *)mmio->data)) & mask;
79}
80
81static inline
82void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
83{
84 *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
85}
86
87struct vgic_io_range {
88 phys_addr_t base;
89 unsigned long len;
90 int bits_per_irq;
91 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
92 phys_addr_t offset);
93};
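A sketch of how a handler plugs into vgic_io_range using the little-endian accessors above, modelled on handle_cpu_mmio_ident earlier in this diff; the function name, base offset and ID value are illustrative only:

        /* Read-only ID register: returns a fixed value, ignores writes. */
        static bool handle_mmio_example_ident(struct kvm_vcpu *vcpu,
                                              struct kvm_exit_mmio *mmio,
                                              phys_addr_t offset)
        {
                if (mmio->is_write)
                        return false;           /* write ignored */

                /* publish the value little-endian, full 32-bit mask */
                mmio_data_write(mmio, ~0, (PRODUCT_ID_KVM << 20) | IMPLEMENTER_ARM);
                return false;                   /* no distributor state changed */
        }

        static const struct vgic_io_range example_ranges[] = {
                {
                        .base           = 0xfc, /* illustrative offset */
                        .len            = 4,
                        .handle_mmio    = handle_mmio_example_ident,
                },
        };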
94
95int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
96 const struct vgic_io_range *ranges,
97 int redist_id,
98 struct vgic_io_device *iodev);
99
100static inline bool is_in_range(phys_addr_t addr, unsigned long len,
101 phys_addr_t baseaddr, unsigned long size)
102{
103 return (addr >= baseaddr) && (addr + len <= baseaddr + size);
104}
105
106const
107struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
108 int len, gpa_t offset);
109
110bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
111 phys_addr_t offset, int vcpu_id, int access);
112
113bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
114 phys_addr_t offset, int vcpu_id);
115
116bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
117 phys_addr_t offset, int vcpu_id);
118
119bool vgic_handle_set_active_reg(struct kvm *kvm,
120 struct kvm_exit_mmio *mmio,
121 phys_addr_t offset, int vcpu_id);
122
123bool vgic_handle_clear_active_reg(struct kvm *kvm,
124 struct kvm_exit_mmio *mmio,
125 phys_addr_t offset, int vcpu_id);
126
127bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
128 phys_addr_t offset);
129
130void vgic_kick_vcpus(struct kvm *kvm);
131
132int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
133int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
134int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
135
136int vgic_init(struct kvm *kvm);
137void vgic_v2_init_emulation(struct kvm *kvm);
138void vgic_v3_init_emulation(struct kvm *kvm);
139
140#endif
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
deleted file mode 100644
index bf06577fea51..000000000000
--- a/virt/kvm/assigned-dev.c
+++ /dev/null
@@ -1,1024 +0,0 @@
1/*
2 * Kernel-based Virtual Machine - device assignment support
3 *
4 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 */
10
11#include <linux/kvm_host.h>
12#include <linux/kvm.h>
13#include <linux/uaccess.h>
14#include <linux/vmalloc.h>
15#include <linux/errno.h>
16#include <linux/spinlock.h>
17#include <linux/pci.h>
18#include <linux/interrupt.h>
19#include <linux/slab.h>
20#include <linux/namei.h>
21#include <linux/fs.h>
22#include "irq.h"
23
24static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
25 int assigned_dev_id)
26{
27 struct list_head *ptr;
28 struct kvm_assigned_dev_kernel *match;
29
30 list_for_each(ptr, head) {
31 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
32 if (match->assigned_dev_id == assigned_dev_id)
33 return match;
34 }
35 return NULL;
36}
37
38static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
39 *assigned_dev, int irq)
40{
41 int i, index;
42 struct msix_entry *host_msix_entries;
43
44 host_msix_entries = assigned_dev->host_msix_entries;
45
46 index = -1;
47 for (i = 0; i < assigned_dev->entries_nr; i++)
48 if (irq == host_msix_entries[i].vector) {
49 index = i;
50 break;
51 }
52 if (index < 0)
53 printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");
54
55 return index;
56}
57
58static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
59{
60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 int ret;
62
63 spin_lock(&assigned_dev->intx_lock);
64 if (pci_check_and_mask_intx(assigned_dev->dev)) {
65 assigned_dev->host_irq_disabled = true;
66 ret = IRQ_WAKE_THREAD;
67 } else
68 ret = IRQ_NONE;
69 spin_unlock(&assigned_dev->intx_lock);
70
71 return ret;
72}
73
74static void
75kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
76 int vector)
77{
78 if (unlikely(assigned_dev->irq_requested_type &
79 KVM_DEV_IRQ_GUEST_INTX)) {
80 spin_lock(&assigned_dev->intx_mask_lock);
81 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
82 kvm_set_irq(assigned_dev->kvm,
83 assigned_dev->irq_source_id, vector, 1,
84 false);
85 spin_unlock(&assigned_dev->intx_mask_lock);
86 } else
87 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
88 vector, 1, false);
89}
90
91static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
92{
93 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
94
95 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
96 spin_lock_irq(&assigned_dev->intx_lock);
97 disable_irq_nosync(irq);
98 assigned_dev->host_irq_disabled = true;
99 spin_unlock_irq(&assigned_dev->intx_lock);
100 }
101
102 kvm_assigned_dev_raise_guest_irq(assigned_dev,
103 assigned_dev->guest_irq);
104
105 return IRQ_HANDLED;
106}
107
108#ifdef __KVM_HAVE_MSI
109static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
110{
111 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
112 int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
113 assigned_dev->irq_source_id,
114 assigned_dev->guest_irq, 1);
115 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
116}
117
118static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
119{
120 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
121
122 kvm_assigned_dev_raise_guest_irq(assigned_dev,
123 assigned_dev->guest_irq);
124
125 return IRQ_HANDLED;
126}
127#endif
128
129#ifdef __KVM_HAVE_MSIX
130static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
131{
132 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
133 int index = find_index_from_host_irq(assigned_dev, irq);
134 u32 vector;
135 int ret = 0;
136
137 if (index >= 0) {
138 vector = assigned_dev->guest_msix_entries[index].vector;
139 ret = kvm_set_irq_inatomic(assigned_dev->kvm,
140 assigned_dev->irq_source_id,
141 vector, 1);
142 }
143
144 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
145}
146
147static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
148{
149 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
150 int index = find_index_from_host_irq(assigned_dev, irq);
151 u32 vector;
152
153 if (index >= 0) {
154 vector = assigned_dev->guest_msix_entries[index].vector;
155 kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
156 }
157
158 return IRQ_HANDLED;
159}
160#endif
161
162/* Ack the irq line for an assigned device */
163static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
164{
165 struct kvm_assigned_dev_kernel *dev =
166 container_of(kian, struct kvm_assigned_dev_kernel,
167 ack_notifier);
168
169 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
170
171 spin_lock(&dev->intx_mask_lock);
172
173 if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
174 bool reassert = false;
175
176 spin_lock_irq(&dev->intx_lock);
177 /*
178 * The guest IRQ may be shared so this ack can come from an
179 * IRQ for another guest device.
180 */
181 if (dev->host_irq_disabled) {
182 if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
183 enable_irq(dev->host_irq);
184 else if (!pci_check_and_unmask_intx(dev->dev))
185 reassert = true;
186 dev->host_irq_disabled = reassert;
187 }
188 spin_unlock_irq(&dev->intx_lock);
189
190 if (reassert)
191 kvm_set_irq(dev->kvm, dev->irq_source_id,
192 dev->guest_irq, 1, false);
193 }
194
195 spin_unlock(&dev->intx_mask_lock);
196}
197
198static void deassign_guest_irq(struct kvm *kvm,
199 struct kvm_assigned_dev_kernel *assigned_dev)
200{
201 if (assigned_dev->ack_notifier.gsi != -1)
202 kvm_unregister_irq_ack_notifier(kvm,
203 &assigned_dev->ack_notifier);
204
205 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
206 assigned_dev->guest_irq, 0, false);
207
208 if (assigned_dev->irq_source_id != -1)
209 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
210 assigned_dev->irq_source_id = -1;
211 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
212}
213
214/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
215static void deassign_host_irq(struct kvm *kvm,
216 struct kvm_assigned_dev_kernel *assigned_dev)
217{
218 /*
219 * We disable irq here to prevent further events.
220 *
221 * Note this may result in a nested disable if the interrupt type is
222 * INTx, but that is OK since we are about to free it.
223 *
224 * If this function is called as part of VM destruction, make sure the
225 * kvm state is still valid at this point, because we may also have to
226 * wait on a currently running IRQ handler.
227 */
228 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
229 int i;
230 for (i = 0; i < assigned_dev->entries_nr; i++)
231 disable_irq(assigned_dev->host_msix_entries[i].vector);
232
233 for (i = 0; i < assigned_dev->entries_nr; i++)
234 free_irq(assigned_dev->host_msix_entries[i].vector,
235 assigned_dev);
236
237 assigned_dev->entries_nr = 0;
238 kfree(assigned_dev->host_msix_entries);
239 kfree(assigned_dev->guest_msix_entries);
240 pci_disable_msix(assigned_dev->dev);
241 } else {
242 /* Deal with MSI and INTx */
243 if ((assigned_dev->irq_requested_type &
244 KVM_DEV_IRQ_HOST_INTX) &&
245 (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
246 spin_lock_irq(&assigned_dev->intx_lock);
247 pci_intx(assigned_dev->dev, false);
248 spin_unlock_irq(&assigned_dev->intx_lock);
249 synchronize_irq(assigned_dev->host_irq);
250 } else
251 disable_irq(assigned_dev->host_irq);
252
253 free_irq(assigned_dev->host_irq, assigned_dev);
254
255 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
256 pci_disable_msi(assigned_dev->dev);
257 }
258
259 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
260}
261
262static int kvm_deassign_irq(struct kvm *kvm,
263 struct kvm_assigned_dev_kernel *assigned_dev,
264 unsigned long irq_requested_type)
265{
266 unsigned long guest_irq_type, host_irq_type;
267
268 if (!irqchip_in_kernel(kvm))
269 return -EINVAL;
270 /* no irq assignment to deassign */
271 if (!assigned_dev->irq_requested_type)
272 return -ENXIO;
273
274 host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
275 guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
276
277 if (host_irq_type)
278 deassign_host_irq(kvm, assigned_dev);
279 if (guest_irq_type)
280 deassign_guest_irq(kvm, assigned_dev);
281
282 return 0;
283}
284
285static void kvm_free_assigned_irq(struct kvm *kvm,
286 struct kvm_assigned_dev_kernel *assigned_dev)
287{
288 kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
289}
290
291static void kvm_free_assigned_device(struct kvm *kvm,
292 struct kvm_assigned_dev_kernel
293 *assigned_dev)
294{
295 kvm_free_assigned_irq(kvm, assigned_dev);
296
297 pci_reset_function(assigned_dev->dev);
298 if (pci_load_and_free_saved_state(assigned_dev->dev,
299 &assigned_dev->pci_saved_state))
300 printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
301 __func__, dev_name(&assigned_dev->dev->dev));
302 else
303 pci_restore_state(assigned_dev->dev);
304
305 assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
306
307 pci_release_regions(assigned_dev->dev);
308 pci_disable_device(assigned_dev->dev);
309 pci_dev_put(assigned_dev->dev);
310
311 list_del(&assigned_dev->list);
312 kfree(assigned_dev);
313}
314
315void kvm_free_all_assigned_devices(struct kvm *kvm)
316{
317 struct list_head *ptr, *ptr2;
318 struct kvm_assigned_dev_kernel *assigned_dev;
319
320 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
321 assigned_dev = list_entry(ptr,
322 struct kvm_assigned_dev_kernel,
323 list);
324
325 kvm_free_assigned_device(kvm, assigned_dev);
326 }
327}
328
329static int assigned_device_enable_host_intx(struct kvm *kvm,
330 struct kvm_assigned_dev_kernel *dev)
331{
332 irq_handler_t irq_handler;
333 unsigned long flags;
334
335 dev->host_irq = dev->dev->irq;
336
337 /*
338 * We can only share the IRQ line with other host devices if we are
339 * able to disable the IRQ source at device-level - independently of
340 * the guest driver. Otherwise host devices may suffer from unbounded
341 * IRQ latencies when the guest keeps the line asserted.
342 */
343 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
344 irq_handler = kvm_assigned_dev_intx;
345 flags = IRQF_SHARED;
346 } else {
347 irq_handler = NULL;
348 flags = IRQF_ONESHOT;
349 }
350 if (request_threaded_irq(dev->host_irq, irq_handler,
351 kvm_assigned_dev_thread_intx, flags,
352 dev->irq_name, dev))
353 return -EIO;
354
355 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
356 spin_lock_irq(&dev->intx_lock);
357 pci_intx(dev->dev, true);
358 spin_unlock_irq(&dev->intx_lock);
359 }
360 return 0;
361}
362
363#ifdef __KVM_HAVE_MSI
364static int assigned_device_enable_host_msi(struct kvm *kvm,
365 struct kvm_assigned_dev_kernel *dev)
366{
367 int r;
368
369 if (!dev->dev->msi_enabled) {
370 r = pci_enable_msi(dev->dev);
371 if (r)
372 return r;
373 }
374
375 dev->host_irq = dev->dev->irq;
376 if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
377 kvm_assigned_dev_thread_msi, 0,
378 dev->irq_name, dev)) {
379 pci_disable_msi(dev->dev);
380 return -EIO;
381 }
382
383 return 0;
384}
385#endif
386
387#ifdef __KVM_HAVE_MSIX
388static int assigned_device_enable_host_msix(struct kvm *kvm,
389 struct kvm_assigned_dev_kernel *dev)
390{
391 int i, r = -EINVAL;
392
393 /* host_msix_entries and guest_msix_entries should have been
394 * initialized */
395 if (dev->entries_nr == 0)
396 return r;
397
398 r = pci_enable_msix_exact(dev->dev,
399 dev->host_msix_entries, dev->entries_nr);
400 if (r)
401 return r;
402
403 for (i = 0; i < dev->entries_nr; i++) {
404 r = request_threaded_irq(dev->host_msix_entries[i].vector,
405 kvm_assigned_dev_msix,
406 kvm_assigned_dev_thread_msix,
407 0, dev->irq_name, dev);
408 if (r)
409 goto err;
410 }
411
412 return 0;
413err:
414 for (i -= 1; i >= 0; i--)
415 free_irq(dev->host_msix_entries[i].vector, dev);
416 pci_disable_msix(dev->dev);
417 return r;
418}
419
420#endif
421
422static int assigned_device_enable_guest_intx(struct kvm *kvm,
423 struct kvm_assigned_dev_kernel *dev,
424 struct kvm_assigned_irq *irq)
425{
426 dev->guest_irq = irq->guest_irq;
427 dev->ack_notifier.gsi = irq->guest_irq;
428 return 0;
429}
430
431#ifdef __KVM_HAVE_MSI
432static int assigned_device_enable_guest_msi(struct kvm *kvm,
433 struct kvm_assigned_dev_kernel *dev,
434 struct kvm_assigned_irq *irq)
435{
436 dev->guest_irq = irq->guest_irq;
437 dev->ack_notifier.gsi = -1;
438 return 0;
439}
440#endif
441
442#ifdef __KVM_HAVE_MSIX
443static int assigned_device_enable_guest_msix(struct kvm *kvm,
444 struct kvm_assigned_dev_kernel *dev,
445 struct kvm_assigned_irq *irq)
446{
447 dev->guest_irq = irq->guest_irq;
448 dev->ack_notifier.gsi = -1;
449 return 0;
450}
451#endif
452
453static int assign_host_irq(struct kvm *kvm,
454 struct kvm_assigned_dev_kernel *dev,
455 __u32 host_irq_type)
456{
457 int r = -EEXIST;
458
459 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
460 return r;
461
462 snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
463 pci_name(dev->dev));
464
465 switch (host_irq_type) {
466 case KVM_DEV_IRQ_HOST_INTX:
467 r = assigned_device_enable_host_intx(kvm, dev);
468 break;
469#ifdef __KVM_HAVE_MSI
470 case KVM_DEV_IRQ_HOST_MSI:
471 r = assigned_device_enable_host_msi(kvm, dev);
472 break;
473#endif
474#ifdef __KVM_HAVE_MSIX
475 case KVM_DEV_IRQ_HOST_MSIX:
476 r = assigned_device_enable_host_msix(kvm, dev);
477 break;
478#endif
479 default:
480 r = -EINVAL;
481 }
482 dev->host_irq_disabled = false;
483
484 if (!r)
485 dev->irq_requested_type |= host_irq_type;
486
487 return r;
488}
489
490static int assign_guest_irq(struct kvm *kvm,
491 struct kvm_assigned_dev_kernel *dev,
492 struct kvm_assigned_irq *irq,
493 unsigned long guest_irq_type)
494{
495 int id;
496 int r = -EEXIST;
497
498 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
499 return r;
500
501 id = kvm_request_irq_source_id(kvm);
502 if (id < 0)
503 return id;
504
505 dev->irq_source_id = id;
506
507 switch (guest_irq_type) {
508 case KVM_DEV_IRQ_GUEST_INTX:
509 r = assigned_device_enable_guest_intx(kvm, dev, irq);
510 break;
511#ifdef __KVM_HAVE_MSI
512 case KVM_DEV_IRQ_GUEST_MSI:
513 r = assigned_device_enable_guest_msi(kvm, dev, irq);
514 break;
515#endif
516#ifdef __KVM_HAVE_MSIX
517 case KVM_DEV_IRQ_GUEST_MSIX:
518 r = assigned_device_enable_guest_msix(kvm, dev, irq);
519 break;
520#endif
521 default:
522 r = -EINVAL;
523 }
524
525 if (!r) {
526 dev->irq_requested_type |= guest_irq_type;
527 if (dev->ack_notifier.gsi != -1)
528 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
529 } else
530 kvm_free_irq_source_id(kvm, dev->irq_source_id);
531
532 return r;
533}
534
535/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
536static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
537 struct kvm_assigned_irq *assigned_irq)
538{
539 int r = -EINVAL;
540 struct kvm_assigned_dev_kernel *match;
541 unsigned long host_irq_type, guest_irq_type;
542
543 if (!irqchip_in_kernel(kvm))
544 return r;
545
546 mutex_lock(&kvm->lock);
547 r = -ENODEV;
548 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
549 assigned_irq->assigned_dev_id);
550 if (!match)
551 goto out;
552
553 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
554 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
555
556 r = -EINVAL;
557 /* can only assign one type at a time */
558 if (hweight_long(host_irq_type) > 1)
559 goto out;
560 if (hweight_long(guest_irq_type) > 1)
561 goto out;
562 if (host_irq_type == 0 && guest_irq_type == 0)
563 goto out;
564
565 r = 0;
566 if (host_irq_type)
567 r = assign_host_irq(kvm, match, host_irq_type);
568 if (r)
569 goto out;
570
571 if (guest_irq_type)
572 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
573out:
574 mutex_unlock(&kvm->lock);
575 return r;
576}
577
578static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
579 struct kvm_assigned_irq
580 *assigned_irq)
581{
582 int r = -ENODEV;
583 struct kvm_assigned_dev_kernel *match;
584 unsigned long irq_type;
585
586 mutex_lock(&kvm->lock);
587
588 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
589 assigned_irq->assigned_dev_id);
590 if (!match)
591 goto out;
592
593 irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
594 KVM_DEV_IRQ_GUEST_MASK);
595 r = kvm_deassign_irq(kvm, match, irq_type);
596out:
597 mutex_unlock(&kvm->lock);
598 return r;
599}
600
601/*
602 * We want to test whether the caller has been granted permissions to
603 * use this device. To be able to configure and control the device,
604 * the user needs access to PCI configuration space and BAR resources.
605 * These are accessed through PCI sysfs. PCI config space is often
606 * passed to the process calling this ioctl via file descriptor, so we
607 * can't rely on access to that file. We can check for permissions
608 * on each of the BAR resource files, which is a pretty clear
609 * indicator that the user has been granted access to the device.
610 */
611static int probe_sysfs_permissions(struct pci_dev *dev)
612{
613#ifdef CONFIG_SYSFS
614 int i;
615 bool bar_found = false;
616
617 for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
618 char *kpath, *syspath;
619 struct path path;
620 struct inode *inode;
621 int r;
622
623 if (!pci_resource_len(dev, i))
624 continue;
625
626 kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
627 if (!kpath)
628 return -ENOMEM;
629
630 /* Per sysfs-rules, sysfs is always at /sys */
631 syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
632 kfree(kpath);
633 if (!syspath)
634 return -ENOMEM;
635
636 r = kern_path(syspath, LOOKUP_FOLLOW, &path);
637 kfree(syspath);
638 if (r)
639 return r;
640
641 inode = path.dentry->d_inode;
642
643 r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
644 path_put(&path);
645 if (r)
646 return r;
647
648 bar_found = true;
649 }
650
651 /* If no resources, probably something special */
652 if (!bar_found)
653 return -EPERM;
654
655 return 0;
656#else
657 return -EINVAL; /* No way to control the device without sysfs */
658#endif
659}
660
661static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
662 struct kvm_assigned_pci_dev *assigned_dev)
663{
664 int r = 0, idx;
665 struct kvm_assigned_dev_kernel *match;
666 struct pci_dev *dev;
667
668 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
669 return -EINVAL;
670
671 mutex_lock(&kvm->lock);
672 idx = srcu_read_lock(&kvm->srcu);
673
674 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
675 assigned_dev->assigned_dev_id);
676 if (match) {
677 /* device already assigned */
678 r = -EEXIST;
679 goto out;
680 }
681
682 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
683 if (match == NULL) {
684 printk(KERN_INFO "%s: Couldn't allocate memory\n",
685 __func__);
686 r = -ENOMEM;
687 goto out;
688 }
689 dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
690 assigned_dev->busnr,
691 assigned_dev->devfn);
692 if (!dev) {
693 printk(KERN_INFO "%s: host device not found\n", __func__);
694 r = -EINVAL;
695 goto out_free;
696 }
697
698 /* Don't allow bridges to be assigned */
699 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
700 r = -EPERM;
701 goto out_put;
702 }
703
704 r = probe_sysfs_permissions(dev);
705 if (r)
706 goto out_put;
707
708 if (pci_enable_device(dev)) {
709 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
710 r = -EBUSY;
711 goto out_put;
712 }
713 r = pci_request_regions(dev, "kvm_assigned_device");
714 if (r) {
715 printk(KERN_INFO "%s: Could not get access to device regions\n",
716 __func__);
717 goto out_disable;
718 }
719
720 pci_reset_function(dev);
721 pci_save_state(dev);
722 match->pci_saved_state = pci_store_saved_state(dev);
723 if (!match->pci_saved_state)
724 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
725 __func__, dev_name(&dev->dev));
726
727 if (!pci_intx_mask_supported(dev))
728 assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
729
730 match->assigned_dev_id = assigned_dev->assigned_dev_id;
731 match->host_segnr = assigned_dev->segnr;
732 match->host_busnr = assigned_dev->busnr;
733 match->host_devfn = assigned_dev->devfn;
734 match->flags = assigned_dev->flags;
735 match->dev = dev;
736 spin_lock_init(&match->intx_lock);
737 spin_lock_init(&match->intx_mask_lock);
738 match->irq_source_id = -1;
739 match->kvm = kvm;
740 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
741
742 list_add(&match->list, &kvm->arch.assigned_dev_head);
743
744 if (!kvm->arch.iommu_domain) {
745 r = kvm_iommu_map_guest(kvm);
746 if (r)
747 goto out_list_del;
748 }
749 r = kvm_assign_device(kvm, match);
750 if (r)
751 goto out_list_del;
752
753out:
754 srcu_read_unlock(&kvm->srcu, idx);
755 mutex_unlock(&kvm->lock);
756 return r;
757out_list_del:
758 if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
759 printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
760 __func__, dev_name(&dev->dev));
761 list_del(&match->list);
762 pci_release_regions(dev);
763out_disable:
764 pci_disable_device(dev);
765out_put:
766 pci_dev_put(dev);
767out_free:
768 kfree(match);
769 srcu_read_unlock(&kvm->srcu, idx);
770 mutex_unlock(&kvm->lock);
771 return r;
772}
773
774static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
775 struct kvm_assigned_pci_dev *assigned_dev)
776{
777 int r = 0;
778 struct kvm_assigned_dev_kernel *match;
779
780 mutex_lock(&kvm->lock);
781
782 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
783 assigned_dev->assigned_dev_id);
784 if (!match) {
785 printk(KERN_INFO "%s: device hasn't been assigned before, "
786 "so cannot be deassigned\n", __func__);
787 r = -EINVAL;
788 goto out;
789 }
790
791 kvm_deassign_device(kvm, match);
792
793 kvm_free_assigned_device(kvm, match);
794
795out:
796 mutex_unlock(&kvm->lock);
797 return r;
798}
799
800
801#ifdef __KVM_HAVE_MSIX
802static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
803 struct kvm_assigned_msix_nr *entry_nr)
804{
805 int r = 0;
806 struct kvm_assigned_dev_kernel *adev;
807
808 mutex_lock(&kvm->lock);
809
810 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
811 entry_nr->assigned_dev_id);
812 if (!adev) {
813 r = -EINVAL;
814 goto msix_nr_out;
815 }
816
817 if (adev->entries_nr == 0) {
818 adev->entries_nr = entry_nr->entry_nr;
819 if (adev->entries_nr == 0 ||
820 adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
821 r = -EINVAL;
822 goto msix_nr_out;
823 }
824
825 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
826 entry_nr->entry_nr,
827 GFP_KERNEL);
828 if (!adev->host_msix_entries) {
829 r = -ENOMEM;
830 goto msix_nr_out;
831 }
832 adev->guest_msix_entries =
833 kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
834 GFP_KERNEL);
835 if (!adev->guest_msix_entries) {
836 kfree(adev->host_msix_entries);
837 r = -ENOMEM;
838 goto msix_nr_out;
839 }
840 } else /* Not allowed to set the MSI-X number twice */
841 r = -EINVAL;
842msix_nr_out:
843 mutex_unlock(&kvm->lock);
844 return r;
845}
846
847static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
848 struct kvm_assigned_msix_entry *entry)
849{
850 int r = 0, i;
851 struct kvm_assigned_dev_kernel *adev;
852
853 mutex_lock(&kvm->lock);
854
855 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
856 entry->assigned_dev_id);
857
858 if (!adev) {
859 r = -EINVAL;
860 goto msix_entry_out;
861 }
862
863 for (i = 0; i < adev->entries_nr; i++)
864 if (adev->guest_msix_entries[i].vector == 0 ||
865 adev->guest_msix_entries[i].entry == entry->entry) {
866 adev->guest_msix_entries[i].entry = entry->entry;
867 adev->guest_msix_entries[i].vector = entry->gsi;
868 adev->host_msix_entries[i].entry = entry->entry;
869 break;
870 }
871 if (i == adev->entries_nr) {
872 r = -ENOSPC;
873 goto msix_entry_out;
874 }
875
876msix_entry_out:
877 mutex_unlock(&kvm->lock);
878
879 return r;
880}
881#endif
882
883static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
884 struct kvm_assigned_pci_dev *assigned_dev)
885{
886 int r = 0;
887 struct kvm_assigned_dev_kernel *match;
888
889 mutex_lock(&kvm->lock);
890
891 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
892 assigned_dev->assigned_dev_id);
893 if (!match) {
894 r = -ENODEV;
895 goto out;
896 }
897
898 spin_lock(&match->intx_mask_lock);
899
900 match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
901 match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
902
903 if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
904 if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
905 kvm_set_irq(match->kvm, match->irq_source_id,
906 match->guest_irq, 0, false);
907 /*
908 * Masking at hardware-level is performed on demand,
909 * i.e. when an IRQ actually arrives at the host.
910 */
911 } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
912 /*
913 * Unmask the IRQ line if required. Unmasking at
914 * device level will be performed by user space.
915 */
916 spin_lock_irq(&match->intx_lock);
917 if (match->host_irq_disabled) {
918 enable_irq(match->host_irq);
919 match->host_irq_disabled = false;
920 }
921 spin_unlock_irq(&match->intx_lock);
922 }
923 }
924
925 spin_unlock(&match->intx_mask_lock);
926
927out:
928 mutex_unlock(&kvm->lock);
929 return r;
930}
931
932long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
933 unsigned long arg)
934{
935 void __user *argp = (void __user *)arg;
936 int r;
937
938 switch (ioctl) {
939 case KVM_ASSIGN_PCI_DEVICE: {
940 struct kvm_assigned_pci_dev assigned_dev;
941
942 r = -EFAULT;
943 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
944 goto out;
945 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
946 if (r)
947 goto out;
948 break;
949 }
950 case KVM_ASSIGN_IRQ: {
951 r = -EOPNOTSUPP;
952 break;
953 }
954 case KVM_ASSIGN_DEV_IRQ: {
955 struct kvm_assigned_irq assigned_irq;
956
957 r = -EFAULT;
958 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
959 goto out;
960 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
961 if (r)
962 goto out;
963 break;
964 }
965 case KVM_DEASSIGN_DEV_IRQ: {
966 struct kvm_assigned_irq assigned_irq;
967
968 r = -EFAULT;
969 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
970 goto out;
971 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
972 if (r)
973 goto out;
974 break;
975 }
976 case KVM_DEASSIGN_PCI_DEVICE: {
977 struct kvm_assigned_pci_dev assigned_dev;
978
979 r = -EFAULT;
980 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
981 goto out;
982 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
983 if (r)
984 goto out;
985 break;
986 }
987#ifdef __KVM_HAVE_MSIX
988 case KVM_ASSIGN_SET_MSIX_NR: {
989 struct kvm_assigned_msix_nr entry_nr;
990 r = -EFAULT;
991 if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
992 goto out;
993 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
994 if (r)
995 goto out;
996 break;
997 }
998 case KVM_ASSIGN_SET_MSIX_ENTRY: {
999 struct kvm_assigned_msix_entry entry;
1000 r = -EFAULT;
1001 if (copy_from_user(&entry, argp, sizeof entry))
1002 goto out;
1003 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
1004 if (r)
1005 goto out;
1006 break;
1007 }
1008#endif
1009 case KVM_ASSIGN_SET_INTX_MASK: {
1010 struct kvm_assigned_pci_dev assigned_dev;
1011
1012 r = -EFAULT;
1013 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1014 goto out;
1015 r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
1016 break;
1017 }
1018 default:
1019 r = -ENOTTY;
1020 break;
1021 }
1022out:
1023 return r;
1024}
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d6a3d0993d88..44660aee335f 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
80 80
81 might_sleep(); 81 might_sleep();
82 82
83 down_read(&mm->mmap_sem); 83 get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
84 get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
85 up_read(&mm->mmap_sem);
86 kvm_async_page_present_sync(vcpu, apf); 84 kvm_async_page_present_sync(vcpu, apf);
87 85
88 spin_lock(&vcpu->async_pf.lock); 86 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 00d86427af0f..571c1ce37d15 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -8,7 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include "iodev.h" 11#include <kvm/iodev.h>
12 12
13#include <linux/kvm_host.h> 13#include <linux/kvm_host.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
@@ -60,8 +60,9 @@ static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
60 return 1; 60 return 1;
61} 61}
62 62
63static int coalesced_mmio_write(struct kvm_io_device *this, 63static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
64 gpa_t addr, int len, const void *val) 64 struct kvm_io_device *this, gpa_t addr,
65 int len, const void *val)
65{ 66{
66 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 67 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
67 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; 68 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 20c3af7692c5..9ff4193dfa49 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -33,10 +33,12 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/srcu.h> 34#include <linux/srcu.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/seqlock.h>
37#include <trace/events/kvm.h>
36 38
37#include "iodev.h" 39#include <kvm/iodev.h>
38 40
39#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 41#ifdef CONFIG_HAVE_KVM_IRQFD
40/* 42/*
41 * -------------------------------------------------------------------- 43 * --------------------------------------------------------------------
42 * irqfd: Allows an fd to be used to inject an interrupt to the guest 44 * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -75,7 +77,8 @@ struct _irqfd {
75 struct kvm *kvm; 77 struct kvm *kvm;
76 wait_queue_t wait; 78 wait_queue_t wait;
77 /* Update side is protected by irqfds.lock */ 79 /* Update side is protected by irqfds.lock */
78 struct kvm_kernel_irq_routing_entry __rcu *irq_entry; 80 struct kvm_kernel_irq_routing_entry irq_entry;
81 seqcount_t irq_entry_sc;
79 /* Used for level IRQ fast-path */ 82 /* Used for level IRQ fast-path */
80 int gsi; 83 int gsi;
81 struct work_struct inject; 84 struct work_struct inject;
@@ -223,16 +226,20 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
223{ 226{
224 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); 227 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
225 unsigned long flags = (unsigned long)key; 228 unsigned long flags = (unsigned long)key;
226 struct kvm_kernel_irq_routing_entry *irq; 229 struct kvm_kernel_irq_routing_entry irq;
227 struct kvm *kvm = irqfd->kvm; 230 struct kvm *kvm = irqfd->kvm;
231 unsigned seq;
228 int idx; 232 int idx;
229 233
230 if (flags & POLLIN) { 234 if (flags & POLLIN) {
231 idx = srcu_read_lock(&kvm->irq_srcu); 235 idx = srcu_read_lock(&kvm->irq_srcu);
232 irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu); 236 do {
237 seq = read_seqcount_begin(&irqfd->irq_entry_sc);
238 irq = irqfd->irq_entry;
239 } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
233 /* An event has been signaled, inject an interrupt */ 240 /* An event has been signaled, inject an interrupt */
234 if (irq) 241 if (irq.type == KVM_IRQ_ROUTING_MSI)
235 kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, 242 kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
236 false); 243 false);
237 else 244 else
238 schedule_work(&irqfd->inject); 245 schedule_work(&irqfd->inject);
@@ -272,34 +279,40 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
272} 279}
273 280
274/* Must be called under irqfds.lock */ 281/* Must be called under irqfds.lock */
275static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, 282static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
276 struct kvm_irq_routing_table *irq_rt)
277{ 283{
278 struct kvm_kernel_irq_routing_entry *e; 284 struct kvm_kernel_irq_routing_entry *e;
285 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
286 int i, n_entries;
279 287
280 if (irqfd->gsi >= irq_rt->nr_rt_entries) { 288 n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
281 rcu_assign_pointer(irqfd->irq_entry, NULL); 289
282 return; 290 write_seqcount_begin(&irqfd->irq_entry_sc);
283 } 291
292 irqfd->irq_entry.type = 0;
284 293
285 hlist_for_each_entry(e, &irq_rt->map[irqfd->gsi], link) { 294 e = entries;
295 for (i = 0; i < n_entries; ++i, ++e) {
286 /* Only fast-path MSI. */ 296 /* Only fast-path MSI. */
287 if (e->type == KVM_IRQ_ROUTING_MSI) 297 if (e->type == KVM_IRQ_ROUTING_MSI)
288 rcu_assign_pointer(irqfd->irq_entry, e); 298 irqfd->irq_entry = *e;
289 else
290 rcu_assign_pointer(irqfd->irq_entry, NULL);
291 } 299 }
300
301 write_seqcount_end(&irqfd->irq_entry_sc);
292} 302}
293 303
294static int 304static int
295kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) 305kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
296{ 306{
297 struct kvm_irq_routing_table *irq_rt;
298 struct _irqfd *irqfd, *tmp; 307 struct _irqfd *irqfd, *tmp;
299 struct fd f; 308 struct fd f;
300 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; 309 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
301 int ret; 310 int ret;
302 unsigned int events; 311 unsigned int events;
312 int idx;
313
314 if (!kvm_arch_intc_initialized(kvm))
315 return -EAGAIN;
303 316
304 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); 317 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
305 if (!irqfd) 318 if (!irqfd)
@@ -310,6 +323,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
310 INIT_LIST_HEAD(&irqfd->list); 323 INIT_LIST_HEAD(&irqfd->list);
311 INIT_WORK(&irqfd->inject, irqfd_inject); 324 INIT_WORK(&irqfd->inject, irqfd_inject);
312 INIT_WORK(&irqfd->shutdown, irqfd_shutdown); 325 INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
326 seqcount_init(&irqfd->irq_entry_sc);
313 327
314 f = fdget(args->fd); 328 f = fdget(args->fd);
315 if (!f.file) { 329 if (!f.file) {
@@ -392,9 +406,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
392 goto fail; 406 goto fail;
393 } 407 }
394 408
395 irq_rt = rcu_dereference_protected(kvm->irq_routing, 409 idx = srcu_read_lock(&kvm->irq_srcu);
396 lockdep_is_held(&kvm->irqfds.lock)); 410 irqfd_update(kvm, irqfd);
397 irqfd_update(kvm, irqfd, irq_rt); 411 srcu_read_unlock(&kvm->irq_srcu, idx);
398 412
399 list_add_tail(&irqfd->list, &kvm->irqfds.items); 413 list_add_tail(&irqfd->list, &kvm->irqfds.items);
400 414
@@ -433,12 +447,69 @@ out:
433 kfree(irqfd); 447 kfree(irqfd);
434 return ret; 448 return ret;
435} 449}
450
451bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
452{
453 struct kvm_irq_ack_notifier *kian;
454 int gsi, idx;
455
456 idx = srcu_read_lock(&kvm->irq_srcu);
457 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
458 if (gsi != -1)
459 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
460 link)
461 if (kian->gsi == gsi) {
462 srcu_read_unlock(&kvm->irq_srcu, idx);
463 return true;
464 }
465
466 srcu_read_unlock(&kvm->irq_srcu, idx);
467
468 return false;
469}
470EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
471
472void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
473{
474 struct kvm_irq_ack_notifier *kian;
475 int gsi, idx;
476
477 trace_kvm_ack_irq(irqchip, pin);
478
479 idx = srcu_read_lock(&kvm->irq_srcu);
480 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
481 if (gsi != -1)
482 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
483 link)
484 if (kian->gsi == gsi)
485 kian->irq_acked(kian);
486 srcu_read_unlock(&kvm->irq_srcu, idx);
487}
488
489void kvm_register_irq_ack_notifier(struct kvm *kvm,
490 struct kvm_irq_ack_notifier *kian)
491{
492 mutex_lock(&kvm->irq_lock);
493 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
494 mutex_unlock(&kvm->irq_lock);
495 kvm_vcpu_request_scan_ioapic(kvm);
496}
497
498void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
499 struct kvm_irq_ack_notifier *kian)
500{
501 mutex_lock(&kvm->irq_lock);
502 hlist_del_init_rcu(&kian->link);
503 mutex_unlock(&kvm->irq_lock);
504 synchronize_srcu(&kvm->irq_srcu);
505 kvm_vcpu_request_scan_ioapic(kvm);
506}
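For reference, the consumer side of these notifiers follows the pattern used by the assigned-device code removed earlier in this series: fill in gsi and irq_acked, then register, and the callback runs when the guest EOIs an interrupt routed to that GSI. A hypothetical minimal user, with illustrative names:

        #include <linux/kvm_host.h>

        struct my_source {
                struct kvm_irq_ack_notifier ack;
                /* ... device state ... */
        };

        static void my_source_acked(struct kvm_irq_ack_notifier *kian)
        {
                struct my_source *s = container_of(kian, struct my_source, ack);

                /* re-arm or re-assert the interrupt source here */
                (void)s;
        }

        static void my_source_track(struct kvm *kvm, struct my_source *s, int gsi)
        {
                s->ack.gsi = gsi;
                s->ack.irq_acked = my_source_acked;
                kvm_register_irq_ack_notifier(kvm, &s->ack);
        }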
436#endif 507#endif
437 508
438void 509void
439kvm_eventfd_init(struct kvm *kvm) 510kvm_eventfd_init(struct kvm *kvm)
440{ 511{
441#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 512#ifdef CONFIG_HAVE_KVM_IRQFD
442 spin_lock_init(&kvm->irqfds.lock); 513 spin_lock_init(&kvm->irqfds.lock);
443 INIT_LIST_HEAD(&kvm->irqfds.items); 514 INIT_LIST_HEAD(&kvm->irqfds.items);
444 INIT_LIST_HEAD(&kvm->irqfds.resampler_list); 515 INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
@@ -447,7 +518,7 @@ kvm_eventfd_init(struct kvm *kvm)
447 INIT_LIST_HEAD(&kvm->ioeventfds); 518 INIT_LIST_HEAD(&kvm->ioeventfds);
448} 519}
449 520
450#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 521#ifdef CONFIG_HAVE_KVM_IRQFD
451/* 522/*
452 * shutdown any irqfd's that match fd+gsi 523 * shutdown any irqfd's that match fd+gsi
453 */ 524 */
@@ -466,14 +537,14 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
466 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 537 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
467 if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { 538 if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
468 /* 539 /*
469 * This rcu_assign_pointer is needed for when 540 * This clearing of irq_entry.type is needed for when
470 * another thread calls kvm_irq_routing_update before 541 * another thread calls kvm_irq_routing_update before
471 * we flush workqueue below (we synchronize with 542 * we flush workqueue below (we synchronize with
472 * kvm_irq_routing_update using irqfds.lock). 543 * kvm_irq_routing_update using irqfds.lock).
473 * It is paired with synchronize_srcu done by caller
474 * of that function.
475 */ 544 */
476 rcu_assign_pointer(irqfd->irq_entry, NULL); 545 write_seqcount_begin(&irqfd->irq_entry_sc);
546 irqfd->irq_entry.type = 0;
547 write_seqcount_end(&irqfd->irq_entry_sc);
477 irqfd_deactivate(irqfd); 548 irqfd_deactivate(irqfd);
478 } 549 }
479 } 550 }
@@ -528,20 +599,17 @@ kvm_irqfd_release(struct kvm *kvm)
528} 599}
529 600
530/* 601/*
531 * Change irq_routing and irqfd. 602 * Take note of a change in irq routing.
532 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. 603 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
533 */ 604 */
534void kvm_irq_routing_update(struct kvm *kvm, 605void kvm_irq_routing_update(struct kvm *kvm)
535 struct kvm_irq_routing_table *irq_rt)
536{ 606{
537 struct _irqfd *irqfd; 607 struct _irqfd *irqfd;
538 608
539 spin_lock_irq(&kvm->irqfds.lock); 609 spin_lock_irq(&kvm->irqfds.lock);
540 610
541 rcu_assign_pointer(kvm->irq_routing, irq_rt);
542
543 list_for_each_entry(irqfd, &kvm->irqfds.items, list) 611 list_for_each_entry(irqfd, &kvm->irqfds.items, list)
544 irqfd_update(kvm, irqfd, irq_rt); 612 irqfd_update(kvm, irqfd);
545 613
546 spin_unlock_irq(&kvm->irqfds.lock); 614 spin_unlock_irq(&kvm->irqfds.lock);
547} 615}
@@ -647,8 +715,8 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
647 715
648/* MMIO/PIO writes trigger an event if the addr/val match */ 716/* MMIO/PIO writes trigger an event if the addr/val match */
649static int 717static int
650ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, 718ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
651 const void *val) 719 int len, const void *val)
652{ 720{
653 struct _ioeventfd *p = to_ioeventfd(this); 721 struct _ioeventfd *p = to_ioeventfd(this);
654 722
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
deleted file mode 100644
index 2458a1dc2ba9..000000000000
--- a/virt/kvm/ioapic.c
+++ /dev/null
@@ -1,646 +0,0 @@
1/*
2 * Copyright (C) 2001 MandrakeSoft S.A.
3 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
4 *
5 * MandrakeSoft S.A.
6 * 43, rue d'Aboukir
7 * 75002 Paris - France
8 * http://www.linux-mandrake.com/
9 * http://www.mandrakesoft.com/
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 * Yunhong Jiang <yunhong.jiang@intel.com>
26 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
27 * Based on Xen 3.1 code.
28 */
29
30#include <linux/kvm_host.h>
31#include <linux/kvm.h>
32#include <linux/mm.h>
33#include <linux/highmem.h>
34#include <linux/smp.h>
35#include <linux/hrtimer.h>
36#include <linux/io.h>
37#include <linux/slab.h>
38#include <linux/export.h>
39#include <asm/processor.h>
40#include <asm/page.h>
41#include <asm/current.h>
42#include <trace/events/kvm.h>
43
44#include "ioapic.h"
45#include "lapic.h"
46#include "irq.h"
47
48#if 0
49#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
50#else
51#define ioapic_debug(fmt, arg...)
52#endif
53static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
54 bool line_status);
55
56static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
57 unsigned long addr,
58 unsigned long length)
59{
60 unsigned long result = 0;
61
62 switch (ioapic->ioregsel) {
63 case IOAPIC_REG_VERSION:
64 result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
65 | (IOAPIC_VERSION_ID & 0xff));
66 break;
67
68 case IOAPIC_REG_APIC_ID:
69 case IOAPIC_REG_ARB_ID:
70 result = ((ioapic->id & 0xf) << 24);
71 break;
72
73 default:
74 {
75 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
76 u64 redir_content;
77
78 if (redir_index < IOAPIC_NUM_PINS)
79 redir_content =
80 ioapic->redirtbl[redir_index].bits;
81 else
82 redir_content = ~0ULL;
83
84 result = (ioapic->ioregsel & 0x1) ?
85 (redir_content >> 32) & 0xffffffff :
86 redir_content & 0xffffffff;
87 break;
88 }
89 }
90
91 return result;
92}
93
94static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
95{
96 ioapic->rtc_status.pending_eoi = 0;
97 bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
98}
99
100static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
101
102static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
103{
104 if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
105 kvm_rtc_eoi_tracking_restore_all(ioapic);
106}
107
108static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
109{
110 bool new_val, old_val;
111 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
112 union kvm_ioapic_redirect_entry *e;
113
114 e = &ioapic->redirtbl[RTC_GSI];
115 if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
116 e->fields.dest_mode))
117 return;
118
119 new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
120 old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
121
122 if (new_val == old_val)
123 return;
124
125 if (new_val) {
126 __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
127 ioapic->rtc_status.pending_eoi++;
128 } else {
129 __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
130 ioapic->rtc_status.pending_eoi--;
131 rtc_status_pending_eoi_check_valid(ioapic);
132 }
133}
134
135void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
136{
137 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
138
139 spin_lock(&ioapic->lock);
140 __rtc_irq_eoi_tracking_restore_one(vcpu);
141 spin_unlock(&ioapic->lock);
142}
143
144static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
145{
146 struct kvm_vcpu *vcpu;
147 int i;
148
149 if (RTC_GSI >= IOAPIC_NUM_PINS)
150 return;
151
152 rtc_irq_eoi_tracking_reset(ioapic);
153 kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
154 __rtc_irq_eoi_tracking_restore_one(vcpu);
155}
156
157static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
158{
159 if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
160 --ioapic->rtc_status.pending_eoi;
161 rtc_status_pending_eoi_check_valid(ioapic);
162 }
163}
164
165static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
166{
167 if (ioapic->rtc_status.pending_eoi > 0)
168 return true; /* coalesced */
169
170 return false;
171}
172
173static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
174 int irq_level, bool line_status)
175{
176 union kvm_ioapic_redirect_entry entry;
177 u32 mask = 1 << irq;
178 u32 old_irr;
179 int edge, ret;
180
181 entry = ioapic->redirtbl[irq];
182 edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
183
184 if (!irq_level) {
185 ioapic->irr &= ~mask;
186 ret = 1;
187 goto out;
188 }
189
190 /*
191 * Return 0 for coalesced interrupts; for edge-triggered interrupts,
192 * this only happens if a previous edge has not been delivered due
193 * to masking. For level interrupts, the remote_irr field tells
194 * us if the interrupt is waiting for an EOI.
195 *
196 * RTC is special: it is edge-triggered, but userspace likes to know
197 * if it has been already ack-ed via EOI because coalesced RTC
198 * interrupts lead to time drift in Windows guests. So we track
199 * EOI manually for the RTC interrupt.
200 */
201 if (irq == RTC_GSI && line_status &&
202 rtc_irq_check_coalesced(ioapic)) {
203 ret = 0;
204 goto out;
205 }
206
207 old_irr = ioapic->irr;
208 ioapic->irr |= mask;
209 if ((edge && old_irr == ioapic->irr) ||
210 (!edge && entry.fields.remote_irr)) {
211 ret = 0;
212 goto out;
213 }
214
215 ret = ioapic_service(ioapic, irq, line_status);
216
217out:
218 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
219 return ret;
220}
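
To make the coalescing rule from the comment above concrete, here is a small self-contained sketch of the same decision; the struct and helper names are invented for illustration and are not part of the kernel sources:

#include <stdbool.h>
#include <stdint.h>

struct pin_state {
	bool edge;          /* redirection entry is edge-triggered */
	bool remote_irr;    /* level interrupt still awaiting EOI  */
};

/* Returns true if asserting the pin again would be reported as coalesced. */
static bool would_coalesce(const struct pin_state *pin,
			   uint32_t irr, uint32_t pin_mask)
{
	if (pin->edge)
		return irr & pin_mask;  /* earlier edge not yet delivered (masked) */
	return pin->remote_irr;         /* level IRQ still waiting for its EOI     */
}
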
221
222static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
223{
224 u32 idx;
225
226 rtc_irq_eoi_tracking_reset(ioapic);
227 for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
228 ioapic_set_irq(ioapic, idx, 1, true);
229
230 kvm_rtc_eoi_tracking_restore_all(ioapic);
231}
232
233
234static void update_handled_vectors(struct kvm_ioapic *ioapic)
235{
236 DECLARE_BITMAP(handled_vectors, 256);
237 int i;
238
239 memset(handled_vectors, 0, sizeof(handled_vectors));
240 for (i = 0; i < IOAPIC_NUM_PINS; ++i)
241 __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
242 memcpy(ioapic->handled_vectors, handled_vectors,
243 sizeof(handled_vectors));
244 smp_wmb();
245}
246
247void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
248 u32 *tmr)
249{
250 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
251 union kvm_ioapic_redirect_entry *e;
252 int index;
253
254 spin_lock(&ioapic->lock);
255 for (index = 0; index < IOAPIC_NUM_PINS; index++) {
256 e = &ioapic->redirtbl[index];
257 if (!e->fields.mask &&
258 (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
259 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
260 index) || index == RTC_GSI)) {
261 if (kvm_apic_match_dest(vcpu, NULL, 0,
262 e->fields.dest_id, e->fields.dest_mode)) {
263 __set_bit(e->fields.vector,
264 (unsigned long *)eoi_exit_bitmap);
265 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
266 __set_bit(e->fields.vector,
267 (unsigned long *)tmr);
268 }
269 }
270 }
271 spin_unlock(&ioapic->lock);
272}
273
274#ifdef CONFIG_X86
275void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
276{
277 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
278
279 if (!ioapic)
280 return;
281 kvm_make_scan_ioapic_request(kvm);
282}
283#else
284void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
285{
286 return;
287}
288#endif
289
290static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
291{
292 unsigned index;
293 bool mask_before, mask_after;
294 union kvm_ioapic_redirect_entry *e;
295
296 switch (ioapic->ioregsel) {
297 case IOAPIC_REG_VERSION:
298 /* Writes are ignored. */
299 break;
300
301 case IOAPIC_REG_APIC_ID:
302 ioapic->id = (val >> 24) & 0xf;
303 break;
304
305 case IOAPIC_REG_ARB_ID:
306 break;
307
308 default:
309 index = (ioapic->ioregsel - 0x10) >> 1;
310
311 ioapic_debug("change redir index %x val %x\n", index, val);
312 if (index >= IOAPIC_NUM_PINS)
313 return;
314 e = &ioapic->redirtbl[index];
315 mask_before = e->fields.mask;
316 if (ioapic->ioregsel & 1) {
317 e->bits &= 0xffffffff;
318 e->bits |= (u64) val << 32;
319 } else {
320 e->bits &= ~0xffffffffULL;
321 e->bits |= (u32) val;
322 e->fields.remote_irr = 0;
323 }
324 update_handled_vectors(ioapic);
325 mask_after = e->fields.mask;
326 if (mask_before != mask_after)
327 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
328 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
329 && ioapic->irr & (1 << index))
330 ioapic_service(ioapic, index, false);
331 kvm_vcpu_request_scan_ioapic(ioapic->kvm);
332 break;
333 }
334}
335
336static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
337{
338 union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
339 struct kvm_lapic_irq irqe;
340 int ret;
341
342 if (entry->fields.mask)
343 return -1;
344
345 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
346 "vector=%x trig_mode=%x\n",
347 entry->fields.dest_id, entry->fields.dest_mode,
348 entry->fields.delivery_mode, entry->fields.vector,
349 entry->fields.trig_mode);
350
351 irqe.dest_id = entry->fields.dest_id;
352 irqe.vector = entry->fields.vector;
353 irqe.dest_mode = entry->fields.dest_mode;
354 irqe.trig_mode = entry->fields.trig_mode;
355 irqe.delivery_mode = entry->fields.delivery_mode << 8;
356 irqe.level = 1;
357 irqe.shorthand = 0;
358
359 if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
360 ioapic->irr &= ~(1 << irq);
361
362 if (irq == RTC_GSI && line_status) {
363 /*
364 * pending_eoi cannot ever become negative (see
365 * rtc_status_pending_eoi_check_valid) and the caller
 366 * ensures that it is only called if it is >= zero (namely,
 367 * if rtc_irq_check_coalesced returns false).
368 */
369 BUG_ON(ioapic->rtc_status.pending_eoi != 0);
370 ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
371 ioapic->rtc_status.dest_map);
372 ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
373 } else
374 ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
375
376 if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
377 entry->fields.remote_irr = 1;
378
379 return ret;
380}
381
382int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
383 int level, bool line_status)
384{
385 int ret, irq_level;
386
387 BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
388
389 spin_lock(&ioapic->lock);
390 irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
391 irq_source_id, level);
392 ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
393
394 spin_unlock(&ioapic->lock);
395
396 return ret;
397}
398
399void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
400{
401 int i;
402
403 spin_lock(&ioapic->lock);
404 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
405 __clear_bit(irq_source_id, &ioapic->irq_states[i]);
406 spin_unlock(&ioapic->lock);
407}
408
409static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
410 struct kvm_ioapic *ioapic, int vector, int trigger_mode)
411{
412 int i;
413
414 for (i = 0; i < IOAPIC_NUM_PINS; i++) {
415 union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
416
417 if (ent->fields.vector != vector)
418 continue;
419
420 if (i == RTC_GSI)
421 rtc_irq_eoi(ioapic, vcpu);
422 /*
 423 * We drop the lock while calling the ack notifiers because the ack
 424 * notifier callbacks for assigned devices call back into the IOAPIC
 425 * recursively. Since remote_irr is cleared only after the call to
 426 * the notifiers, if the same vector is delivered while the lock is
 427 * dropped it will be put into irr and will be delivered once the
 428 * ack notifier returns.
429 */
430 spin_unlock(&ioapic->lock);
431 kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
432 spin_lock(&ioapic->lock);
433
434 if (trigger_mode != IOAPIC_LEVEL_TRIG)
435 continue;
436
437 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
438 ent->fields.remote_irr = 0;
439 if (ioapic->irr & (1 << i))
440 ioapic_service(ioapic, i, false);
441 }
442}
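
The unlock/notify/relock sequence above is the classic shape for invoking a callback that may re-enter the object calling it. A user-space sketch of the same pattern, using pthreads purely for illustration (none of these names exist in KVM):

#include <pthread.h>

struct fake_ioapic {
	pthread_mutex_t lock;
	void (*ack_notifier)(struct fake_ioapic *chip, int pin);
};

static void fake_update_eoi(struct fake_ioapic *chip, int pin)
{
	pthread_mutex_lock(&chip->lock);
	/* ... per-pin bookkeeping done under the lock ... */
	pthread_mutex_unlock(&chip->lock);   /* the notifier may call back in */
	chip->ack_notifier(chip, pin);
	pthread_mutex_lock(&chip->lock);
	/* ... re-check state that may have changed while unlocked ... */
	pthread_mutex_unlock(&chip->lock);
}
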
443
444bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
445{
446 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
447 smp_rmb();
448 return test_bit(vector, ioapic->handled_vectors);
449}
450
451void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
452{
453 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
454
455 spin_lock(&ioapic->lock);
456 __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
457 spin_unlock(&ioapic->lock);
458}
459
460static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
461{
462 return container_of(dev, struct kvm_ioapic, dev);
463}
464
465static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
466{
467 return ((addr >= ioapic->base_address &&
468 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
469}
470
471static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
472 void *val)
473{
474 struct kvm_ioapic *ioapic = to_ioapic(this);
475 u32 result;
476 if (!ioapic_in_range(ioapic, addr))
477 return -EOPNOTSUPP;
478
479 ioapic_debug("addr %lx\n", (unsigned long)addr);
480 ASSERT(!(addr & 0xf)); /* check alignment */
481
482 addr &= 0xff;
483 spin_lock(&ioapic->lock);
484 switch (addr) {
485 case IOAPIC_REG_SELECT:
486 result = ioapic->ioregsel;
487 break;
488
489 case IOAPIC_REG_WINDOW:
490 result = ioapic_read_indirect(ioapic, addr, len);
491 break;
492
493 default:
494 result = 0;
495 break;
496 }
497 spin_unlock(&ioapic->lock);
498
499 switch (len) {
500 case 8:
501 *(u64 *) val = result;
502 break;
503 case 1:
504 case 2:
505 case 4:
506 memcpy(val, (char *)&result, len);
507 break;
508 default:
509 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
510 }
511 return 0;
512}
513
514static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
515 const void *val)
516{
517 struct kvm_ioapic *ioapic = to_ioapic(this);
518 u32 data;
519 if (!ioapic_in_range(ioapic, addr))
520 return -EOPNOTSUPP;
521
522 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
523 (void*)addr, len, val);
524 ASSERT(!(addr & 0xf)); /* check alignment */
525
526 switch (len) {
527 case 8:
528 case 4:
529 data = *(u32 *) val;
530 break;
531 case 2:
532 data = *(u16 *) val;
533 break;
534 case 1:
535 data = *(u8 *) val;
536 break;
537 default:
538 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
539 return 0;
540 }
541
542 addr &= 0xff;
543 spin_lock(&ioapic->lock);
544 switch (addr) {
545 case IOAPIC_REG_SELECT:
546 ioapic->ioregsel = data & 0xFF; /* 8-bit register */
547 break;
548
549 case IOAPIC_REG_WINDOW:
550 ioapic_write_indirect(ioapic, data);
551 break;
552#ifdef CONFIG_IA64
553 case IOAPIC_REG_EOI:
554 __kvm_ioapic_update_eoi(NULL, ioapic, data, IOAPIC_LEVEL_TRIG);
555 break;
556#endif
557
558 default:
559 break;
560 }
561 spin_unlock(&ioapic->lock);
562 return 0;
563}
564
565static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
566{
567 int i;
568
569 for (i = 0; i < IOAPIC_NUM_PINS; i++)
570 ioapic->redirtbl[i].fields.mask = 1;
571 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
572 ioapic->ioregsel = 0;
573 ioapic->irr = 0;
574 ioapic->id = 0;
575 rtc_irq_eoi_tracking_reset(ioapic);
576 update_handled_vectors(ioapic);
577}
578
579static const struct kvm_io_device_ops ioapic_mmio_ops = {
580 .read = ioapic_mmio_read,
581 .write = ioapic_mmio_write,
582};
583
584int kvm_ioapic_init(struct kvm *kvm)
585{
586 struct kvm_ioapic *ioapic;
587 int ret;
588
589 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
590 if (!ioapic)
591 return -ENOMEM;
592 spin_lock_init(&ioapic->lock);
593 kvm->arch.vioapic = ioapic;
594 kvm_ioapic_reset(ioapic);
595 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
596 ioapic->kvm = kvm;
597 mutex_lock(&kvm->slots_lock);
598 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
599 IOAPIC_MEM_LENGTH, &ioapic->dev);
600 mutex_unlock(&kvm->slots_lock);
601 if (ret < 0) {
602 kvm->arch.vioapic = NULL;
603 kfree(ioapic);
604 }
605
606 return ret;
607}
608
609void kvm_ioapic_destroy(struct kvm *kvm)
610{
611 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
612
613 if (ioapic) {
614 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
615 kvm->arch.vioapic = NULL;
616 kfree(ioapic);
617 }
618}
619
620int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
621{
622 struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
623 if (!ioapic)
624 return -EINVAL;
625
626 spin_lock(&ioapic->lock);
627 memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
628 spin_unlock(&ioapic->lock);
629 return 0;
630}
631
632int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
633{
634 struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
635 if (!ioapic)
636 return -EINVAL;
637
638 spin_lock(&ioapic->lock);
639 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
640 ioapic->irr = 0;
641 update_handled_vectors(ioapic);
642 kvm_vcpu_request_scan_ioapic(kvm);
643 kvm_ioapic_inject_all(ioapic, state->irr);
644 spin_unlock(&ioapic->lock);
645 return 0;
646}
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
deleted file mode 100644
index 90d43e95dcf8..000000000000
--- a/virt/kvm/ioapic.h
+++ /dev/null
@@ -1,102 +0,0 @@
1#ifndef __KVM_IO_APIC_H
2#define __KVM_IO_APIC_H
3
4#include <linux/kvm_host.h>
5
6#include "iodev.h"
7
8struct kvm;
9struct kvm_vcpu;
10
11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
12#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
13#define IOAPIC_EDGE_TRIG 0
14#define IOAPIC_LEVEL_TRIG 1
15
16#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
17#define IOAPIC_MEM_LENGTH 0x100
18
19/* Direct registers. */
20#define IOAPIC_REG_SELECT 0x00
21#define IOAPIC_REG_WINDOW 0x10
22#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
23
24/* Indirect registers. */
25#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
26#define IOAPIC_REG_VERSION 0x01
27#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
28
29/*ioapic delivery mode*/
30#define IOAPIC_FIXED 0x0
31#define IOAPIC_LOWEST_PRIORITY 0x1
32#define IOAPIC_PMI 0x2
33#define IOAPIC_NMI 0x4
34#define IOAPIC_INIT 0x5
35#define IOAPIC_EXTINT 0x7
36
37#ifdef CONFIG_X86
38#define RTC_GSI 8
39#else
40#define RTC_GSI -1U
41#endif
42
43struct rtc_status {
44 int pending_eoi;
45 DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
46};
47
48struct kvm_ioapic {
49 u64 base_address;
50 u32 ioregsel;
51 u32 id;
52 u32 irr;
53 u32 pad;
54 union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
55 unsigned long irq_states[IOAPIC_NUM_PINS];
56 struct kvm_io_device dev;
57 struct kvm *kvm;
58 void (*ack_notifier)(void *opaque, int irq);
59 spinlock_t lock;
60 DECLARE_BITMAP(handled_vectors, 256);
61 struct rtc_status rtc_status;
62};
63
64#ifdef DEBUG
65#define ASSERT(x) \
66do { \
67 if (!(x)) { \
68 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
69 __FILE__, __LINE__, #x); \
70 BUG(); \
71 } \
72} while (0)
73#else
74#define ASSERT(x) do { } while (0)
75#endif
76
77static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
78{
79 return kvm->arch.vioapic;
80}
81
82void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
83int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
84 int short_hand, int dest, int dest_mode);
85int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
86void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
87 int trigger_mode);
88bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
89int kvm_ioapic_init(struct kvm *kvm);
90void kvm_ioapic_destroy(struct kvm *kvm);
91int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
92 int level, bool line_status);
93void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
94int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
95 struct kvm_lapic_irq *irq, unsigned long *dest_map);
96int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
97int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
98void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
99void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
100 u32 *tmr);
101
102#endif
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
deleted file mode 100644
index 12fd3caffd2b..000000000000
--- a/virt/kvm/iodev.h
+++ /dev/null
@@ -1,70 +0,0 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 */
15
16#ifndef __KVM_IODEV_H__
17#define __KVM_IODEV_H__
18
19#include <linux/kvm_types.h>
20#include <asm/errno.h>
21
22struct kvm_io_device;
23
24/**
25 * kvm_io_device_ops are called under kvm slots_lock.
26 * read and write handlers return 0 if the transaction has been handled,
27 * or non-zero to have it passed to the next device.
28 **/
29struct kvm_io_device_ops {
30 int (*read)(struct kvm_io_device *this,
31 gpa_t addr,
32 int len,
33 void *val);
34 int (*write)(struct kvm_io_device *this,
35 gpa_t addr,
36 int len,
37 const void *val);
38 void (*destructor)(struct kvm_io_device *this);
39};
40
41
42struct kvm_io_device {
43 const struct kvm_io_device_ops *ops;
44};
45
46static inline void kvm_iodevice_init(struct kvm_io_device *dev,
47 const struct kvm_io_device_ops *ops)
48{
49 dev->ops = ops;
50}
51
52static inline int kvm_iodevice_read(struct kvm_io_device *dev,
53 gpa_t addr, int l, void *v)
54{
55 return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
56}
57
58static inline int kvm_iodevice_write(struct kvm_io_device *dev,
59 gpa_t addr, int l, const void *v)
60{
61 return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
62}
63
64static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
65{
66 if (dev->ops->destructor)
67 dev->ops->destructor(dev);
68}
69
70#endif /* __KVM_IODEV_H__ */
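
The kvm_io_device_ops contract documented in the header above (return 0 when the transaction was handled, non-zero such as -EOPNOTSUPP to let the bus try the next device) is easiest to see with a trivial device. A hedged sketch that reuses only the types declared in this header; the device itself, its name, and its address window are invented for illustration and assume the usual kernel headers (<linux/kvm_host.h>, <linux/string.h>) are included:

/* A "null" MMIO device: reads as zero, swallows writes, claims a fixed window. */
#define NULL_DEV_BASE   0xd0000000ULL   /* illustrative address, not from KVM */
#define NULL_DEV_LEN    0x100

static int null_dev_read(struct kvm_io_device *this, gpa_t addr, int len,
			 void *val)
{
	if (addr < NULL_DEV_BASE || addr >= NULL_DEV_BASE + NULL_DEV_LEN)
		return -EOPNOTSUPP;     /* not ours: let the bus keep looking */
	memset(val, 0, len);
	return 0;                       /* handled */
}

static int null_dev_write(struct kvm_io_device *this, gpa_t addr, int len,
			  const void *val)
{
	if (addr < NULL_DEV_BASE || addr >= NULL_DEV_BASE + NULL_DEV_LEN)
		return -EOPNOTSUPP;
	return 0;                       /* writes are silently accepted */
}

static const struct kvm_io_device_ops null_dev_ops = {
	.read  = null_dev_read,
	.write = null_dev_write,
};
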
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
deleted file mode 100644
index 0df7d4b34dfe..000000000000
--- a/virt/kvm/iommu.c
+++ /dev/null
@@ -1,359 +0,0 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Copyright IBM Corporation, 2008
19 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
20 *
21 * Author: Allen M. Kay <allen.m.kay@intel.com>
22 * Author: Weidong Han <weidong.han@intel.com>
23 * Author: Ben-Ami Yassour <benami@il.ibm.com>
24 */
25
26#include <linux/list.h>
27#include <linux/kvm_host.h>
28#include <linux/module.h>
29#include <linux/pci.h>
30#include <linux/stat.h>
31#include <linux/dmar.h>
32#include <linux/iommu.h>
33#include <linux/intel-iommu.h>
34
35static bool allow_unsafe_assigned_interrupts;
36module_param_named(allow_unsafe_assigned_interrupts,
37 allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
38MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
39 "Enable device assignment on platforms without interrupt remapping support.");
40
41static int kvm_iommu_unmap_memslots(struct kvm *kvm);
42static void kvm_iommu_put_pages(struct kvm *kvm,
43 gfn_t base_gfn, unsigned long npages);
44
45static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
46 unsigned long size)
47{
48 gfn_t end_gfn;
49 pfn_t pfn;
50
51 pfn = gfn_to_pfn_memslot(slot, gfn);
52 end_gfn = gfn + (size >> PAGE_SHIFT);
53 gfn += 1;
54
55 if (is_error_noslot_pfn(pfn))
56 return pfn;
57
58 while (gfn < end_gfn)
59 gfn_to_pfn_memslot(slot, gfn++);
60
61 return pfn;
62}
63
64int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
65{
66 gfn_t gfn, end_gfn;
67 pfn_t pfn;
68 int r = 0;
69 struct iommu_domain *domain = kvm->arch.iommu_domain;
70 int flags;
71
72 /* check if iommu exists and in use */
73 if (!domain)
74 return 0;
75
76 gfn = slot->base_gfn;
77 end_gfn = gfn + slot->npages;
78
79 flags = IOMMU_READ;
80 if (!(slot->flags & KVM_MEM_READONLY))
81 flags |= IOMMU_WRITE;
82 if (!kvm->arch.iommu_noncoherent)
83 flags |= IOMMU_CACHE;
84
85
86 while (gfn < end_gfn) {
87 unsigned long page_size;
88
89 /* Check if already mapped */
90 if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
91 gfn += 1;
92 continue;
93 }
94
95 /* Get the page size we could use to map */
96 page_size = kvm_host_page_size(kvm, gfn);
97
98 /* Make sure the page_size does not exceed the memslot */
99 while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
100 page_size >>= 1;
101
102 /* Make sure gfn is aligned to the page size we want to map */
103 while ((gfn << PAGE_SHIFT) & (page_size - 1))
104 page_size >>= 1;
105
106 /* Make sure hva is aligned to the page size we want to map */
107 while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
108 page_size >>= 1;
109
110 /*
111 * Pin all pages we are about to map in memory. This is
112 * important because we unmap and unpin in 4kb steps later.
113 */
114 pfn = kvm_pin_pages(slot, gfn, page_size);
115 if (is_error_noslot_pfn(pfn)) {
116 gfn += 1;
117 continue;
118 }
119
120 /* Map into IO address space */
121 r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
122 page_size, flags);
123 if (r) {
124 printk(KERN_ERR "kvm_iommu_map_address:"
125 "iommu failed to map pfn=%llx\n", pfn);
126 goto unmap_pages;
127 }
128
129 gfn += page_size >> PAGE_SHIFT;
130
131
132 }
133
134 return 0;
135
136unmap_pages:
137 kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
138 return r;
139}
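
The three while-loops above shrink the candidate page size until the mapping neither crosses the end of the memslot nor misaligns the guest-physical or host-virtual address. The same clamping, pulled out into a standalone helper as a sketch (the names and the fixed shift are illustrative):

#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12

static uint64_t clamp_map_size(uint64_t host_page_size, uint64_t gfn,
			       uint64_t end_gfn, uint64_t hva)
{
	uint64_t size = host_page_size;

	/* Do not map past the end of the memslot. */
	while (gfn + (size >> SKETCH_PAGE_SHIFT) > end_gfn)
		size >>= 1;
	/* Guest-physical address must be size-aligned. */
	while ((gfn << SKETCH_PAGE_SHIFT) & (size - 1))
		size >>= 1;
	/* Host-virtual address must be size-aligned, too. */
	while (hva & (size - 1))
		size >>= 1;

	return size;
}
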
140
141static int kvm_iommu_map_memslots(struct kvm *kvm)
142{
143 int idx, r = 0;
144 struct kvm_memslots *slots;
145 struct kvm_memory_slot *memslot;
146
147 if (kvm->arch.iommu_noncoherent)
148 kvm_arch_register_noncoherent_dma(kvm);
149
150 idx = srcu_read_lock(&kvm->srcu);
151 slots = kvm_memslots(kvm);
152
153 kvm_for_each_memslot(memslot, slots) {
154 r = kvm_iommu_map_pages(kvm, memslot);
155 if (r)
156 break;
157 }
158 srcu_read_unlock(&kvm->srcu, idx);
159
160 return r;
161}
162
163int kvm_assign_device(struct kvm *kvm,
164 struct kvm_assigned_dev_kernel *assigned_dev)
165{
166 struct pci_dev *pdev = NULL;
167 struct iommu_domain *domain = kvm->arch.iommu_domain;
168 int r;
169 bool noncoherent;
170
171 /* check if iommu exists and in use */
172 if (!domain)
173 return 0;
174
175 pdev = assigned_dev->dev;
176 if (pdev == NULL)
177 return -ENODEV;
178
179 r = iommu_attach_device(domain, &pdev->dev);
180 if (r) {
181 dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
182 return r;
183 }
184
185 noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain,
186 IOMMU_CAP_CACHE_COHERENCY);
187
188 /* Check if need to update IOMMU page table for guest memory */
189 if (noncoherent != kvm->arch.iommu_noncoherent) {
190 kvm_iommu_unmap_memslots(kvm);
191 kvm->arch.iommu_noncoherent = noncoherent;
192 r = kvm_iommu_map_memslots(kvm);
193 if (r)
194 goto out_unmap;
195 }
196
197 pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
198
199 dev_info(&pdev->dev, "kvm assign device\n");
200
201 return 0;
202out_unmap:
203 kvm_iommu_unmap_memslots(kvm);
204 return r;
205}
206
207int kvm_deassign_device(struct kvm *kvm,
208 struct kvm_assigned_dev_kernel *assigned_dev)
209{
210 struct iommu_domain *domain = kvm->arch.iommu_domain;
211 struct pci_dev *pdev = NULL;
212
213 /* check if iommu exists and in use */
214 if (!domain)
215 return 0;
216
217 pdev = assigned_dev->dev;
218 if (pdev == NULL)
219 return -ENODEV;
220
221 iommu_detach_device(domain, &pdev->dev);
222
223 pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
224
225 dev_info(&pdev->dev, "kvm deassign device\n");
226
227 return 0;
228}
229
230int kvm_iommu_map_guest(struct kvm *kvm)
231{
232 int r;
233
234 if (!iommu_present(&pci_bus_type)) {
235 printk(KERN_ERR "%s: iommu not found\n", __func__);
236 return -ENODEV;
237 }
238
239 mutex_lock(&kvm->slots_lock);
240
241 kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
242 if (!kvm->arch.iommu_domain) {
243 r = -ENOMEM;
244 goto out_unlock;
245 }
246
247 if (!allow_unsafe_assigned_interrupts &&
248 !iommu_domain_has_cap(kvm->arch.iommu_domain,
249 IOMMU_CAP_INTR_REMAP)) {
250 printk(KERN_WARNING "%s: No interrupt remapping support,"
251 " disallowing device assignment."
 252 " Re-enable with \"allow_unsafe_assigned_interrupts=1\""
253 " module option.\n", __func__);
254 iommu_domain_free(kvm->arch.iommu_domain);
255 kvm->arch.iommu_domain = NULL;
256 r = -EPERM;
257 goto out_unlock;
258 }
259
260 r = kvm_iommu_map_memslots(kvm);
261 if (r)
262 kvm_iommu_unmap_memslots(kvm);
263
264out_unlock:
265 mutex_unlock(&kvm->slots_lock);
266 return r;
267}
268
269static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
270{
271 unsigned long i;
272
273 for (i = 0; i < npages; ++i)
274 kvm_release_pfn_clean(pfn + i);
275}
276
277static void kvm_iommu_put_pages(struct kvm *kvm,
278 gfn_t base_gfn, unsigned long npages)
279{
280 struct iommu_domain *domain;
281 gfn_t end_gfn, gfn;
282 pfn_t pfn;
283 u64 phys;
284
285 domain = kvm->arch.iommu_domain;
286 end_gfn = base_gfn + npages;
287 gfn = base_gfn;
288
289 /* check if iommu exists and in use */
290 if (!domain)
291 return;
292
293 while (gfn < end_gfn) {
294 unsigned long unmap_pages;
295 size_t size;
296
297 /* Get physical address */
298 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
299
300 if (!phys) {
301 gfn++;
302 continue;
303 }
304
305 pfn = phys >> PAGE_SHIFT;
306
307 /* Unmap address from IO address space */
308 size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
309 unmap_pages = 1ULL << get_order(size);
310
311 /* Unpin all pages we just unmapped to not leak any memory */
312 kvm_unpin_pages(kvm, pfn, unmap_pages);
313
314 gfn += unmap_pages;
315 }
316}
317
318void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
319{
320 kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
321}
322
323static int kvm_iommu_unmap_memslots(struct kvm *kvm)
324{
325 int idx;
326 struct kvm_memslots *slots;
327 struct kvm_memory_slot *memslot;
328
329 idx = srcu_read_lock(&kvm->srcu);
330 slots = kvm_memslots(kvm);
331
332 kvm_for_each_memslot(memslot, slots)
333 kvm_iommu_unmap_pages(kvm, memslot);
334
335 srcu_read_unlock(&kvm->srcu, idx);
336
337 if (kvm->arch.iommu_noncoherent)
338 kvm_arch_unregister_noncoherent_dma(kvm);
339
340 return 0;
341}
342
343int kvm_iommu_unmap_guest(struct kvm *kvm)
344{
345 struct iommu_domain *domain = kvm->arch.iommu_domain;
346
347 /* check if iommu exists and in use */
348 if (!domain)
349 return 0;
350
351 mutex_lock(&kvm->slots_lock);
352 kvm_iommu_unmap_memslots(kvm);
353 kvm->arch.iommu_domain = NULL;
354 kvm->arch.iommu_noncoherent = false;
355 mutex_unlock(&kvm->slots_lock);
356
357 iommu_domain_free(domain);
358 return 0;
359}
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
deleted file mode 100644
index ced4a542a031..000000000000
--- a/virt/kvm/irq_comm.c
+++ /dev/null
@@ -1,373 +0,0 @@
1/*
2 * irq_comm.c: Common API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
21 */
22
23#include <linux/kvm_host.h>
24#include <linux/slab.h>
25#include <linux/export.h>
26#include <trace/events/kvm.h>
27
28#include <asm/msidef.h>
29#ifdef CONFIG_IA64
30#include <asm/iosapic.h>
31#endif
32
33#include "irq.h"
34
35#include "ioapic.h"
36
37static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
38 struct kvm *kvm, int irq_source_id, int level,
39 bool line_status)
40{
41#ifdef CONFIG_X86
42 struct kvm_pic *pic = pic_irqchip(kvm);
43 return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
44#else
45 return -1;
46#endif
47}
48
49static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
50 struct kvm *kvm, int irq_source_id, int level,
51 bool line_status)
52{
53 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
54 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
55 line_status);
56}
57
58static inline bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
59{
60#ifdef CONFIG_IA64
61 return irq->delivery_mode ==
62 (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
63#else
64 return irq->delivery_mode == APIC_DM_LOWEST;
65#endif
66}
67
68int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
69 struct kvm_lapic_irq *irq, unsigned long *dest_map)
70{
71 int i, r = -1;
72 struct kvm_vcpu *vcpu, *lowest = NULL;
73
74 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
75 kvm_is_dm_lowest_prio(irq)) {
76 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
77 irq->delivery_mode = APIC_DM_FIXED;
78 }
79
80 if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
81 return r;
82
83 kvm_for_each_vcpu(i, vcpu, kvm) {
84 if (!kvm_apic_present(vcpu))
85 continue;
86
87 if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
88 irq->dest_id, irq->dest_mode))
89 continue;
90
91 if (!kvm_is_dm_lowest_prio(irq)) {
92 if (r < 0)
93 r = 0;
94 r += kvm_apic_set_irq(vcpu, irq, dest_map);
95 } else if (kvm_lapic_enabled(vcpu)) {
96 if (!lowest)
97 lowest = vcpu;
98 else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
99 lowest = vcpu;
100 }
101 }
102
103 if (lowest)
104 r = kvm_apic_set_irq(lowest, irq, dest_map);
105
106 return r;
107}
108
109static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
110 struct kvm_lapic_irq *irq)
111{
112 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
113
114 irq->dest_id = (e->msi.address_lo &
115 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
116 irq->vector = (e->msi.data &
117 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
118 irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
119 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
120 irq->delivery_mode = e->msi.data & 0x700;
121 irq->level = 1;
122 irq->shorthand = 0;
123 /* TODO Deal with RH bit of MSI message address */
124}
125
126int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
127 struct kvm *kvm, int irq_source_id, int level, bool line_status)
128{
129 struct kvm_lapic_irq irq;
130
131 if (!level)
132 return -1;
133
134 kvm_set_msi_irq(e, &irq);
135
136 return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
137}
138
139
140static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
141 struct kvm *kvm)
142{
143 struct kvm_lapic_irq irq;
144 int r;
145
146 kvm_set_msi_irq(e, &irq);
147
148 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
149 return r;
150 else
151 return -EWOULDBLOCK;
152}
153
154/*
155 * Deliver an IRQ in an atomic context if we can, or return a failure;
156 * the caller can then retry in a process context.
157 * Return value:
158 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
159 * Other values - No need to retry.
160 */
161int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
162{
163 struct kvm_kernel_irq_routing_entry *e;
164 int ret = -EINVAL;
165 struct kvm_irq_routing_table *irq_rt;
166 int idx;
167
168 trace_kvm_set_irq(irq, level, irq_source_id);
169
170 /*
171 * Injection into either PIC or IOAPIC might need to scan all CPUs,
172 * which would need to be retried from thread context; when same GSI
173 * is connected to both PIC and IOAPIC, we'd have to report a
174 * partial failure here.
175 * Since there's no easy way to do this, we only support injecting MSI
176 * which is limited to 1:1 GSI mapping.
177 */
178 idx = srcu_read_lock(&kvm->irq_srcu);
179 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
180 if (irq < irq_rt->nr_rt_entries)
181 hlist_for_each_entry(e, &irq_rt->map[irq], link) {
182 if (likely(e->type == KVM_IRQ_ROUTING_MSI))
183 ret = kvm_set_msi_inatomic(e, kvm);
184 else
185 ret = -EWOULDBLOCK;
186 break;
187 }
188 srcu_read_unlock(&kvm->irq_srcu, idx);
189 return ret;
190}
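
The comment above defines a simple retry contract: -EWOULDBLOCK means the delivery could not be done atomically and should be redone from process context. A caller-side sketch of that contract, using only the two functions shown in this patch; the wrapper itself is invented for illustration and assumes <linux/kvm_host.h>:

/* Illustrative wrapper: try the atomic path first, fall back if it refuses. */
static int deliver_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level)
{
	int r = kvm_set_irq_inatomic(kvm, irq_source_id, gsi, level);

	if (r == -EWOULDBLOCK)
		/* Not deliverable atomically: redo it from process context. */
		r = kvm_set_irq(kvm, irq_source_id, gsi, level, true);
	return r;
}
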
191
192int kvm_request_irq_source_id(struct kvm *kvm)
193{
194 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
195 int irq_source_id;
196
197 mutex_lock(&kvm->irq_lock);
198 irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
199
200 if (irq_source_id >= BITS_PER_LONG) {
201 printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
202 irq_source_id = -EFAULT;
203 goto unlock;
204 }
205
206 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
207#ifdef CONFIG_X86
208 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
209#endif
210 set_bit(irq_source_id, bitmap);
211unlock:
212 mutex_unlock(&kvm->irq_lock);
213
214 return irq_source_id;
215}
216
217void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
218{
219 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
220#ifdef CONFIG_X86
221 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
222#endif
223
224 mutex_lock(&kvm->irq_lock);
225 if (irq_source_id < 0 ||
226 irq_source_id >= BITS_PER_LONG) {
227 printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
228 goto unlock;
229 }
230 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
231 if (!irqchip_in_kernel(kvm))
232 goto unlock;
233
234 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
235#ifdef CONFIG_X86
236 kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
237#endif
238unlock:
239 mutex_unlock(&kvm->irq_lock);
240}
241
242void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
243 struct kvm_irq_mask_notifier *kimn)
244{
245 mutex_lock(&kvm->irq_lock);
246 kimn->irq = irq;
247 hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
248 mutex_unlock(&kvm->irq_lock);
249}
250
251void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
252 struct kvm_irq_mask_notifier *kimn)
253{
254 mutex_lock(&kvm->irq_lock);
255 hlist_del_rcu(&kimn->link);
256 mutex_unlock(&kvm->irq_lock);
257 synchronize_srcu(&kvm->irq_srcu);
258}
259
260void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
261 bool mask)
262{
263 struct kvm_irq_mask_notifier *kimn;
264 int idx, gsi;
265
266 idx = srcu_read_lock(&kvm->irq_srcu);
267 gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
268 if (gsi != -1)
269 hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)
270 if (kimn->irq == gsi)
271 kimn->func(kimn, mask);
272 srcu_read_unlock(&kvm->irq_srcu, idx);
273}
274
275int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
276 struct kvm_kernel_irq_routing_entry *e,
277 const struct kvm_irq_routing_entry *ue)
278{
279 int r = -EINVAL;
280 int delta;
281 unsigned max_pin;
282
283 switch (ue->type) {
284 case KVM_IRQ_ROUTING_IRQCHIP:
285 delta = 0;
286 switch (ue->u.irqchip.irqchip) {
287 case KVM_IRQCHIP_PIC_MASTER:
288 e->set = kvm_set_pic_irq;
289 max_pin = PIC_NUM_PINS;
290 break;
291 case KVM_IRQCHIP_PIC_SLAVE:
292 e->set = kvm_set_pic_irq;
293 max_pin = PIC_NUM_PINS;
294 delta = 8;
295 break;
296 case KVM_IRQCHIP_IOAPIC:
297 max_pin = KVM_IOAPIC_NUM_PINS;
298 e->set = kvm_set_ioapic_irq;
299 break;
300 default:
301 goto out;
302 }
303 e->irqchip.irqchip = ue->u.irqchip.irqchip;
304 e->irqchip.pin = ue->u.irqchip.pin + delta;
305 if (e->irqchip.pin >= max_pin)
306 goto out;
307 rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
308 break;
309 case KVM_IRQ_ROUTING_MSI:
310 e->set = kvm_set_msi;
311 e->msi.address_lo = ue->u.msi.address_lo;
312 e->msi.address_hi = ue->u.msi.address_hi;
313 e->msi.data = ue->u.msi.data;
314 break;
315 default:
316 goto out;
317 }
318
319 r = 0;
320out:
321 return r;
322}
323
324#define IOAPIC_ROUTING_ENTRY(irq) \
325 { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
326 .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
327#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
328
329#ifdef CONFIG_X86
330# define PIC_ROUTING_ENTRY(irq) \
331 { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
332 .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 }
333# define ROUTING_ENTRY2(irq) \
334 IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
335#else
336# define ROUTING_ENTRY2(irq) \
337 IOAPIC_ROUTING_ENTRY(irq)
338#endif
339
340static const struct kvm_irq_routing_entry default_routing[] = {
341 ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
342 ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
343 ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
344 ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
345 ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
346 ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
347 ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
348 ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
349 ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
350 ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
351 ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
352 ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
353#ifdef CONFIG_IA64
354 ROUTING_ENTRY1(24), ROUTING_ENTRY1(25),
355 ROUTING_ENTRY1(26), ROUTING_ENTRY1(27),
356 ROUTING_ENTRY1(28), ROUTING_ENTRY1(29),
357 ROUTING_ENTRY1(30), ROUTING_ENTRY1(31),
358 ROUTING_ENTRY1(32), ROUTING_ENTRY1(33),
359 ROUTING_ENTRY1(34), ROUTING_ENTRY1(35),
360 ROUTING_ENTRY1(36), ROUTING_ENTRY1(37),
361 ROUTING_ENTRY1(38), ROUTING_ENTRY1(39),
362 ROUTING_ENTRY1(40), ROUTING_ENTRY1(41),
363 ROUTING_ENTRY1(42), ROUTING_ENTRY1(43),
364 ROUTING_ENTRY1(44), ROUTING_ENTRY1(45),
365 ROUTING_ENTRY1(46), ROUTING_ENTRY1(47),
366#endif
367};
368
369int kvm_setup_default_irq_routing(struct kvm *kvm)
370{
371 return kvm_set_irq_routing(kvm, default_routing,
372 ARRAY_SIZE(default_routing), 0);
373}
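
For GSIs 0-15, ROUTING_ENTRY2() above emits one routing entry per interrupt controller, so the same GSI fans out to both the IOAPIC and the PIC. Hand-expanding one macro makes that concrete; this sketch assumes SELECT_PIC() resolves to the master PIC for pins 0-7, as defined in the x86 irq.h, and the array name is invented:

/* What ROUTING_ENTRY2(3) boils down to on x86 (hand-expanded illustration). */
static const struct kvm_irq_routing_entry gsi3_expanded[] = {
	{ .gsi = 3, .type = KVM_IRQ_ROUTING_IRQCHIP,
	  .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC,     .u.irqchip.pin = 3 },
	{ .gsi = 3, .type = KVM_IRQ_ROUTING_IRQCHIP,
	  .u.irqchip.irqchip = KVM_IRQCHIP_PIC_MASTER, .u.irqchip.pin = 3 },
};
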
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index b43c275775cd..1d56a901e791 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -31,65 +31,42 @@
31#include <trace/events/kvm.h> 31#include <trace/events/kvm.h>
32#include "irq.h" 32#include "irq.h"
33 33
34bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) 34struct kvm_irq_routing_table {
35{ 35 int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
36 struct kvm_irq_ack_notifier *kian; 36 struct kvm_kernel_irq_routing_entry *rt_entries;
37 int gsi, idx; 37 u32 nr_rt_entries;
38 38 /*
39 idx = srcu_read_lock(&kvm->irq_srcu); 39 * Array indexed by gsi. Each entry contains list of irq chips
40 gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; 40 * the gsi is connected to.
41 if (gsi != -1) 41 */
42 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 42 struct hlist_head map[0];
43 link) 43};
44 if (kian->gsi == gsi) {
45 srcu_read_unlock(&kvm->irq_srcu, idx);
46 return true;
47 }
48
49 srcu_read_unlock(&kvm->irq_srcu, idx);
50
51 return false;
52}
53EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
54 44
55void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 45int kvm_irq_map_gsi(struct kvm *kvm,
46 struct kvm_kernel_irq_routing_entry *entries, int gsi)
56{ 47{
57 struct kvm_irq_ack_notifier *kian; 48 struct kvm_irq_routing_table *irq_rt;
58 int gsi, idx; 49 struct kvm_kernel_irq_routing_entry *e;
59 50 int n = 0;
60 trace_kvm_ack_irq(irqchip, pin); 51
52 irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
53 lockdep_is_held(&kvm->irq_lock));
54 if (gsi < irq_rt->nr_rt_entries) {
55 hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
56 entries[n] = *e;
57 ++n;
58 }
59 }
61 60
62 idx = srcu_read_lock(&kvm->irq_srcu); 61 return n;
63 gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
64 if (gsi != -1)
65 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
66 link)
67 if (kian->gsi == gsi)
68 kian->irq_acked(kian);
69 srcu_read_unlock(&kvm->irq_srcu, idx);
70} 62}
71 63
72void kvm_register_irq_ack_notifier(struct kvm *kvm, 64int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
73 struct kvm_irq_ack_notifier *kian)
74{ 65{
75 mutex_lock(&kvm->irq_lock); 66 struct kvm_irq_routing_table *irq_rt;
76 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
77 mutex_unlock(&kvm->irq_lock);
78#ifdef __KVM_HAVE_IOAPIC
79 kvm_vcpu_request_scan_ioapic(kvm);
80#endif
81}
82 67
83void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 68 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
84 struct kvm_irq_ack_notifier *kian) 69 return irq_rt->chip[irqchip][pin];
85{
86 mutex_lock(&kvm->irq_lock);
87 hlist_del_init_rcu(&kian->link);
88 mutex_unlock(&kvm->irq_lock);
89 synchronize_srcu(&kvm->irq_srcu);
90#ifdef __KVM_HAVE_IOAPIC
91 kvm_vcpu_request_scan_ioapic(kvm);
92#endif
93} 70}
94 71
95int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) 72int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
@@ -115,9 +92,8 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
115int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 92int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
116 bool line_status) 93 bool line_status)
117{ 94{
118 struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; 95 struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
119 int ret = -1, i = 0, idx; 96 int ret = -1, i, idx;
120 struct kvm_irq_routing_table *irq_rt;
121 97
122 trace_kvm_set_irq(irq, level, irq_source_id); 98 trace_kvm_set_irq(irq, level, irq_source_id);
123 99
@@ -126,13 +102,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
126 * writes to the unused one. 102 * writes to the unused one.
127 */ 103 */
128 idx = srcu_read_lock(&kvm->irq_srcu); 104 idx = srcu_read_lock(&kvm->irq_srcu);
129 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 105 i = kvm_irq_map_gsi(kvm, irq_set, irq);
130 if (irq < irq_rt->nr_rt_entries)
131 hlist_for_each_entry(e, &irq_rt->map[irq], link)
132 irq_set[i++] = *e;
133 srcu_read_unlock(&kvm->irq_srcu, idx); 106 srcu_read_unlock(&kvm->irq_srcu, idx);
134 107
135 while(i--) { 108 while (i--) {
136 int r; 109 int r;
137 r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, 110 r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
138 line_status); 111 line_status);
@@ -171,9 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
171 144
172 e->gsi = ue->gsi; 145 e->gsi = ue->gsi;
173 e->type = ue->type; 146 e->type = ue->type;
174 r = kvm_set_routing_entry(rt, e, ue); 147 r = kvm_set_routing_entry(e, ue);
175 if (r) 148 if (r)
176 goto out; 149 goto out;
150 if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
151 rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi;
177 152
178 hlist_add_head(&e->link, &rt->map[e->gsi]); 153 hlist_add_head(&e->link, &rt->map[e->gsi]);
179 r = 0; 154 r = 0;
@@ -224,7 +199,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
224 199
225 mutex_lock(&kvm->irq_lock); 200 mutex_lock(&kvm->irq_lock);
226 old = kvm->irq_routing; 201 old = kvm->irq_routing;
227 kvm_irq_routing_update(kvm, new); 202 rcu_assign_pointer(kvm->irq_routing, new);
203 kvm_irq_routing_update(kvm);
228 mutex_unlock(&kvm->irq_lock); 204 mutex_unlock(&kvm->irq_lock);
229 205
230 synchronize_srcu_expedited(&kvm->irq_srcu); 206 synchronize_srcu_expedited(&kvm->irq_srcu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4b6c01b477f9..d3fc9399062a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -16,7 +16,7 @@
16 * 16 *
17 */ 17 */
18 18
19#include "iodev.h" 19#include <kvm/iodev.h>
20 20
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/kvm.h> 22#include <linux/kvm.h>
@@ -52,11 +52,13 @@
52 52
53#include <asm/processor.h> 53#include <asm/processor.h>
54#include <asm/io.h> 54#include <asm/io.h>
55#include <asm/ioctl.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/pgtable.h> 57#include <asm/pgtable.h>
57 58
58#include "coalesced_mmio.h" 59#include "coalesced_mmio.h"
59#include "async_pf.h" 60#include "async_pf.h"
61#include "vfio.h"
60 62
61#define CREATE_TRACE_POINTS 63#define CREATE_TRACE_POINTS
62#include <trace/events/kvm.h> 64#include <trace/events/kvm.h>
@@ -64,10 +66,13 @@
64MODULE_AUTHOR("Qumranet"); 66MODULE_AUTHOR("Qumranet");
65MODULE_LICENSE("GPL"); 67MODULE_LICENSE("GPL");
66 68
69static unsigned int halt_poll_ns;
70module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
71
67/* 72/*
68 * Ordering of locks: 73 * Ordering of locks:
69 * 74 *
70 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 75 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
71 */ 76 */
72 77
73DEFINE_SPINLOCK(kvm_lock); 78DEFINE_SPINLOCK(kvm_lock);
@@ -75,7 +80,7 @@ static DEFINE_RAW_SPINLOCK(kvm_count_lock);
75LIST_HEAD(vm_list); 80LIST_HEAD(vm_list);
76 81
77static cpumask_var_t cpus_hardware_enabled; 82static cpumask_var_t cpus_hardware_enabled;
78static int kvm_usage_count = 0; 83static int kvm_usage_count;
79static atomic_t hardware_enable_failed; 84static atomic_t hardware_enable_failed;
80 85
81struct kmem_cache *kvm_vcpu_cache; 86struct kmem_cache *kvm_vcpu_cache;
@@ -87,7 +92,7 @@ struct dentry *kvm_debugfs_dir;
87 92
88static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 93static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
89 unsigned long arg); 94 unsigned long arg);
90#ifdef CONFIG_COMPAT 95#ifdef CONFIG_KVM_COMPAT
91static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 96static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
92 unsigned long arg); 97 unsigned long arg);
93#endif 98#endif
@@ -95,8 +100,6 @@ static int hardware_enable_all(void);
95static void hardware_disable_all(void); 100static void hardware_disable_all(void);
96 101
97static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 102static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
98static void update_memslots(struct kvm_memslots *slots,
99 struct kvm_memory_slot *new, u64 last_generation);
100 103
101static void kvm_release_pfn_dirty(pfn_t pfn); 104static void kvm_release_pfn_dirty(pfn_t pfn);
102static void mark_page_dirty_in_slot(struct kvm *kvm, 105static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -107,7 +110,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
107 110
108static bool largepages_enabled = true; 111static bool largepages_enabled = true;
109 112
110bool kvm_is_mmio_pfn(pfn_t pfn) 113bool kvm_is_reserved_pfn(pfn_t pfn)
111{ 114{
112 if (pfn_valid(pfn)) 115 if (pfn_valid(pfn))
113 return PageReserved(pfn_to_page(pfn)); 116 return PageReserved(pfn_to_page(pfn));
@@ -124,14 +127,6 @@ int vcpu_load(struct kvm_vcpu *vcpu)
124 127
125 if (mutex_lock_killable(&vcpu->mutex)) 128 if (mutex_lock_killable(&vcpu->mutex))
126 return -EINTR; 129 return -EINTR;
127 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
128 /* The thread running this VCPU changed. */
129 struct pid *oldpid = vcpu->pid;
130 struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
131 rcu_assign_pointer(vcpu->pid, newpid);
132 synchronize_rcu();
133 put_pid(oldpid);
134 }
135 cpu = get_cpu(); 130 cpu = get_cpu();
136 preempt_notifier_register(&vcpu->preempt_notifier); 131 preempt_notifier_register(&vcpu->preempt_notifier);
137 kvm_arch_vcpu_load(vcpu, cpu); 132 kvm_arch_vcpu_load(vcpu, cpu);
@@ -152,7 +147,7 @@ static void ack_flush(void *_completed)
152{ 147{
153} 148}
154 149
155static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 150bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
156{ 151{
157 int i, cpu, me; 152 int i, cpu, me;
158 cpumask_var_t cpus; 153 cpumask_var_t cpus;
@@ -184,30 +179,32 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
184 return called; 179 return called;
185} 180}
186 181
182#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
187void kvm_flush_remote_tlbs(struct kvm *kvm) 183void kvm_flush_remote_tlbs(struct kvm *kvm)
188{ 184{
189 long dirty_count = kvm->tlbs_dirty; 185 long dirty_count = kvm->tlbs_dirty;
190 186
191 smp_mb(); 187 smp_mb();
192 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 188 if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
193 ++kvm->stat.remote_tlb_flush; 189 ++kvm->stat.remote_tlb_flush;
194 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 190 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
195} 191}
196EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 192EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
193#endif
197 194
198void kvm_reload_remote_mmus(struct kvm *kvm) 195void kvm_reload_remote_mmus(struct kvm *kvm)
199{ 196{
200 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 197 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
201} 198}
202 199
203void kvm_make_mclock_inprogress_request(struct kvm *kvm) 200void kvm_make_mclock_inprogress_request(struct kvm *kvm)
204{ 201{
205 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 202 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
206} 203}
207 204
208void kvm_make_scan_ioapic_request(struct kvm *kvm) 205void kvm_make_scan_ioapic_request(struct kvm *kvm)
209{ 206{
210 make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); 207 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
211} 208}
212 209
213int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 210int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -295,6 +292,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
295 kvm_flush_remote_tlbs(kvm); 292 kvm_flush_remote_tlbs(kvm);
296 293
297 spin_unlock(&kvm->mmu_lock); 294 spin_unlock(&kvm->mmu_lock);
295
296 kvm_arch_mmu_notifier_invalidate_page(kvm, address);
297
298 srcu_read_unlock(&kvm->srcu, idx); 298 srcu_read_unlock(&kvm->srcu, idx);
299} 299}
300 300
@@ -368,7 +368,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
368 368
369static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 369static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
370 struct mm_struct *mm, 370 struct mm_struct *mm,
371 unsigned long address) 371 unsigned long start,
372 unsigned long end)
372{ 373{
373 struct kvm *kvm = mmu_notifier_to_kvm(mn); 374 struct kvm *kvm = mmu_notifier_to_kvm(mn);
374 int young, idx; 375 int young, idx;
@@ -376,7 +377,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
376 idx = srcu_read_lock(&kvm->srcu); 377 idx = srcu_read_lock(&kvm->srcu);
377 spin_lock(&kvm->mmu_lock); 378 spin_lock(&kvm->mmu_lock);
378 379
379 young = kvm_age_hva(kvm, address); 380 young = kvm_age_hva(kvm, start, end);
380 if (young) 381 if (young)
381 kvm_flush_remote_tlbs(kvm); 382 kvm_flush_remote_tlbs(kvm);
382 383
@@ -463,17 +464,23 @@ static struct kvm *kvm_create_vm(unsigned long type)
463 if (r) 464 if (r)
464 goto out_err_no_disable; 465 goto out_err_no_disable;
465 466
466#ifdef CONFIG_HAVE_KVM_IRQCHIP 467#ifdef CONFIG_HAVE_KVM_IRQFD
467 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
468 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 468 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
469#endif 469#endif
470 470
471 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 471 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
472 472
473 r = -ENOMEM; 473 r = -ENOMEM;
474 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 474 kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots));
475 if (!kvm->memslots) 475 if (!kvm->memslots)
476 goto out_err_no_srcu; 476 goto out_err_no_srcu;
477
478 /*
479 * Init kvm generation close to the maximum to easily test the
480 * code of handling generation number wrap-around.
481 */
482 kvm->memslots->generation = -150;
483
477 kvm_init_memslots_id(kvm); 484 kvm_init_memslots_id(kvm);
478 if (init_srcu_struct(&kvm->srcu)) 485 if (init_srcu_struct(&kvm->srcu))
479 goto out_err_no_srcu; 486 goto out_err_no_srcu;
@@ -515,7 +522,7 @@ out_err_no_srcu:
515out_err_no_disable: 522out_err_no_disable:
516 for (i = 0; i < KVM_NR_BUSES; i++) 523 for (i = 0; i < KVM_NR_BUSES; i++)
517 kfree(kvm->buses[i]); 524 kfree(kvm->buses[i]);
518 kfree(kvm->memslots); 525 kvfree(kvm->memslots);
519 kvm_arch_free_vm(kvm); 526 kvm_arch_free_vm(kvm);
520 return ERR_PTR(r); 527 return ERR_PTR(r);
521} 528}
@@ -532,20 +539,12 @@ void *kvm_kvzalloc(unsigned long size)
532 return kzalloc(size, GFP_KERNEL); 539 return kzalloc(size, GFP_KERNEL);
533} 540}
534 541
535void kvm_kvfree(const void *addr)
536{
537 if (is_vmalloc_addr(addr))
538 vfree(addr);
539 else
540 kfree(addr);
541}
542
543static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 542static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
544{ 543{
545 if (!memslot->dirty_bitmap) 544 if (!memslot->dirty_bitmap)
546 return; 545 return;
547 546
548 kvm_kvfree(memslot->dirty_bitmap); 547 kvfree(memslot->dirty_bitmap);
549 memslot->dirty_bitmap = NULL; 548 memslot->dirty_bitmap = NULL;
550} 549}
551 550
@@ -571,7 +570,7 @@ static void kvm_free_physmem(struct kvm *kvm)
571 kvm_for_each_memslot(memslot, slots) 570 kvm_for_each_memslot(memslot, slots)
572 kvm_free_physmem_slot(kvm, memslot, NULL); 571 kvm_free_physmem_slot(kvm, memslot, NULL);
573 572
574 kfree(kvm->memslots); 573 kvfree(kvm->memslots);
575} 574}
576 575
577static void kvm_destroy_devices(struct kvm *kvm) 576static void kvm_destroy_devices(struct kvm *kvm)
@@ -654,58 +653,68 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
654 return 0; 653 return 0;
655} 654}
656 655
657static int cmp_memslot(const void *slot1, const void *slot2)
658{
659 struct kvm_memory_slot *s1, *s2;
660
661 s1 = (struct kvm_memory_slot *)slot1;
662 s2 = (struct kvm_memory_slot *)slot2;
663
664 if (s1->npages < s2->npages)
665 return 1;
666 if (s1->npages > s2->npages)
667 return -1;
668
669 return 0;
670}
671
672/* 656/*
673 * Sort the memslots base on its size, so the larger slots 657 * Insert memslot and re-sort memslots based on their GFN,
674 * will get better fit. 658 * so binary search could be used to lookup GFN.
659 * Sorting algorithm takes advantage of having initially
660 * sorted array and known changed memslot position.
675 */ 661 */
676static void sort_memslots(struct kvm_memslots *slots)
677{
678 int i;
679
680 sort(slots->memslots, KVM_MEM_SLOTS_NUM,
681 sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
682
683 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
684 slots->id_to_index[slots->memslots[i].id] = i;
685}
686
687static void update_memslots(struct kvm_memslots *slots, 662static void update_memslots(struct kvm_memslots *slots,
688 struct kvm_memory_slot *new, 663 struct kvm_memory_slot *new)
689 u64 last_generation) 664{
690{ 665 int id = new->id;
691 if (new) { 666 int i = slots->id_to_index[id];
692 int id = new->id; 667 struct kvm_memory_slot *mslots = slots->memslots;
693 struct kvm_memory_slot *old = id_to_memslot(slots, id); 668
694 unsigned long npages = old->npages; 669 WARN_ON(mslots[i].id != id);
670 if (!new->npages) {
671 WARN_ON(!mslots[i].npages);
672 new->base_gfn = 0;
673 new->flags = 0;
674 if (mslots[i].npages)
675 slots->used_slots--;
676 } else {
677 if (!mslots[i].npages)
678 slots->used_slots++;
679 }
695 680
696 *old = *new; 681 while (i < KVM_MEM_SLOTS_NUM - 1 &&
697 if (new->npages != npages) 682 new->base_gfn <= mslots[i + 1].base_gfn) {
698 sort_memslots(slots); 683 if (!mslots[i + 1].npages)
684 break;
685 mslots[i] = mslots[i + 1];
686 slots->id_to_index[mslots[i].id] = i;
687 i++;
699 } 688 }
700 689
701 slots->generation = last_generation + 1; 690 /*
691 * The ">=" is needed when creating a slot with base_gfn == 0,
692 * so that it moves before all those with base_gfn == npages == 0.
693 *
694 * On the other hand, if new->npages is zero, the above loop has
695 * already left i pointing to the beginning of the empty part of
696 * mslots, and the ">=" would move the hole backwards in this
697 * case---which is wrong. So skip the loop when deleting a slot.
698 */
699 if (new->npages) {
700 while (i > 0 &&
701 new->base_gfn >= mslots[i - 1].base_gfn) {
702 mslots[i] = mslots[i - 1];
703 slots->id_to_index[mslots[i].id] = i;
704 i--;
705 }
706 } else
707 WARN_ON_ONCE(i != slots->used_slots);
708
709 mslots[i] = *new;
710 slots->id_to_index[mslots[i].id] = i;
702} 711}
703 712
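The rewritten update_memslots() above keeps the memslot array sorted by base_gfn in descending order so that lookups can binary-search, and it restores that invariant with a single insertion pass instead of a full sort. A simplified stand-alone sketch of the pass (assumptions: every slot is in use, all base_gfn values are distinct, and the id_to_index bookkeeping is omitted):

#include <stdio.h>

#define NSLOTS 6

struct slot {
    int id;
    unsigned long base_gfn;
};

/* slide the entry at index i to its new position after its gfn changed */
static void reinsert(struct slot *s, int i, unsigned long new_gfn)
{
    struct slot tmp = s[i];

    tmp.base_gfn = new_gfn;

    /* toward the tail while the next entry has a larger gfn */
    while (i < NSLOTS - 1 && tmp.base_gfn < s[i + 1].base_gfn) {
        s[i] = s[i + 1];
        i++;
    }
    /* toward the head while the previous entry has a smaller gfn */
    while (i > 0 && tmp.base_gfn > s[i - 1].base_gfn) {
        s[i] = s[i - 1];
        i--;
    }
    s[i] = tmp;
}

int main(void)
{
    struct slot s[NSLOTS] = {
        { 0, 0x500 }, { 1, 0x400 }, { 2, 0x300 },
        { 3, 0x200 }, { 4, 0x100 }, { 5, 0x050 },
    };
    int i;

    reinsert(s, 4, 0x450);  /* slot 4 moved from gfn 0x100 to 0x450 */
    for (i = 0; i < NSLOTS; i++)
        printf("id=%d base_gfn=%#lx\n", s[i].id, s[i].base_gfn);
    return 0;
}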
704static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 713static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
705{ 714{
706 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 715 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
707 716
708#ifdef KVM_CAP_READONLY_MEM 717#ifdef __KVM_HAVE_READONLY_MEM
709 valid_flags |= KVM_MEM_READONLY; 718 valid_flags |= KVM_MEM_READONLY;
710#endif 719#endif
711 720
@@ -716,14 +725,27 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
716} 725}
717 726
718static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 727static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
719 struct kvm_memslots *slots, struct kvm_memory_slot *new) 728 struct kvm_memslots *slots)
720{ 729{
721 struct kvm_memslots *old_memslots = kvm->memslots; 730 struct kvm_memslots *old_memslots = kvm->memslots;
722 731
723 update_memslots(slots, new, kvm->memslots->generation); 732 /*
733 * Set the low bit in the generation, which disables SPTE caching
734 * until the end of synchronize_srcu_expedited.
735 */
736 WARN_ON(old_memslots->generation & 1);
737 slots->generation = old_memslots->generation + 1;
738
724 rcu_assign_pointer(kvm->memslots, slots); 739 rcu_assign_pointer(kvm->memslots, slots);
725 synchronize_srcu_expedited(&kvm->srcu); 740 synchronize_srcu_expedited(&kvm->srcu);
726 741
742 /*
743 * Increment the new memslot generation a second time. This prevents
744 * vm exits that race with memslot updates from caching a memslot
745 * generation that will (potentially) be valid forever.
746 */
747 slots->generation++;
748
727 kvm_arch_memslots_updated(kvm); 749 kvm_arch_memslots_updated(kvm);
728 750
 729 return old_memslots; 751 return old_memslots;
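A small sketch (not kernel code, names invented) of the generation protocol used by install_new_memslots() above: the counter is bumped to an odd value before the new memslots are published and to an even value after the SRCU grace period, so a reader that samples an odd generation knows an update is in flight and must not cache anything keyed on it:

#include <stdio.h>

struct memslots_demo {
    unsigned long long generation;
    /* ...slot array... */
};

static int generation_is_stable(const struct memslots_demo *slots)
{
    return (slots->generation & 1) == 0;    /* odd means update in flight */
}

int main(void)
{
    struct memslots_demo slots = { .generation = 100 };

    /* updater, step 1: mark in-flux before making the update visible */
    slots.generation++;                                     /* 101, odd */
    printf("stable? %d\n", generation_is_stable(&slots));   /* 0 */

    /* ...publish the new copy, wait out readers of the old one... */

    /* updater, step 2: the new copy may now be cached against */
    slots.generation++;                                     /* 102, even */
    printf("stable? %d\n", generation_is_stable(&slots));   /* 1 */
    return 0;
}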
@@ -735,7 +757,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
735 * 757 *
736 * Discontiguous memory is allowed, mostly for framebuffers. 758 * Discontiguous memory is allowed, mostly for framebuffers.
737 * 759 *
738 * Must be called holding mmap_sem for write. 760 * Must be called holding kvm->slots_lock for write.
739 */ 761 */
740int __kvm_set_memory_region(struct kvm *kvm, 762int __kvm_set_memory_region(struct kvm *kvm,
741 struct kvm_userspace_memory_region *mem) 763 struct kvm_userspace_memory_region *mem)
@@ -774,7 +796,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
774 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 796 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
775 npages = mem->memory_size >> PAGE_SHIFT; 797 npages = mem->memory_size >> PAGE_SHIFT;
776 798
777 r = -EINVAL;
778 if (npages > KVM_MEM_MAX_NR_PAGES) 799 if (npages > KVM_MEM_MAX_NR_PAGES)
779 goto out; 800 goto out;
780 801
@@ -788,7 +809,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
788 new.npages = npages; 809 new.npages = npages;
789 new.flags = mem->flags; 810 new.flags = mem->flags;
790 811
791 r = -EINVAL;
792 if (npages) { 812 if (npages) {
793 if (!old.npages) 813 if (!old.npages)
794 change = KVM_MR_CREATE; 814 change = KVM_MR_CREATE;
@@ -843,16 +863,16 @@ int __kvm_set_memory_region(struct kvm *kvm,
843 goto out_free; 863 goto out_free;
844 } 864 }
845 865
866 slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
867 if (!slots)
868 goto out_free;
869 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
870
846 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 871 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
847 r = -ENOMEM;
848 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
849 GFP_KERNEL);
850 if (!slots)
851 goto out_free;
852 slot = id_to_memslot(slots, mem->slot); 872 slot = id_to_memslot(slots, mem->slot);
853 slot->flags |= KVM_MEMSLOT_INVALID; 873 slot->flags |= KVM_MEMSLOT_INVALID;
854 874
855 old_memslots = install_new_memslots(kvm, slots, NULL); 875 old_memslots = install_new_memslots(kvm, slots);
856 876
857 /* slot was deleted or moved, clear iommu mapping */ 877 /* slot was deleted or moved, clear iommu mapping */
858 kvm_iommu_unmap_pages(kvm, &old); 878 kvm_iommu_unmap_pages(kvm, &old);
@@ -860,10 +880,16 @@ int __kvm_set_memory_region(struct kvm *kvm,
860 * or moved, memslot will be created. 880 * or moved, memslot will be created.
861 * 881 *
862 * validation of sp->gfn happens in: 882 * validation of sp->gfn happens in:
863 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 883 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
864 * - kvm_is_visible_gfn (mmu_check_roots) 884 * - kvm_is_visible_gfn (mmu_check_roots)
865 */ 885 */
866 kvm_arch_flush_shadow_memslot(kvm, slot); 886 kvm_arch_flush_shadow_memslot(kvm, slot);
887
888 /*
889 * We can re-use the old_memslots from above, the only difference
890 * from the currently installed memslots is the invalid flag. This
891 * will get overwritten by update_memslots anyway.
892 */
867 slots = old_memslots; 893 slots = old_memslots;
868 } 894 }
869 895
@@ -871,31 +897,19 @@ int __kvm_set_memory_region(struct kvm *kvm,
871 if (r) 897 if (r)
872 goto out_slots; 898 goto out_slots;
873 899
874 r = -ENOMEM;
875 /*
876 * We can re-use the old_memslots from above, the only difference
877 * from the currently installed memslots is the invalid flag. This
878 * will get overwritten by update_memslots anyway.
879 */
880 if (!slots) {
881 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
882 GFP_KERNEL);
883 if (!slots)
884 goto out_free;
885 }
886
887 /* actual memory is freed via old in kvm_free_physmem_slot below */ 900 /* actual memory is freed via old in kvm_free_physmem_slot below */
888 if (change == KVM_MR_DELETE) { 901 if (change == KVM_MR_DELETE) {
889 new.dirty_bitmap = NULL; 902 new.dirty_bitmap = NULL;
890 memset(&new.arch, 0, sizeof(new.arch)); 903 memset(&new.arch, 0, sizeof(new.arch));
891 } 904 }
892 905
893 old_memslots = install_new_memslots(kvm, slots, &new); 906 update_memslots(slots, &new);
907 old_memslots = install_new_memslots(kvm, slots);
894 908
895 kvm_arch_commit_memory_region(kvm, mem, &old, change); 909 kvm_arch_commit_memory_region(kvm, mem, &old, change);
896 910
897 kvm_free_physmem_slot(kvm, &old, &new); 911 kvm_free_physmem_slot(kvm, &old, &new);
898 kfree(old_memslots); 912 kvfree(old_memslots);
899 913
900 /* 914 /*
901 * IOMMU mapping: New slots need to be mapped. Old slots need to be 915 * IOMMU mapping: New slots need to be mapped. Old slots need to be
@@ -914,7 +928,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
914 return 0; 928 return 0;
915 929
916out_slots: 930out_slots:
917 kfree(slots); 931 kvfree(slots);
918out_free: 932out_free:
919 kvm_free_physmem_slot(kvm, &new, &old); 933 kvm_free_physmem_slot(kvm, &new, &old);
920out: 934out:
@@ -977,6 +991,88 @@ out:
977} 991}
978EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 992EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
979 993
994#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
995/**
996 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
997 * are dirty write protect them for next write.
998 * @kvm: pointer to kvm instance
999 * @log: slot id and address to which we copy the log
1000 * @is_dirty: flag set if any page is dirty
1001 *
1002 * We need to keep it in mind that VCPU threads can write to the bitmap
1003 * concurrently. So, to avoid losing track of dirty pages we keep the
1004 * following order:
1005 *
1006 * 1. Take a snapshot of the bit and clear it if needed.
1007 * 2. Write protect the corresponding page.
1008 * 3. Copy the snapshot to the userspace.
1009 * 4. Upon return caller flushes TLB's if needed.
1010 *
1011 * Between 2 and 4, the guest may write to the page using the remaining TLB
1012 * entry. This is not a problem because the page is reported dirty using
1013 * the snapshot taken before and step 4 ensures that writes done after
1014 * exiting to userspace will be logged for the next call.
1015 *
1016 */
1017int kvm_get_dirty_log_protect(struct kvm *kvm,
1018 struct kvm_dirty_log *log, bool *is_dirty)
1019{
1020 struct kvm_memory_slot *memslot;
1021 int r, i;
1022 unsigned long n;
1023 unsigned long *dirty_bitmap;
1024 unsigned long *dirty_bitmap_buffer;
1025
1026 r = -EINVAL;
1027 if (log->slot >= KVM_USER_MEM_SLOTS)
1028 goto out;
1029
1030 memslot = id_to_memslot(kvm->memslots, log->slot);
1031
1032 dirty_bitmap = memslot->dirty_bitmap;
1033 r = -ENOENT;
1034 if (!dirty_bitmap)
1035 goto out;
1036
1037 n = kvm_dirty_bitmap_bytes(memslot);
1038
1039 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
1040 memset(dirty_bitmap_buffer, 0, n);
1041
1042 spin_lock(&kvm->mmu_lock);
1043 *is_dirty = false;
1044 for (i = 0; i < n / sizeof(long); i++) {
1045 unsigned long mask;
1046 gfn_t offset;
1047
1048 if (!dirty_bitmap[i])
1049 continue;
1050
1051 *is_dirty = true;
1052
1053 mask = xchg(&dirty_bitmap[i], 0);
1054 dirty_bitmap_buffer[i] = mask;
1055
1056 if (mask) {
1057 offset = i * BITS_PER_LONG;
1058 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1059 offset, mask);
1060 }
1061 }
1062
1063 spin_unlock(&kvm->mmu_lock);
1064
1065 r = -EFAULT;
1066 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1067 goto out;
1068
1069 r = 0;
1070out:
1071 return r;
1072}
1073EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1074#endif
1075
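A user-space model (C11 atomics, invented names) of step 1 in the ordering documented above for kvm_get_dirty_log_protect(): snapshotting and clearing each dirty-bitmap word with a single atomic exchange guarantees that bits set concurrently by other threads are either captured in this snapshot or left in place for the next one, never lost:

#include <stdatomic.h>
#include <stdio.h>

#define WORDS 4

static _Atomic unsigned long dirty_bitmap[WORDS];   /* written by "vcpus" */
static unsigned long snapshot[WORDS];               /* copied to "userspace" */

static int snapshot_and_clear(void)
{
    int i, any_dirty = 0;

    for (i = 0; i < WORDS; i++) {
        /* fetch the word and reset it to 0 in one step (like xchg) */
        snapshot[i] = atomic_exchange(&dirty_bitmap[i], 0);
        if (snapshot[i])
            any_dirty = 1;
    }
    return any_dirty;
}

int main(void)
{
    int dirty;

    atomic_fetch_or(&dirty_bitmap[1], 1UL << 5);    /* a page was dirtied */

    dirty = snapshot_and_clear();
    printf("dirty=%d word1=%#lx\n", dirty, snapshot[1]);
    printf("dirty=%d\n", snapshot_and_clear());     /* nothing new since */
    return 0;
}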
980bool kvm_largepages_enabled(void) 1076bool kvm_largepages_enabled(void)
981{ 1077{
982 return largepages_enabled; 1078 return largepages_enabled;
@@ -1073,9 +1169,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
1073 * If writable is set to false, the hva returned by this function is only 1169 * If writable is set to false, the hva returned by this function is only
1074 * allowed to be read. 1170 * allowed to be read.
1075 */ 1171 */
1076unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1172unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
1173 gfn_t gfn, bool *writable)
1077{ 1174{
1078 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1079 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1175 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
1080 1176
1081 if (!kvm_is_error_hva(hva) && writable) 1177 if (!kvm_is_error_hva(hva) && writable)
@@ -1084,14 +1180,11 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1084 return hva; 1180 return hva;
1085} 1181}
1086 1182
1087static int kvm_read_hva(void *data, void __user *hva, int len) 1183unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1088{ 1184{
1089 return __copy_from_user(data, hva, len); 1185 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1090}
1091 1186
1092static int kvm_read_hva_atomic(void *data, void __user *hva, int len) 1187 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1093{
1094 return __copy_from_user_inatomic(data, hva, len);
1095} 1188}
1096 1189
1097static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1190static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1168,8 +1261,9 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1168 addr, write_fault, page); 1261 addr, write_fault, page);
1169 up_read(&current->mm->mmap_sem); 1262 up_read(&current->mm->mmap_sem);
1170 } else 1263 } else
1171 npages = get_user_pages_fast(addr, 1, write_fault, 1264 npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
1172 page); 1265 write_fault, 0, page,
1266 FOLL_TOUCH|FOLL_HWPOISON);
1173 if (npages != 1) 1267 if (npages != 1)
1174 return npages; 1268 return npages;
1175 1269
@@ -1249,7 +1343,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1249 else if ((vma->vm_flags & VM_PFNMAP)) { 1343 else if ((vma->vm_flags & VM_PFNMAP)) {
1250 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1344 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1251 vma->vm_pgoff; 1345 vma->vm_pgoff;
1252 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1346 BUG_ON(!kvm_is_reserved_pfn(pfn));
1253 } else { 1347 } else {
1254 if (async && vma_is_valid(vma, write_fault)) 1348 if (async && vma_is_valid(vma, write_fault))
1255 *async = true; 1349 *async = true;
@@ -1355,7 +1449,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
1355 if (is_error_noslot_pfn(pfn)) 1449 if (is_error_noslot_pfn(pfn))
1356 return KVM_ERR_PTR_BAD_PAGE; 1450 return KVM_ERR_PTR_BAD_PAGE;
1357 1451
1358 if (kvm_is_mmio_pfn(pfn)) { 1452 if (kvm_is_reserved_pfn(pfn)) {
1359 WARN_ON(1); 1453 WARN_ON(1);
1360 return KVM_ERR_PTR_BAD_PAGE; 1454 return KVM_ERR_PTR_BAD_PAGE;
1361 } 1455 }
@@ -1371,7 +1465,6 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1371 1465
1372 return kvm_pfn_to_page(pfn); 1466 return kvm_pfn_to_page(pfn);
1373} 1467}
1374
1375EXPORT_SYMBOL_GPL(gfn_to_page); 1468EXPORT_SYMBOL_GPL(gfn_to_page);
1376 1469
1377void kvm_release_page_clean(struct page *page) 1470void kvm_release_page_clean(struct page *page)
@@ -1384,7 +1477,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1384 1477
1385void kvm_release_pfn_clean(pfn_t pfn) 1478void kvm_release_pfn_clean(pfn_t pfn)
1386{ 1479{
1387 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1480 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
1388 put_page(pfn_to_page(pfn)); 1481 put_page(pfn_to_page(pfn));
1389} 1482}
1390EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1483EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -1405,8 +1498,9 @@ static void kvm_release_pfn_dirty(pfn_t pfn)
1405 1498
1406void kvm_set_pfn_dirty(pfn_t pfn) 1499void kvm_set_pfn_dirty(pfn_t pfn)
1407{ 1500{
1408 if (!kvm_is_mmio_pfn(pfn)) { 1501 if (!kvm_is_reserved_pfn(pfn)) {
1409 struct page *page = pfn_to_page(pfn); 1502 struct page *page = pfn_to_page(pfn);
1503
1410 if (!PageReserved(page)) 1504 if (!PageReserved(page))
1411 SetPageDirty(page); 1505 SetPageDirty(page);
1412 } 1506 }
@@ -1415,14 +1509,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1415 1509
1416void kvm_set_pfn_accessed(pfn_t pfn) 1510void kvm_set_pfn_accessed(pfn_t pfn)
1417{ 1511{
1418 if (!kvm_is_mmio_pfn(pfn)) 1512 if (!kvm_is_reserved_pfn(pfn))
1419 mark_page_accessed(pfn_to_page(pfn)); 1513 mark_page_accessed(pfn_to_page(pfn));
1420} 1514}
1421EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1515EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1422 1516
1423void kvm_get_pfn(pfn_t pfn) 1517void kvm_get_pfn(pfn_t pfn)
1424{ 1518{
1425 if (!kvm_is_mmio_pfn(pfn)) 1519 if (!kvm_is_reserved_pfn(pfn))
1426 get_page(pfn_to_page(pfn)); 1520 get_page(pfn_to_page(pfn));
1427} 1521}
1428EXPORT_SYMBOL_GPL(kvm_get_pfn); 1522EXPORT_SYMBOL_GPL(kvm_get_pfn);
@@ -1444,7 +1538,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1444 addr = gfn_to_hva_prot(kvm, gfn, NULL); 1538 addr = gfn_to_hva_prot(kvm, gfn, NULL);
1445 if (kvm_is_error_hva(addr)) 1539 if (kvm_is_error_hva(addr))
1446 return -EFAULT; 1540 return -EFAULT;
1447 r = kvm_read_hva(data, (void __user *)addr + offset, len); 1541 r = __copy_from_user(data, (void __user *)addr + offset, len);
1448 if (r) 1542 if (r)
1449 return -EFAULT; 1543 return -EFAULT;
1450 return 0; 1544 return 0;
@@ -1483,7 +1577,7 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1483 if (kvm_is_error_hva(addr)) 1577 if (kvm_is_error_hva(addr))
1484 return -EFAULT; 1578 return -EFAULT;
1485 pagefault_disable(); 1579 pagefault_disable();
1486 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); 1580 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1487 pagefault_enable(); 1581 pagefault_enable();
1488 if (r) 1582 if (r)
1489 return -EFAULT; 1583 return -EFAULT;
@@ -1527,6 +1621,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1527 } 1621 }
1528 return 0; 1622 return 0;
1529} 1623}
1624EXPORT_SYMBOL_GPL(kvm_write_guest);
1530 1625
1531int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1626int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1532 gpa_t gpa, unsigned long len) 1627 gpa_t gpa, unsigned long len)
@@ -1542,8 +1637,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1542 ghc->generation = slots->generation; 1637 ghc->generation = slots->generation;
1543 ghc->len = len; 1638 ghc->len = len;
1544 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1639 ghc->memslot = gfn_to_memslot(kvm, start_gfn);
1545 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); 1640 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
1546 if (!kvm_is_error_hva(ghc->hva) && nr_pages_avail >= nr_pages_needed) { 1641 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
1547 ghc->hva += offset; 1642 ghc->hva += offset;
1548 } else { 1643 } else {
1549 /* 1644 /*
@@ -1631,7 +1726,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1631 int offset = offset_in_page(gpa); 1726 int offset = offset_in_page(gpa);
1632 int ret; 1727 int ret;
1633 1728
1634 while ((seg = next_segment(len, offset)) != 0) { 1729 while ((seg = next_segment(len, offset)) != 0) {
1635 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1730 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1636 if (ret < 0) 1731 if (ret < 0)
1637 return ret; 1732 return ret;
@@ -1663,29 +1758,61 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1663} 1758}
1664EXPORT_SYMBOL_GPL(mark_page_dirty); 1759EXPORT_SYMBOL_GPL(mark_page_dirty);
1665 1760
1761static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
1762{
1763 if (kvm_arch_vcpu_runnable(vcpu)) {
1764 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1765 return -EINTR;
1766 }
1767 if (kvm_cpu_has_pending_timer(vcpu))
1768 return -EINTR;
1769 if (signal_pending(current))
1770 return -EINTR;
1771
1772 return 0;
1773}
1774
1666/* 1775/*
1667 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1776 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1668 */ 1777 */
1669void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1778void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1670{ 1779{
1780 ktime_t start, cur;
1671 DEFINE_WAIT(wait); 1781 DEFINE_WAIT(wait);
1782 bool waited = false;
1783
1784 start = cur = ktime_get();
1785 if (halt_poll_ns) {
1786 ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
1787
1788 do {
1789 /*
1790 * This sets KVM_REQ_UNHALT if an interrupt
1791 * arrives.
1792 */
1793 if (kvm_vcpu_check_block(vcpu) < 0) {
1794 ++vcpu->stat.halt_successful_poll;
1795 goto out;
1796 }
1797 cur = ktime_get();
1798 } while (single_task_running() && ktime_before(cur, stop));
1799 }
1672 1800
1673 for (;;) { 1801 for (;;) {
1674 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1802 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1675 1803
1676 if (kvm_arch_vcpu_runnable(vcpu)) { 1804 if (kvm_vcpu_check_block(vcpu) < 0)
1677 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1678 break;
1679 }
1680 if (kvm_cpu_has_pending_timer(vcpu))
1681 break;
1682 if (signal_pending(current))
1683 break; 1805 break;
1684 1806
1807 waited = true;
1685 schedule(); 1808 schedule();
1686 } 1809 }
1687 1810
1688 finish_wait(&vcpu->wq, &wait); 1811 finish_wait(&vcpu->wq, &wait);
1812 cur = ktime_get();
1813
1814out:
1815 trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
1689} 1816}
1690EXPORT_SYMBOL_GPL(kvm_vcpu_block); 1817EXPORT_SYMBOL_GPL(kvm_vcpu_block);
1691 1818
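The reworked kvm_vcpu_block() above polls for up to halt_poll_ns before paying for a real sleep, which helps when the wake-up event arrives within a few microseconds of the halt. A stand-alone sketch of the same wait structure (halt_poll_ns_demo and event_pending() are stand-ins invented for this example; the kernel version also stops polling as soon as other tasks become runnable):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static const long halt_poll_ns_demo = 200000;   /* 200us polling window */
static int pending_calls;

static bool event_pending(void)
{
    /* stand-in for "interrupt or timer pending": fires after a few polls */
    return ++pending_calls > 50;
}

static long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void wait_for_event(void)
{
    long long stop = now_ns() + halt_poll_ns_demo;
    bool waited = false;

    /* polling phase: spin briefly without giving up the CPU */
    do {
        if (event_pending())
            goto out;
    } while (now_ns() < stop);

    /* blocking phase: actually sleep between checks */
    while (!event_pending()) {
        waited = true;
        usleep(1000);
    }
out:
    printf("event received, slept=%d\n", waited);
}

int main(void)
{
    wait_for_event();
    return 0;
}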
@@ -1723,14 +1850,10 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target)
1723 rcu_read_lock(); 1850 rcu_read_lock();
1724 pid = rcu_dereference(target->pid); 1851 pid = rcu_dereference(target->pid);
1725 if (pid) 1852 if (pid)
1726 task = get_pid_task(target->pid, PIDTYPE_PID); 1853 task = get_pid_task(pid, PIDTYPE_PID);
1727 rcu_read_unlock(); 1854 rcu_read_unlock();
1728 if (!task) 1855 if (!task)
1729 return ret; 1856 return ret;
1730 if (task->flags & PF_VCPU) {
1731 put_task_struct(task);
1732 return ret;
1733 }
1734 ret = yield_to(task, 1); 1857 ret = yield_to(task, 1);
1735 put_task_struct(task); 1858 put_task_struct(task);
1736 1859
@@ -1766,8 +1889,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1766 bool eligible; 1889 bool eligible;
1767 1890
1768 eligible = !vcpu->spin_loop.in_spin_loop || 1891 eligible = !vcpu->spin_loop.in_spin_loop ||
1769 (vcpu->spin_loop.in_spin_loop && 1892 vcpu->spin_loop.dy_eligible;
1770 vcpu->spin_loop.dy_eligible);
1771 1893
1772 if (vcpu->spin_loop.in_spin_loop) 1894 if (vcpu->spin_loop.in_spin_loop)
1773 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 1895 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
@@ -1873,7 +1995,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1873static struct file_operations kvm_vcpu_fops = { 1995static struct file_operations kvm_vcpu_fops = {
1874 .release = kvm_vcpu_release, 1996 .release = kvm_vcpu_release,
1875 .unlocked_ioctl = kvm_vcpu_ioctl, 1997 .unlocked_ioctl = kvm_vcpu_ioctl,
1876#ifdef CONFIG_COMPAT 1998#ifdef CONFIG_KVM_COMPAT
1877 .compat_ioctl = kvm_vcpu_compat_ioctl, 1999 .compat_ioctl = kvm_vcpu_compat_ioctl,
1878#endif 2000#endif
1879 .mmap = kvm_vcpu_mmap, 2001 .mmap = kvm_vcpu_mmap,
@@ -1973,12 +2095,15 @@ static long kvm_vcpu_ioctl(struct file *filp,
1973 if (vcpu->kvm->mm != current->mm) 2095 if (vcpu->kvm->mm != current->mm)
1974 return -EIO; 2096 return -EIO;
1975 2097
2098 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
2099 return -EINVAL;
2100
1976#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) 2101#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
1977 /* 2102 /*
1978 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 2103 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1979 * so vcpu_load() would break it. 2104 * so vcpu_load() would break it.
1980 */ 2105 */
1981 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 2106 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
1982 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2107 return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1983#endif 2108#endif
1984 2109
@@ -1991,6 +2116,16 @@ static long kvm_vcpu_ioctl(struct file *filp,
1991 r = -EINVAL; 2116 r = -EINVAL;
1992 if (arg) 2117 if (arg)
1993 goto out; 2118 goto out;
2119 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
2120 /* The thread running this VCPU changed. */
2121 struct pid *oldpid = vcpu->pid;
2122 struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
2123
2124 rcu_assign_pointer(vcpu->pid, newpid);
2125 if (oldpid)
2126 synchronize_rcu();
2127 put_pid(oldpid);
2128 }
1994 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2129 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1995 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2130 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1996 break; 2131 break;
@@ -2056,7 +2191,7 @@ out_free1:
2056 if (r) 2191 if (r)
2057 goto out; 2192 goto out;
2058 r = -EFAULT; 2193 r = -EFAULT;
2059 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 2194 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
2060 goto out; 2195 goto out;
2061 r = 0; 2196 r = 0;
2062 break; 2197 break;
@@ -2065,7 +2200,7 @@ out_free1:
2065 struct kvm_mp_state mp_state; 2200 struct kvm_mp_state mp_state;
2066 2201
2067 r = -EFAULT; 2202 r = -EFAULT;
2068 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 2203 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
2069 goto out; 2204 goto out;
2070 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2205 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
2071 break; 2206 break;
@@ -2074,13 +2209,13 @@ out_free1:
2074 struct kvm_translation tr; 2209 struct kvm_translation tr;
2075 2210
2076 r = -EFAULT; 2211 r = -EFAULT;
2077 if (copy_from_user(&tr, argp, sizeof tr)) 2212 if (copy_from_user(&tr, argp, sizeof(tr)))
2078 goto out; 2213 goto out;
2079 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2214 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
2080 if (r) 2215 if (r)
2081 goto out; 2216 goto out;
2082 r = -EFAULT; 2217 r = -EFAULT;
2083 if (copy_to_user(argp, &tr, sizeof tr)) 2218 if (copy_to_user(argp, &tr, sizeof(tr)))
2084 goto out; 2219 goto out;
2085 r = 0; 2220 r = 0;
2086 break; 2221 break;
@@ -2089,7 +2224,7 @@ out_free1:
2089 struct kvm_guest_debug dbg; 2224 struct kvm_guest_debug dbg;
2090 2225
2091 r = -EFAULT; 2226 r = -EFAULT;
2092 if (copy_from_user(&dbg, argp, sizeof dbg)) 2227 if (copy_from_user(&dbg, argp, sizeof(dbg)))
2093 goto out; 2228 goto out;
2094 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2229 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
2095 break; 2230 break;
@@ -2103,14 +2238,14 @@ out_free1:
2103 if (argp) { 2238 if (argp) {
2104 r = -EFAULT; 2239 r = -EFAULT;
2105 if (copy_from_user(&kvm_sigmask, argp, 2240 if (copy_from_user(&kvm_sigmask, argp,
2106 sizeof kvm_sigmask)) 2241 sizeof(kvm_sigmask)))
2107 goto out; 2242 goto out;
2108 r = -EINVAL; 2243 r = -EINVAL;
2109 if (kvm_sigmask.len != sizeof sigset) 2244 if (kvm_sigmask.len != sizeof(sigset))
2110 goto out; 2245 goto out;
2111 r = -EFAULT; 2246 r = -EFAULT;
2112 if (copy_from_user(&sigset, sigmask_arg->sigset, 2247 if (copy_from_user(&sigset, sigmask_arg->sigset,
2113 sizeof sigset)) 2248 sizeof(sigset)))
2114 goto out; 2249 goto out;
2115 p = &sigset; 2250 p = &sigset;
2116 } 2251 }
@@ -2151,7 +2286,7 @@ out:
2151 return r; 2286 return r;
2152} 2287}
2153 2288
2154#ifdef CONFIG_COMPAT 2289#ifdef CONFIG_KVM_COMPAT
2155static long kvm_vcpu_compat_ioctl(struct file *filp, 2290static long kvm_vcpu_compat_ioctl(struct file *filp,
2156 unsigned int ioctl, unsigned long arg) 2291 unsigned int ioctl, unsigned long arg)
2157{ 2292{
@@ -2172,14 +2307,14 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
2172 if (argp) { 2307 if (argp) {
2173 r = -EFAULT; 2308 r = -EFAULT;
2174 if (copy_from_user(&kvm_sigmask, argp, 2309 if (copy_from_user(&kvm_sigmask, argp,
2175 sizeof kvm_sigmask)) 2310 sizeof(kvm_sigmask)))
2176 goto out; 2311 goto out;
2177 r = -EINVAL; 2312 r = -EINVAL;
2178 if (kvm_sigmask.len != sizeof csigset) 2313 if (kvm_sigmask.len != sizeof(csigset))
2179 goto out; 2314 goto out;
2180 r = -EFAULT; 2315 r = -EFAULT;
2181 if (copy_from_user(&csigset, sigmask_arg->sigset, 2316 if (copy_from_user(&csigset, sigmask_arg->sigset,
2182 sizeof csigset)) 2317 sizeof(csigset)))
2183 goto out; 2318 goto out;
2184 sigset_from_compat(&sigset, &csigset); 2319 sigset_from_compat(&sigset, &csigset);
2185 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2320 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
@@ -2243,7 +2378,7 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
2243 2378
2244static const struct file_operations kvm_device_fops = { 2379static const struct file_operations kvm_device_fops = {
2245 .unlocked_ioctl = kvm_device_ioctl, 2380 .unlocked_ioctl = kvm_device_ioctl,
2246#ifdef CONFIG_COMPAT 2381#ifdef CONFIG_KVM_COMPAT
2247 .compat_ioctl = kvm_device_ioctl, 2382 .compat_ioctl = kvm_device_ioctl,
2248#endif 2383#endif
2249 .release = kvm_device_release, 2384 .release = kvm_device_release,
@@ -2257,6 +2392,35 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
2257 return filp->private_data; 2392 return filp->private_data;
2258} 2393}
2259 2394
2395static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
2396#ifdef CONFIG_KVM_MPIC
2397 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
2398 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
2399#endif
2400
2401#ifdef CONFIG_KVM_XICS
2402 [KVM_DEV_TYPE_XICS] = &kvm_xics_ops,
2403#endif
2404};
2405
2406int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
2407{
2408 if (type >= ARRAY_SIZE(kvm_device_ops_table))
2409 return -ENOSPC;
2410
2411 if (kvm_device_ops_table[type] != NULL)
2412 return -EEXIST;
2413
2414 kvm_device_ops_table[type] = ops;
2415 return 0;
2416}
2417
2418void kvm_unregister_device_ops(u32 type)
2419{
2420 if (kvm_device_ops_table[type] != NULL)
2421 kvm_device_ops_table[type] = NULL;
2422}
2423
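kvm_register_device_ops()/kvm_unregister_device_ops() above turn device creation into a bounds check plus a table lookup, replacing the hard-coded switch that kvm_ioctl_create_device() used to carry (removed further down). A stand-alone sketch of the same table-driven registration pattern, with all names invented:

#include <errno.h>
#include <stdio.h>

struct demo_device_ops {
    const char *name;
    int (*create)(void);
};

#define DEMO_DEV_TYPE_MAX 8
static const struct demo_device_ops *ops_table[DEMO_DEV_TYPE_MAX];

static int demo_register_ops(const struct demo_device_ops *ops, unsigned int type)
{
    if (type >= DEMO_DEV_TYPE_MAX)
        return -ENOSPC;
    if (ops_table[type])
        return -EEXIST;     /* each type registers exactly once */
    ops_table[type] = ops;
    return 0;
}

static int demo_create_device(unsigned int type)
{
    if (type >= DEMO_DEV_TYPE_MAX || !ops_table[type])
        return -ENODEV;     /* unknown or unregistered type */
    return ops_table[type]->create();
}

static int vfio_like_create(void)
{
    printf("vfio-like device created\n");
    return 0;
}

static const struct demo_device_ops vfio_like_ops = {
    .name = "demo-vfio",
    .create = vfio_like_create,
};

int main(void)
{
    int r1, r2;

    demo_register_ops(&vfio_like_ops, 3);
    r1 = demo_create_device(3);     /* 0: created via the table */
    r2 = demo_create_device(5);     /* -ENODEV: nothing registered */
    printf("r1=%d r2=%d\n", r1, r2);
    return 0;
}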
2260static int kvm_ioctl_create_device(struct kvm *kvm, 2424static int kvm_ioctl_create_device(struct kvm *kvm,
2261 struct kvm_create_device *cd) 2425 struct kvm_create_device *cd)
2262{ 2426{
@@ -2265,36 +2429,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
2265 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2429 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
2266 int ret; 2430 int ret;
2267 2431
2268 switch (cd->type) { 2432 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
2269#ifdef CONFIG_KVM_MPIC 2433 return -ENODEV;
2270 case KVM_DEV_TYPE_FSL_MPIC_20: 2434
2271 case KVM_DEV_TYPE_FSL_MPIC_42: 2435 ops = kvm_device_ops_table[cd->type];
2272 ops = &kvm_mpic_ops; 2436 if (ops == NULL)
2273 break;
2274#endif
2275#ifdef CONFIG_KVM_XICS
2276 case KVM_DEV_TYPE_XICS:
2277 ops = &kvm_xics_ops;
2278 break;
2279#endif
2280#ifdef CONFIG_KVM_VFIO
2281 case KVM_DEV_TYPE_VFIO:
2282 ops = &kvm_vfio_ops;
2283 break;
2284#endif
2285#ifdef CONFIG_KVM_ARM_VGIC
2286 case KVM_DEV_TYPE_ARM_VGIC_V2:
2287 ops = &kvm_arm_vgic_v2_ops;
2288 break;
2289#endif
2290#ifdef CONFIG_S390
2291 case KVM_DEV_TYPE_FLIC:
2292 ops = &kvm_flic_ops;
2293 break;
2294#endif
2295 default:
2296 return -ENODEV; 2437 return -ENODEV;
2297 }
2298 2438
2299 if (test) 2439 if (test)
2300 return 0; 2440 return 0;
@@ -2324,6 +2464,35 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
2324 return 0; 2464 return 0;
2325} 2465}
2326 2466
2467static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
2468{
2469 switch (arg) {
2470 case KVM_CAP_USER_MEMORY:
2471 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2472 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2473#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2474 case KVM_CAP_SET_BOOT_CPU_ID:
2475#endif
2476 case KVM_CAP_INTERNAL_ERROR_DATA:
2477#ifdef CONFIG_HAVE_KVM_MSI
2478 case KVM_CAP_SIGNAL_MSI:
2479#endif
2480#ifdef CONFIG_HAVE_KVM_IRQFD
2481 case KVM_CAP_IRQFD:
2482 case KVM_CAP_IRQFD_RESAMPLE:
2483#endif
2484 case KVM_CAP_CHECK_EXTENSION_VM:
2485 return 1;
2486#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
2487 case KVM_CAP_IRQ_ROUTING:
2488 return KVM_MAX_IRQ_ROUTES;
2489#endif
2490 default:
2491 break;
2492 }
2493 return kvm_vm_ioctl_check_extension(kvm, arg);
2494}
2495
2327static long kvm_vm_ioctl(struct file *filp, 2496static long kvm_vm_ioctl(struct file *filp,
2328 unsigned int ioctl, unsigned long arg) 2497 unsigned int ioctl, unsigned long arg)
2329{ 2498{
@@ -2342,7 +2511,7 @@ static long kvm_vm_ioctl(struct file *filp,
2342 2511
2343 r = -EFAULT; 2512 r = -EFAULT;
2344 if (copy_from_user(&kvm_userspace_mem, argp, 2513 if (copy_from_user(&kvm_userspace_mem, argp,
2345 sizeof kvm_userspace_mem)) 2514 sizeof(kvm_userspace_mem)))
2346 goto out; 2515 goto out;
2347 2516
2348 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 2517 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
@@ -2352,7 +2521,7 @@ static long kvm_vm_ioctl(struct file *filp,
2352 struct kvm_dirty_log log; 2521 struct kvm_dirty_log log;
2353 2522
2354 r = -EFAULT; 2523 r = -EFAULT;
2355 if (copy_from_user(&log, argp, sizeof log)) 2524 if (copy_from_user(&log, argp, sizeof(log)))
2356 goto out; 2525 goto out;
2357 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2526 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2358 break; 2527 break;
@@ -2360,16 +2529,18 @@ static long kvm_vm_ioctl(struct file *filp,
2360#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2529#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2361 case KVM_REGISTER_COALESCED_MMIO: { 2530 case KVM_REGISTER_COALESCED_MMIO: {
2362 struct kvm_coalesced_mmio_zone zone; 2531 struct kvm_coalesced_mmio_zone zone;
2532
2363 r = -EFAULT; 2533 r = -EFAULT;
2364 if (copy_from_user(&zone, argp, sizeof zone)) 2534 if (copy_from_user(&zone, argp, sizeof(zone)))
2365 goto out; 2535 goto out;
2366 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2536 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2367 break; 2537 break;
2368 } 2538 }
2369 case KVM_UNREGISTER_COALESCED_MMIO: { 2539 case KVM_UNREGISTER_COALESCED_MMIO: {
2370 struct kvm_coalesced_mmio_zone zone; 2540 struct kvm_coalesced_mmio_zone zone;
2541
2371 r = -EFAULT; 2542 r = -EFAULT;
2372 if (copy_from_user(&zone, argp, sizeof zone)) 2543 if (copy_from_user(&zone, argp, sizeof(zone)))
2373 goto out; 2544 goto out;
2374 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2545 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2375 break; 2546 break;
@@ -2379,7 +2550,7 @@ static long kvm_vm_ioctl(struct file *filp,
2379 struct kvm_irqfd data; 2550 struct kvm_irqfd data;
2380 2551
2381 r = -EFAULT; 2552 r = -EFAULT;
2382 if (copy_from_user(&data, argp, sizeof data)) 2553 if (copy_from_user(&data, argp, sizeof(data)))
2383 goto out; 2554 goto out;
2384 r = kvm_irqfd(kvm, &data); 2555 r = kvm_irqfd(kvm, &data);
2385 break; 2556 break;
@@ -2388,7 +2559,7 @@ static long kvm_vm_ioctl(struct file *filp,
2388 struct kvm_ioeventfd data; 2559 struct kvm_ioeventfd data;
2389 2560
2390 r = -EFAULT; 2561 r = -EFAULT;
2391 if (copy_from_user(&data, argp, sizeof data)) 2562 if (copy_from_user(&data, argp, sizeof(data)))
2392 goto out; 2563 goto out;
2393 r = kvm_ioeventfd(kvm, &data); 2564 r = kvm_ioeventfd(kvm, &data);
2394 break; 2565 break;
@@ -2409,7 +2580,7 @@ static long kvm_vm_ioctl(struct file *filp,
2409 struct kvm_msi msi; 2580 struct kvm_msi msi;
2410 2581
2411 r = -EFAULT; 2582 r = -EFAULT;
2412 if (copy_from_user(&msi, argp, sizeof msi)) 2583 if (copy_from_user(&msi, argp, sizeof(msi)))
2413 goto out; 2584 goto out;
2414 r = kvm_send_userspace_msi(kvm, &msi); 2585 r = kvm_send_userspace_msi(kvm, &msi);
2415 break; 2586 break;
@@ -2421,7 +2592,7 @@ static long kvm_vm_ioctl(struct file *filp,
2421 struct kvm_irq_level irq_event; 2592 struct kvm_irq_level irq_event;
2422 2593
2423 r = -EFAULT; 2594 r = -EFAULT;
2424 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2595 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
2425 goto out; 2596 goto out;
2426 2597
2427 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 2598 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
@@ -2431,7 +2602,7 @@ static long kvm_vm_ioctl(struct file *filp,
2431 2602
2432 r = -EFAULT; 2603 r = -EFAULT;
2433 if (ioctl == KVM_IRQ_LINE_STATUS) { 2604 if (ioctl == KVM_IRQ_LINE_STATUS) {
2434 if (copy_to_user(argp, &irq_event, sizeof irq_event)) 2605 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
2435 goto out; 2606 goto out;
2436 } 2607 }
2437 2608
@@ -2464,7 +2635,7 @@ static long kvm_vm_ioctl(struct file *filp,
2464 goto out_free_irq_routing; 2635 goto out_free_irq_routing;
2465 r = kvm_set_irq_routing(kvm, entries, routing.nr, 2636 r = kvm_set_irq_routing(kvm, entries, routing.nr,
2466 routing.flags); 2637 routing.flags);
2467 out_free_irq_routing: 2638out_free_irq_routing:
2468 vfree(entries); 2639 vfree(entries);
2469 break; 2640 break;
2470 } 2641 }
@@ -2487,16 +2658,17 @@ static long kvm_vm_ioctl(struct file *filp,
2487 r = 0; 2658 r = 0;
2488 break; 2659 break;
2489 } 2660 }
2661 case KVM_CHECK_EXTENSION:
2662 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
2663 break;
2490 default: 2664 default:
2491 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2665 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2492 if (r == -ENOTTY)
2493 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
2494 } 2666 }
2495out: 2667out:
2496 return r; 2668 return r;
2497} 2669}
2498 2670
2499#ifdef CONFIG_COMPAT 2671#ifdef CONFIG_KVM_COMPAT
2500struct compat_kvm_dirty_log { 2672struct compat_kvm_dirty_log {
2501 __u32 slot; 2673 __u32 slot;
2502 __u32 padding1; 2674 __u32 padding1;
@@ -2543,7 +2715,7 @@ out:
2543static struct file_operations kvm_vm_fops = { 2715static struct file_operations kvm_vm_fops = {
2544 .release = kvm_vm_release, 2716 .release = kvm_vm_release,
2545 .unlocked_ioctl = kvm_vm_ioctl, 2717 .unlocked_ioctl = kvm_vm_ioctl,
2546#ifdef CONFIG_COMPAT 2718#ifdef CONFIG_KVM_COMPAT
2547 .compat_ioctl = kvm_vm_compat_ioctl, 2719 .compat_ioctl = kvm_vm_compat_ioctl,
2548#endif 2720#endif
2549 .llseek = noop_llseek, 2721 .llseek = noop_llseek,
@@ -2571,33 +2743,6 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
2571 return r; 2743 return r;
2572} 2744}
2573 2745
2574static long kvm_dev_ioctl_check_extension_generic(long arg)
2575{
2576 switch (arg) {
2577 case KVM_CAP_USER_MEMORY:
2578 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2579 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2580#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2581 case KVM_CAP_SET_BOOT_CPU_ID:
2582#endif
2583 case KVM_CAP_INTERNAL_ERROR_DATA:
2584#ifdef CONFIG_HAVE_KVM_MSI
2585 case KVM_CAP_SIGNAL_MSI:
2586#endif
2587#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
2588 case KVM_CAP_IRQFD_RESAMPLE:
2589#endif
2590 return 1;
2591#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
2592 case KVM_CAP_IRQ_ROUTING:
2593 return KVM_MAX_IRQ_ROUTES;
2594#endif
2595 default:
2596 break;
2597 }
2598 return kvm_dev_ioctl_check_extension(arg);
2599}
2600
2601static long kvm_dev_ioctl(struct file *filp, 2746static long kvm_dev_ioctl(struct file *filp,
2602 unsigned int ioctl, unsigned long arg) 2747 unsigned int ioctl, unsigned long arg)
2603{ 2748{
@@ -2605,7 +2750,6 @@ static long kvm_dev_ioctl(struct file *filp,
2605 2750
2606 switch (ioctl) { 2751 switch (ioctl) {
2607 case KVM_GET_API_VERSION: 2752 case KVM_GET_API_VERSION:
2608 r = -EINVAL;
2609 if (arg) 2753 if (arg)
2610 goto out; 2754 goto out;
2611 r = KVM_API_VERSION; 2755 r = KVM_API_VERSION;
@@ -2614,10 +2758,9 @@ static long kvm_dev_ioctl(struct file *filp,
2614 r = kvm_dev_ioctl_create_vm(arg); 2758 r = kvm_dev_ioctl_create_vm(arg);
2615 break; 2759 break;
2616 case KVM_CHECK_EXTENSION: 2760 case KVM_CHECK_EXTENSION:
2617 r = kvm_dev_ioctl_check_extension_generic(arg); 2761 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
2618 break; 2762 break;
2619 case KVM_GET_VCPU_MMAP_SIZE: 2763 case KVM_GET_VCPU_MMAP_SIZE:
2620 r = -EINVAL;
2621 if (arg) 2764 if (arg)
2622 goto out; 2765 goto out;
2623 r = PAGE_SIZE; /* struct kvm_run */ 2766 r = PAGE_SIZE; /* struct kvm_run */
@@ -2662,13 +2805,12 @@ static void hardware_enable_nolock(void *junk)
2662 2805
2663 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2806 cpumask_set_cpu(cpu, cpus_hardware_enabled);
2664 2807
2665 r = kvm_arch_hardware_enable(NULL); 2808 r = kvm_arch_hardware_enable();
2666 2809
2667 if (r) { 2810 if (r) {
2668 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2811 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2669 atomic_inc(&hardware_enable_failed); 2812 atomic_inc(&hardware_enable_failed);
2670 printk(KERN_INFO "kvm: enabling virtualization on " 2813 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
2671 "CPU%d failed\n", cpu);
2672 } 2814 }
2673} 2815}
2674 2816
@@ -2687,7 +2829,7 @@ static void hardware_disable_nolock(void *junk)
2687 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2829 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2688 return; 2830 return;
2689 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2831 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2690 kvm_arch_hardware_disable(NULL); 2832 kvm_arch_hardware_disable();
2691} 2833}
2692 2834
2693static void hardware_disable(void) 2835static void hardware_disable(void)
@@ -2744,12 +2886,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2744 val &= ~CPU_TASKS_FROZEN; 2886 val &= ~CPU_TASKS_FROZEN;
2745 switch (val) { 2887 switch (val) {
2746 case CPU_DYING: 2888 case CPU_DYING:
2747 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2889 pr_info("kvm: disabling virtualization on CPU%d\n",
2748 cpu); 2890 cpu);
2749 hardware_disable(); 2891 hardware_disable();
2750 break; 2892 break;
2751 case CPU_STARTING: 2893 case CPU_STARTING:
2752 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2894 pr_info("kvm: enabling virtualization on CPU%d\n",
2753 cpu); 2895 cpu);
2754 hardware_enable(); 2896 hardware_enable();
2755 break; 2897 break;
@@ -2766,7 +2908,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2766 * 2908 *
2767 * And Intel TXT required VMX off for all cpu when system shutdown. 2909 * And Intel TXT required VMX off for all cpu when system shutdown.
2768 */ 2910 */
2769 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2911 pr_info("kvm: exiting hardware virtualization\n");
2770 kvm_rebooting = true; 2912 kvm_rebooting = true;
2771 on_each_cpu(hardware_disable_nolock, NULL, 1); 2913 on_each_cpu(hardware_disable_nolock, NULL, 1);
2772 return NOTIFY_OK; 2914 return NOTIFY_OK;
@@ -2790,7 +2932,7 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2790} 2932}
2791 2933
2792static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 2934static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
2793 const struct kvm_io_range *r2) 2935 const struct kvm_io_range *r2)
2794{ 2936{
2795 if (r1->addr < r2->addr) 2937 if (r1->addr < r2->addr)
2796 return -1; 2938 return -1;
@@ -2843,7 +2985,7 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2843 return off; 2985 return off;
2844} 2986}
2845 2987
2846static int __kvm_io_bus_write(struct kvm_io_bus *bus, 2988static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
2847 struct kvm_io_range *range, const void *val) 2989 struct kvm_io_range *range, const void *val)
2848{ 2990{
2849 int idx; 2991 int idx;
@@ -2854,7 +2996,7 @@ static int __kvm_io_bus_write(struct kvm_io_bus *bus,
2854 2996
2855 while (idx < bus->dev_count && 2997 while (idx < bus->dev_count &&
2856 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 2998 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
2857 if (!kvm_iodevice_write(bus->range[idx].dev, range->addr, 2999 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
2858 range->len, val)) 3000 range->len, val))
2859 return idx; 3001 return idx;
2860 idx++; 3002 idx++;
@@ -2864,7 +3006,7 @@ static int __kvm_io_bus_write(struct kvm_io_bus *bus,
2864} 3006}
2865 3007
2866/* kvm_io_bus_write - called under kvm->slots_lock */ 3008/* kvm_io_bus_write - called under kvm->slots_lock */
2867int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3009int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
2868 int len, const void *val) 3010 int len, const void *val)
2869{ 3011{
2870 struct kvm_io_bus *bus; 3012 struct kvm_io_bus *bus;
@@ -2876,14 +3018,14 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2876 .len = len, 3018 .len = len,
2877 }; 3019 };
2878 3020
2879 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3021 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
2880 r = __kvm_io_bus_write(bus, &range, val); 3022 r = __kvm_io_bus_write(vcpu, bus, &range, val);
2881 return r < 0 ? r : 0; 3023 return r < 0 ? r : 0;
2882} 3024}
2883 3025
2884/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3026/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
2885int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3027int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
2886 int len, const void *val, long cookie) 3028 gpa_t addr, int len, const void *val, long cookie)
2887{ 3029{
2888 struct kvm_io_bus *bus; 3030 struct kvm_io_bus *bus;
2889 struct kvm_io_range range; 3031 struct kvm_io_range range;
@@ -2893,12 +3035,12 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2893 .len = len, 3035 .len = len,
2894 }; 3036 };
2895 3037
2896 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3038 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
2897 3039
2898 /* First try the device referenced by cookie. */ 3040 /* First try the device referenced by cookie. */
2899 if ((cookie >= 0) && (cookie < bus->dev_count) && 3041 if ((cookie >= 0) && (cookie < bus->dev_count) &&
2900 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3042 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
2901 if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len, 3043 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
2902 val)) 3044 val))
2903 return cookie; 3045 return cookie;
2904 3046
@@ -2906,11 +3048,11 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2906 * cookie contained garbage; fall back to search and return the 3048 * cookie contained garbage; fall back to search and return the
2907 * correct cookie value. 3049 * correct cookie value.
2908 */ 3050 */
2909 return __kvm_io_bus_write(bus, &range, val); 3051 return __kvm_io_bus_write(vcpu, bus, &range, val);
2910} 3052}
2911 3053
2912static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range, 3054static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
2913 void *val) 3055 struct kvm_io_range *range, void *val)
2914{ 3056{
2915 int idx; 3057 int idx;
2916 3058
@@ -2920,7 +3062,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
2920 3062
2921 while (idx < bus->dev_count && 3063 while (idx < bus->dev_count &&
2922 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3064 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
2923 if (!kvm_iodevice_read(bus->range[idx].dev, range->addr, 3065 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
2924 range->len, val)) 3066 range->len, val))
2925 return idx; 3067 return idx;
2926 idx++; 3068 idx++;
@@ -2931,7 +3073,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
2931EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3073EXPORT_SYMBOL_GPL(kvm_io_bus_write);
2932 3074
2933/* kvm_io_bus_read - called under kvm->slots_lock */ 3075/* kvm_io_bus_read - called under kvm->slots_lock */
2934int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3076int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
2935 int len, void *val) 3077 int len, void *val)
2936{ 3078{
2937 struct kvm_io_bus *bus; 3079 struct kvm_io_bus *bus;
@@ -2943,8 +3085,8 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2943 .len = len, 3085 .len = len,
2944 }; 3086 };
2945 3087
2946 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3088 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
2947 r = __kvm_io_bus_read(bus, &range, val); 3089 r = __kvm_io_bus_read(vcpu, bus, &range, val);
2948 return r < 0 ? r : 0; 3090 return r < 0 ? r : 0;
2949} 3091}
2950 3092
@@ -3114,9 +3256,12 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3114static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3256static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3115{ 3257{
3116 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3258 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3259
3117 if (vcpu->preempted) 3260 if (vcpu->preempted)
3118 vcpu->preempted = false; 3261 vcpu->preempted = false;
3119 3262
3263 kvm_arch_sched_in(vcpu, cpu);
3264
3120 kvm_arch_vcpu_load(vcpu, cpu); 3265 kvm_arch_vcpu_load(vcpu, cpu);
3121} 3266}
3122 3267
@@ -3193,7 +3338,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
3193 3338
3194 r = misc_register(&kvm_dev); 3339 r = misc_register(&kvm_dev);
3195 if (r) { 3340 if (r) {
3196 printk(KERN_ERR "kvm: misc device register failed\n"); 3341 pr_err("kvm: misc device register failed\n");
3197 goto out_unreg; 3342 goto out_unreg;
3198 } 3343 }
3199 3344
@@ -3204,10 +3349,13 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
3204 3349
3205 r = kvm_init_debug(); 3350 r = kvm_init_debug();
3206 if (r) { 3351 if (r) {
3207 printk(KERN_ERR "kvm: create debugfs files failed\n"); 3352 pr_err("kvm: create debugfs files failed\n");
3208 goto out_undebugfs; 3353 goto out_undebugfs;
3209 } 3354 }
3210 3355
3356 r = kvm_vfio_ops_init();
3357 WARN_ON(r);
3358
3211 return 0; 3359 return 0;
3212 3360
3213out_undebugfs: 3361out_undebugfs:
@@ -3248,5 +3396,6 @@ void kvm_exit(void)
3248 kvm_arch_exit(); 3396 kvm_arch_exit();
3249 kvm_irqfd_exit(); 3397 kvm_irqfd_exit();
3250 free_cpumask_var(cpus_hardware_enabled); 3398 free_cpumask_var(cpus_hardware_enabled);
3399 kvm_vfio_ops_exit();
3251} 3400}
3252EXPORT_SYMBOL_GPL(kvm_exit); 3401EXPORT_SYMBOL_GPL(kvm_exit);
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index ba1a93f935c7..620e37f741b8 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/uaccess.h> 19#include <linux/uaccess.h>
20#include <linux/vfio.h> 20#include <linux/vfio.h>
21#include "vfio.h"
21 22
22struct kvm_vfio_group { 23struct kvm_vfio_group {
23 struct list_head node; 24 struct list_head node;
@@ -246,6 +247,16 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
246 kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ 247 kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
247} 248}
248 249
250static int kvm_vfio_create(struct kvm_device *dev, u32 type);
251
252static struct kvm_device_ops kvm_vfio_ops = {
253 .name = "kvm-vfio",
254 .create = kvm_vfio_create,
255 .destroy = kvm_vfio_destroy,
256 .set_attr = kvm_vfio_set_attr,
257 .has_attr = kvm_vfio_has_attr,
258};
259
249static int kvm_vfio_create(struct kvm_device *dev, u32 type) 260static int kvm_vfio_create(struct kvm_device *dev, u32 type)
250{ 261{
251 struct kvm_device *tmp; 262 struct kvm_device *tmp;
@@ -268,10 +279,12 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
268 return 0; 279 return 0;
269} 280}
270 281
271struct kvm_device_ops kvm_vfio_ops = { 282int kvm_vfio_ops_init(void)
272 .name = "kvm-vfio", 283{
273 .create = kvm_vfio_create, 284 return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO);
274 .destroy = kvm_vfio_destroy, 285}
275 .set_attr = kvm_vfio_set_attr, 286
276 .has_attr = kvm_vfio_has_attr, 287void kvm_vfio_ops_exit(void)
277}; 288{
289 kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO);
290}
diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h
new file mode 100644
index 000000000000..ab88c7dc0514
--- /dev/null
+++ b/virt/kvm/vfio.h
@@ -0,0 +1,17 @@
1#ifndef __KVM_VFIO_H
2#define __KVM_VFIO_H
3
4#ifdef CONFIG_KVM_VFIO
5int kvm_vfio_ops_init(void);
6void kvm_vfio_ops_exit(void);
7#else
8static inline int kvm_vfio_ops_init(void)
9{
10 return 0;
11}
12static inline void kvm_vfio_ops_exit(void)
13{
14}
15#endif
16
17#endif
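The new vfio.h above follows the usual conditional-stub pattern: when CONFIG_KVM_VFIO is not set, callers such as kvm_init() still compile and link against inline no-ops. A generic, self-contained sketch of the idea (names invented, collapsed into one file so it builds as-is):

#include <stdio.h>

/* #define CONFIG_DEMO_FEATURE 1 */    /* flip on to require a real implementation */

#ifdef CONFIG_DEMO_FEATURE
int demo_feature_init(void);
void demo_feature_exit(void);
#else
/* feature compiled out: callers still build against no-op stubs */
static inline int demo_feature_init(void) { return 0; }
static inline void demo_feature_exit(void) { }
#endif

int main(void)
{
    printf("init: %d\n", demo_feature_init());
    demo_feature_exit();
    return 0;
}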