author     Jonathan Herman <hermanjl@cs.unc.edu>   2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>   2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /virt
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig              6
-rw-r--r--  virt/kvm/assigned-dev.c     296
-rw-r--r--  virt/kvm/async_pf.c          11
-rw-r--r--  virt/kvm/coalesced_mmio.c   137
-rw-r--r--  virt/kvm/coalesced_mmio.h     7
-rw-r--r--  virt/kvm/eventfd.c          182
-rw-r--r--  virt/kvm/ioapic.c            78
-rw-r--r--  virt/kvm/ioapic.h             5
-rw-r--r--  virt/kvm/iommu.c             79
-rw-r--r--  virt/kvm/irq_comm.c         146
-rw-r--r--  virt/kvm/kvm_main.c        1158
11 files changed, 717 insertions, 1388 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index d01b24b72c6..f63ccb0a598 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,9 +18,3 @@ config KVM_MMIO
 
 config KVM_ASYNC_PF
 	bool
-
-config HAVE_KVM_MSI
-	bool
-
-config HAVE_KVM_CPU_RELAX_INTERCEPT
-	bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 3642239252b..af7910228fb 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -49,157 +49,71 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
49 index = i; 49 index = i;
50 break; 50 break;
51 } 51 }
52 if (index < 0) 52 if (index < 0) {
53 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 53 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
54 return 0;
55 }
54 56
55 return index; 57 return index;
56} 58}
57 59
58static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) 60static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
59{ 61{
60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 62 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 int ret; 63 u32 vector;
64 int index;
62 65
63 spin_lock(&assigned_dev->intx_lock); 66 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
64 if (pci_check_and_mask_intx(assigned_dev->dev)) { 67 spin_lock(&assigned_dev->intx_lock);
68 disable_irq_nosync(irq);
65 assigned_dev->host_irq_disabled = true; 69 assigned_dev->host_irq_disabled = true;
66 ret = IRQ_WAKE_THREAD; 70 spin_unlock(&assigned_dev->intx_lock);
67 } else 71 }
68 ret = IRQ_NONE;
69 spin_unlock(&assigned_dev->intx_lock);
70
71 return ret;
72}
73 72
74static void 73 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
75kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, 74 index = find_index_from_host_irq(assigned_dev, irq);
76 int vector) 75 if (index >= 0) {
77{ 76 vector = assigned_dev->
78 if (unlikely(assigned_dev->irq_requested_type & 77 guest_msix_entries[index].vector;
79 KVM_DEV_IRQ_GUEST_INTX)) {
80 spin_lock(&assigned_dev->intx_mask_lock);
81 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
82 kvm_set_irq(assigned_dev->kvm, 78 kvm_set_irq(assigned_dev->kvm,
83 assigned_dev->irq_source_id, vector, 1); 79 assigned_dev->irq_source_id, vector, 1);
84 spin_unlock(&assigned_dev->intx_mask_lock); 80 }
85 } else 81 } else
86 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 82 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
87 vector, 1); 83 assigned_dev->guest_irq, 1);
88}
89
90static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
91{
92 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
93
94 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
95 spin_lock_irq(&assigned_dev->intx_lock);
96 disable_irq_nosync(irq);
97 assigned_dev->host_irq_disabled = true;
98 spin_unlock_irq(&assigned_dev->intx_lock);
99 }
100
101 kvm_assigned_dev_raise_guest_irq(assigned_dev,
102 assigned_dev->guest_irq);
103 84
104 return IRQ_HANDLED; 85 return IRQ_HANDLED;
105} 86}
106 87
107#ifdef __KVM_HAVE_MSI
108static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
109{
110 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
111 int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
112 assigned_dev->irq_source_id,
113 assigned_dev->guest_irq, 1);
114 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
115}
116
117static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
118{
119 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
120
121 kvm_assigned_dev_raise_guest_irq(assigned_dev,
122 assigned_dev->guest_irq);
123
124 return IRQ_HANDLED;
125}
126#endif
127
128#ifdef __KVM_HAVE_MSIX
129static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
130{
131 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
132 int index = find_index_from_host_irq(assigned_dev, irq);
133 u32 vector;
134 int ret = 0;
135
136 if (index >= 0) {
137 vector = assigned_dev->guest_msix_entries[index].vector;
138 ret = kvm_set_irq_inatomic(assigned_dev->kvm,
139 assigned_dev->irq_source_id,
140 vector, 1);
141 }
142
143 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
144}
145
146static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
147{
148 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
149 int index = find_index_from_host_irq(assigned_dev, irq);
150 u32 vector;
151
152 if (index >= 0) {
153 vector = assigned_dev->guest_msix_entries[index].vector;
154 kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
155 }
156
157 return IRQ_HANDLED;
158}
159#endif
160
161/* Ack the irq line for an assigned device */ 88/* Ack the irq line for an assigned device */
162static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 89static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
163{ 90{
164 struct kvm_assigned_dev_kernel *dev = 91 struct kvm_assigned_dev_kernel *dev;
165 container_of(kian, struct kvm_assigned_dev_kernel,
166 ack_notifier);
167 92
168 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 93 if (kian->gsi == -1)
94 return;
169 95
170 spin_lock(&dev->intx_mask_lock); 96 dev = container_of(kian, struct kvm_assigned_dev_kernel,
171 97 ack_notifier);
172 if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
173 bool reassert = false;
174
175 spin_lock_irq(&dev->intx_lock);
176 /*
177 * The guest IRQ may be shared so this ack can come from an
178 * IRQ for another guest device.
179 */
180 if (dev->host_irq_disabled) {
181 if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
182 enable_irq(dev->host_irq);
183 else if (!pci_check_and_unmask_intx(dev->dev))
184 reassert = true;
185 dev->host_irq_disabled = reassert;
186 }
187 spin_unlock_irq(&dev->intx_lock);
188 98
189 if (reassert) 99 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
190 kvm_set_irq(dev->kvm, dev->irq_source_id,
191 dev->guest_irq, 1);
192 }
193 100
194 spin_unlock(&dev->intx_mask_lock); 101 /* The guest irq may be shared so this ack may be
102 * from another device.
103 */
104 spin_lock(&dev->intx_lock);
105 if (dev->host_irq_disabled) {
106 enable_irq(dev->host_irq);
107 dev->host_irq_disabled = false;
108 }
109 spin_unlock(&dev->intx_lock);
195} 110}
196 111
197static void deassign_guest_irq(struct kvm *kvm, 112static void deassign_guest_irq(struct kvm *kvm,
198 struct kvm_assigned_dev_kernel *assigned_dev) 113 struct kvm_assigned_dev_kernel *assigned_dev)
199{ 114{
200 if (assigned_dev->ack_notifier.gsi != -1) 115 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
201 kvm_unregister_irq_ack_notifier(kvm, 116 assigned_dev->ack_notifier.gsi = -1;
202 &assigned_dev->ack_notifier);
203 117
204 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 118 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
205 assigned_dev->guest_irq, 0); 119 assigned_dev->guest_irq, 0);
@@ -231,7 +145,7 @@ static void deassign_host_irq(struct kvm *kvm,
231 145
232 for (i = 0; i < assigned_dev->entries_nr; i++) 146 for (i = 0; i < assigned_dev->entries_nr; i++)
233 free_irq(assigned_dev->host_msix_entries[i].vector, 147 free_irq(assigned_dev->host_msix_entries[i].vector,
234 assigned_dev); 148 (void *)assigned_dev);
235 149
236 assigned_dev->entries_nr = 0; 150 assigned_dev->entries_nr = 0;
237 kfree(assigned_dev->host_msix_entries); 151 kfree(assigned_dev->host_msix_entries);
@@ -239,17 +153,9 @@ static void deassign_host_irq(struct kvm *kvm,
239 pci_disable_msix(assigned_dev->dev); 153 pci_disable_msix(assigned_dev->dev);
240 } else { 154 } else {
241 /* Deal with MSI and INTx */ 155 /* Deal with MSI and INTx */
242 if ((assigned_dev->irq_requested_type & 156 disable_irq(assigned_dev->host_irq);
243 KVM_DEV_IRQ_HOST_INTX) && 157
244 (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { 158 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
245 spin_lock_irq(&assigned_dev->intx_lock);
246 pci_intx(assigned_dev->dev, false);
247 spin_unlock_irq(&assigned_dev->intx_lock);
248 synchronize_irq(assigned_dev->host_irq);
249 } else
250 disable_irq(assigned_dev->host_irq);
251
252 free_irq(assigned_dev->host_irq, assigned_dev);
253 159
254 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) 160 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
255 pci_disable_msi(assigned_dev->dev); 161 pci_disable_msi(assigned_dev->dev);
@@ -301,8 +207,6 @@ static void kvm_free_assigned_device(struct kvm *kvm,
301 else 207 else
302 pci_restore_state(assigned_dev->dev); 208 pci_restore_state(assigned_dev->dev);
303 209
304 assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
305
306 pci_release_regions(assigned_dev->dev); 210 pci_release_regions(assigned_dev->dev);
307 pci_disable_device(assigned_dev->dev); 211 pci_disable_device(assigned_dev->dev);
308 pci_dev_put(assigned_dev->dev); 212 pci_dev_put(assigned_dev->dev);
@@ -328,34 +232,15 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
328static int assigned_device_enable_host_intx(struct kvm *kvm, 232static int assigned_device_enable_host_intx(struct kvm *kvm,
329 struct kvm_assigned_dev_kernel *dev) 233 struct kvm_assigned_dev_kernel *dev)
330{ 234{
331 irq_handler_t irq_handler;
332 unsigned long flags;
333
334 dev->host_irq = dev->dev->irq; 235 dev->host_irq = dev->dev->irq;
335 236 /* Even though this is PCI, we don't want to use shared
336 /* 237 * interrupts. Sharing host devices with guest-assigned devices
337 * We can only share the IRQ line with other host devices if we are 238 * on the same interrupt line is not a happy situation: there
338 * able to disable the IRQ source at device-level - independently of 239 * are going to be long delays in accepting, acking, etc.
339 * the guest driver. Otherwise host devices may suffer from unbounded
340 * IRQ latencies when the guest keeps the line asserted.
341 */ 240 */
342 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { 241 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
343 irq_handler = kvm_assigned_dev_intx; 242 IRQF_ONESHOT, dev->irq_name, (void *)dev))
344 flags = IRQF_SHARED;
345 } else {
346 irq_handler = NULL;
347 flags = IRQF_ONESHOT;
348 }
349 if (request_threaded_irq(dev->host_irq, irq_handler,
350 kvm_assigned_dev_thread_intx, flags,
351 dev->irq_name, dev))
352 return -EIO; 243 return -EIO;
353
354 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
355 spin_lock_irq(&dev->intx_lock);
356 pci_intx(dev->dev, true);
357 spin_unlock_irq(&dev->intx_lock);
358 }
359 return 0; 244 return 0;
360} 245}
361 246
@@ -372,9 +257,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
372 } 257 }
373 258
374 dev->host_irq = dev->dev->irq; 259 dev->host_irq = dev->dev->irq;
375 if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, 260 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
376 kvm_assigned_dev_thread_msi, 0, 261 0, dev->irq_name, (void *)dev)) {
377 dev->irq_name, dev)) {
378 pci_disable_msi(dev->dev); 262 pci_disable_msi(dev->dev);
379 return -EIO; 263 return -EIO;
380 } 264 }
@@ -400,9 +284,8 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
400 284
401 for (i = 0; i < dev->entries_nr; i++) { 285 for (i = 0; i < dev->entries_nr; i++) {
402 r = request_threaded_irq(dev->host_msix_entries[i].vector, 286 r = request_threaded_irq(dev->host_msix_entries[i].vector,
403 kvm_assigned_dev_msix, 287 NULL, kvm_assigned_dev_thread,
404 kvm_assigned_dev_thread_msix, 288 0, dev->irq_name, (void *)dev);
405 0, dev->irq_name, dev);
406 if (r) 289 if (r)
407 goto err; 290 goto err;
408 } 291 }
@@ -410,7 +293,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
410 return 0; 293 return 0;
411err: 294err:
412 for (i -= 1; i >= 0; i--) 295 for (i -= 1; i >= 0; i--)
413 free_irq(dev->host_msix_entries[i].vector, dev); 296 free_irq(dev->host_msix_entries[i].vector, (void *)dev);
414 pci_disable_msix(dev->dev); 297 pci_disable_msix(dev->dev);
415 return r; 298 return r;
416} 299}
@@ -433,6 +316,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
433{ 316{
434 dev->guest_irq = irq->guest_irq; 317 dev->guest_irq = irq->guest_irq;
435 dev->ack_notifier.gsi = -1; 318 dev->ack_notifier.gsi = -1;
319 dev->host_irq_disabled = false;
436 return 0; 320 return 0;
437} 321}
438#endif 322#endif
@@ -444,6 +328,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
444{ 328{
445 dev->guest_irq = irq->guest_irq; 329 dev->guest_irq = irq->guest_irq;
446 dev->ack_notifier.gsi = -1; 330 dev->ack_notifier.gsi = -1;
331 dev->host_irq_disabled = false;
447 return 0; 332 return 0;
448} 333}
449#endif 334#endif
@@ -477,7 +362,6 @@ static int assign_host_irq(struct kvm *kvm,
477 default: 362 default:
478 r = -EINVAL; 363 r = -EINVAL;
479 } 364 }
480 dev->host_irq_disabled = false;
481 365
482 if (!r) 366 if (!r)
483 dev->irq_requested_type |= host_irq_type; 367 dev->irq_requested_type |= host_irq_type;
@@ -522,8 +406,7 @@ static int assign_guest_irq(struct kvm *kvm,
522 406
523 if (!r) { 407 if (!r) {
524 dev->irq_requested_type |= guest_irq_type; 408 dev->irq_requested_type |= guest_irq_type;
525 if (dev->ack_notifier.gsi != -1) 409 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
526 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
527 } else 410 } else
528 kvm_free_irq_source_id(kvm, dev->irq_source_id); 411 kvm_free_irq_source_id(kvm, dev->irq_source_id);
529 412
@@ -579,7 +462,6 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
579{ 462{
580 int r = -ENODEV; 463 int r = -ENODEV;
581 struct kvm_assigned_dev_kernel *match; 464 struct kvm_assigned_dev_kernel *match;
582 unsigned long irq_type;
583 465
584 mutex_lock(&kvm->lock); 466 mutex_lock(&kvm->lock);
585 467
@@ -588,9 +470,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
588 if (!match) 470 if (!match)
589 goto out; 471 goto out;
590 472
591 irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | 473 r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
592 KVM_DEV_IRQ_GUEST_MASK);
593 r = kvm_deassign_irq(kvm, match, irq_type);
594out: 474out:
595 mutex_unlock(&kvm->lock); 475 mutex_unlock(&kvm->lock);
596 return r; 476 return r;
@@ -662,6 +542,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
662 int r = 0, idx; 542 int r = 0, idx;
663 struct kvm_assigned_dev_kernel *match; 543 struct kvm_assigned_dev_kernel *match;
664 struct pci_dev *dev; 544 struct pci_dev *dev;
545 u8 header_type;
665 546
666 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) 547 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
667 return -EINVAL; 548 return -EINVAL;
@@ -694,7 +575,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
694 } 575 }
695 576
696 /* Don't allow bridges to be assigned */ 577 /* Don't allow bridges to be assigned */
697 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { 578 pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type);
579 if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) {
698 r = -EPERM; 580 r = -EPERM;
699 goto out_put; 581 goto out_put;
700 } 582 }
@@ -721,10 +603,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
721 if (!match->pci_saved_state) 603 if (!match->pci_saved_state)
722 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", 604 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
723 __func__, dev_name(&dev->dev)); 605 __func__, dev_name(&dev->dev));
724
725 if (!pci_intx_mask_supported(dev))
726 assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
727
728 match->assigned_dev_id = assigned_dev->assigned_dev_id; 606 match->assigned_dev_id = assigned_dev->assigned_dev_id;
729 match->host_segnr = assigned_dev->segnr; 607 match->host_segnr = assigned_dev->segnr;
730 match->host_busnr = assigned_dev->busnr; 608 match->host_busnr = assigned_dev->busnr;
@@ -732,7 +610,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
732 match->flags = assigned_dev->flags; 610 match->flags = assigned_dev->flags;
733 match->dev = dev; 611 match->dev = dev;
734 spin_lock_init(&match->intx_lock); 612 spin_lock_init(&match->intx_lock);
735 spin_lock_init(&match->intx_mask_lock);
736 match->irq_source_id = -1; 613 match->irq_source_id = -1;
737 match->kvm = kvm; 614 match->kvm = kvm;
738 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 615 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -878,55 +755,6 @@ msix_entry_out:
878} 755}
879#endif 756#endif
880 757
881static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
882 struct kvm_assigned_pci_dev *assigned_dev)
883{
884 int r = 0;
885 struct kvm_assigned_dev_kernel *match;
886
887 mutex_lock(&kvm->lock);
888
889 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
890 assigned_dev->assigned_dev_id);
891 if (!match) {
892 r = -ENODEV;
893 goto out;
894 }
895
896 spin_lock(&match->intx_mask_lock);
897
898 match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
899 match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
900
901 if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
902 if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
903 kvm_set_irq(match->kvm, match->irq_source_id,
904 match->guest_irq, 0);
905 /*
906 * Masking at hardware-level is performed on demand,
907 * i.e. when an IRQ actually arrives at the host.
908 */
909 } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
910 /*
911 * Unmask the IRQ line if required. Unmasking at
912 * device level will be performed by user space.
913 */
914 spin_lock_irq(&match->intx_lock);
915 if (match->host_irq_disabled) {
916 enable_irq(match->host_irq);
917 match->host_irq_disabled = false;
918 }
919 spin_unlock_irq(&match->intx_lock);
920 }
921 }
922
923 spin_unlock(&match->intx_mask_lock);
924
925out:
926 mutex_unlock(&kvm->lock);
927 return r;
928}
929
930long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 758long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
931 unsigned long arg) 759 unsigned long arg)
932{ 760{
@@ -1034,15 +862,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1034 break; 862 break;
1035 } 863 }
1036#endif 864#endif
1037 case KVM_ASSIGN_SET_INTX_MASK: {
1038 struct kvm_assigned_pci_dev assigned_dev;
1039
1040 r = -EFAULT;
1041 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1042 goto out;
1043 r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
1044 break;
1045 }
1046 default: 865 default:
1047 r = -ENOTTY; 866 r = -ENOTTY;
1048 break; 867 break;
@@ -1050,3 +869,4 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1050out: 869out:
1051 return r; 870 return r;
1052} 871}
872
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index ea475cd0351..74268b4c2ee 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
111 list_entry(vcpu->async_pf.done.next, 111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link); 112 typeof(*work), link);
113 list_del(&work->link); 113 list_del(&work->link);
114 if (!is_error_page(work->page)) 114 if (work->page)
115 kvm_release_page_clean(work->page); 115 put_page(work->page);
116 kmem_cache_free(async_pf_cache, work); 116 kmem_cache_free(async_pf_cache, work);
117 } 117 }
118 spin_unlock(&vcpu->async_pf.lock); 118 spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
138 138
139 list_del(&work->queue); 139 list_del(&work->queue);
140 vcpu->async_pf.queued--; 140 vcpu->async_pf.queued--;
141 if (!is_error_page(work->page)) 141 if (work->page)
142 kvm_release_page_clean(work->page); 142 put_page(work->page);
143 kmem_cache_free(async_pf_cache, work); 143 kmem_cache_free(async_pf_cache, work);
144 } 144 }
145} 145}
@@ -203,7 +203,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
203 if (!work) 203 if (!work)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
206 work->page = KVM_ERR_PTR_BAD_PAGE; 206 work->page = bad_page;
207 get_page(bad_page);
207 INIT_LIST_HEAD(&work->queue); /* for list_del to work */ 208 INIT_LIST_HEAD(&work->queue); /* for list_del to work */
208 209
209 spin_lock(&vcpu->async_pf.lock); 210 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 88b2fe3ddf4..fc8487564d1 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -24,25 +24,10 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
24static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, 24static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
25 gpa_t addr, int len) 25 gpa_t addr, int len)
26{ 26{
27 /* is it in a batchable area ? 27 struct kvm_coalesced_mmio_zone *zone;
28 * (addr,len) is fully included in
29 * (zone->addr, zone->size)
30 */
31 if (len < 0)
32 return 0;
33 if (addr + len < addr)
34 return 0;
35 if (addr < dev->zone.addr)
36 return 0;
37 if (addr + len > dev->zone.addr + dev->zone.size)
38 return 0;
39 return 1;
40}
41
42static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
43{
44 struct kvm_coalesced_mmio_ring *ring; 28 struct kvm_coalesced_mmio_ring *ring;
45 unsigned avail; 29 unsigned avail;
30 int i;
46 31
47 /* Are we able to batch it ? */ 32 /* Are we able to batch it ? */
48 33
@@ -52,12 +37,25 @@ static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
52 */ 37 */
53 ring = dev->kvm->coalesced_mmio_ring; 38 ring = dev->kvm->coalesced_mmio_ring;
54 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; 39 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
55 if (avail == 0) { 40 if (avail < KVM_MAX_VCPUS) {
56 /* full */ 41 /* full */
57 return 0; 42 return 0;
58 } 43 }
59 44
60 return 1; 45 /* is it in a batchable area ? */
46
47 for (i = 0; i < dev->nb_zones; i++) {
48 zone = &dev->zone[i];
49
50 /* (addr,len) is fully included in
51 * (zone->addr, zone->size)
52 */
53
54 if (zone->addr <= addr &&
55 addr + len <= zone->addr + zone->size)
56 return 1;
57 }
58 return 0;
61} 59}
62 60
63static int coalesced_mmio_write(struct kvm_io_device *this, 61static int coalesced_mmio_write(struct kvm_io_device *this,
@@ -65,16 +63,10 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
65{ 63{
66 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 64 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
67 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; 65 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
68
69 if (!coalesced_mmio_in_range(dev, addr, len)) 66 if (!coalesced_mmio_in_range(dev, addr, len))
70 return -EOPNOTSUPP; 67 return -EOPNOTSUPP;
71 68
72 spin_lock(&dev->kvm->ring_lock); 69 spin_lock(&dev->lock);
73
74 if (!coalesced_mmio_has_room(dev)) {
75 spin_unlock(&dev->kvm->ring_lock);
76 return -EOPNOTSUPP;
77 }
78 70
79 /* copy data in first free entry of the ring */ 71 /* copy data in first free entry of the ring */
80 72
@@ -83,7 +75,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
83 memcpy(ring->coalesced_mmio[ring->last].data, val, len); 75 memcpy(ring->coalesced_mmio[ring->last].data, val, len);
84 smp_wmb(); 76 smp_wmb();
85 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; 77 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
86 spin_unlock(&dev->kvm->ring_lock); 78 spin_unlock(&dev->lock);
87 return 0; 79 return 0;
88} 80}
89 81
@@ -91,8 +83,6 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
91{ 83{
92 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 84 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
93 85
94 list_del(&dev->list);
95
96 kfree(dev); 86 kfree(dev);
97} 87}
98 88
@@ -103,6 +93,7 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
103 93
104int kvm_coalesced_mmio_init(struct kvm *kvm) 94int kvm_coalesced_mmio_init(struct kvm *kvm)
105{ 95{
96 struct kvm_coalesced_mmio_dev *dev;
106 struct page *page; 97 struct page *page;
107 int ret; 98 int ret;
108 99
@@ -110,18 +101,31 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
110 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 101 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
111 if (!page) 102 if (!page)
112 goto out_err; 103 goto out_err;
113
114 ret = 0;
115 kvm->coalesced_mmio_ring = page_address(page); 104 kvm->coalesced_mmio_ring = page_address(page);
116 105
117 /* 106 ret = -ENOMEM;
118 * We're using this spinlock to sync access to the coalesced ring. 107 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
119 * The list doesn't need it's own lock since device registration and 108 if (!dev)
120 * unregistration should only happen when kvm->slots_lock is held. 109 goto out_free_page;
121 */ 110 spin_lock_init(&dev->lock);
122 spin_lock_init(&kvm->ring_lock); 111 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
123 INIT_LIST_HEAD(&kvm->coalesced_zones); 112 dev->kvm = kvm;
113 kvm->coalesced_mmio_dev = dev;
114
115 mutex_lock(&kvm->slots_lock);
116 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
117 mutex_unlock(&kvm->slots_lock);
118 if (ret < 0)
119 goto out_free_dev;
124 120
121 return ret;
122
123out_free_dev:
124 kvm->coalesced_mmio_dev = NULL;
125 kfree(dev);
126out_free_page:
127 kvm->coalesced_mmio_ring = NULL;
128 __free_page(page);
125out_err: 129out_err:
126 return ret; 130 return ret;
127} 131}
@@ -135,50 +139,51 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
135int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, 139int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
136 struct kvm_coalesced_mmio_zone *zone) 140 struct kvm_coalesced_mmio_zone *zone)
137{ 141{
138 int ret; 142 struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
139 struct kvm_coalesced_mmio_dev *dev;
140
141 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
142 if (!dev)
143 return -ENOMEM;
144 143
145 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); 144 if (dev == NULL)
146 dev->kvm = kvm; 145 return -ENXIO;
147 dev->zone = *zone;
148 146
149 mutex_lock(&kvm->slots_lock); 147 mutex_lock(&kvm->slots_lock);
150 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, 148 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
151 zone->size, &dev->dev); 149 mutex_unlock(&kvm->slots_lock);
152 if (ret < 0) 150 return -ENOBUFS;
153 goto out_free_dev; 151 }
154 list_add_tail(&dev->list, &kvm->coalesced_zones);
155 mutex_unlock(&kvm->slots_lock);
156 152
157 return ret; 153 dev->zone[dev->nb_zones] = *zone;
154 dev->nb_zones++;
158 155
159out_free_dev:
160 mutex_unlock(&kvm->slots_lock); 156 mutex_unlock(&kvm->slots_lock);
161
162 kfree(dev);
163
164 if (dev == NULL)
165 return -ENXIO;
166
167 return 0; 157 return 0;
168} 158}
169 159
170int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, 160int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
171 struct kvm_coalesced_mmio_zone *zone) 161 struct kvm_coalesced_mmio_zone *zone)
172{ 162{
173 struct kvm_coalesced_mmio_dev *dev, *tmp; 163 int i;
164 struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
165 struct kvm_coalesced_mmio_zone *z;
166
167 if (dev == NULL)
168 return -ENXIO;
174 169
175 mutex_lock(&kvm->slots_lock); 170 mutex_lock(&kvm->slots_lock);
176 171
177 list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) 172 i = dev->nb_zones;
178 if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { 173 while (i) {
179 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); 174 z = &dev->zone[i - 1];
180 kvm_iodevice_destructor(&dev->dev); 175
176 /* unregister all zones
177 * included in (zone->addr, zone->size)
178 */
179
180 if (zone->addr <= z->addr &&
181 z->addr + z->size <= zone->addr + zone->size) {
182 dev->nb_zones--;
183 *z = dev->zone[dev->nb_zones];
181 } 184 }
185 i--;
186 }
182 187
183 mutex_unlock(&kvm->slots_lock); 188 mutex_unlock(&kvm->slots_lock);
184 189
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index b280c20444d..8a5959e3535 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,13 +12,14 @@
12 12
13#ifdef CONFIG_KVM_MMIO 13#ifdef CONFIG_KVM_MMIO
14 14
15#include <linux/list.h> 15#define KVM_COALESCED_MMIO_ZONE_MAX 100
16 16
17struct kvm_coalesced_mmio_dev { 17struct kvm_coalesced_mmio_dev {
18 struct list_head list;
19 struct kvm_io_device dev; 18 struct kvm_io_device dev;
20 struct kvm *kvm; 19 struct kvm *kvm;
21 struct kvm_coalesced_mmio_zone zone; 20 spinlock_t lock;
21 int nb_zones;
22 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
22}; 23};
23 24
24int kvm_coalesced_mmio_init(struct kvm *kvm); 25int kvm_coalesced_mmio_init(struct kvm *kvm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b6eea5cc7b3..73358d256fa 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -35,7 +35,6 @@
35 35
36#include "iodev.h" 36#include "iodev.h"
37 37
38#ifdef __KVM_HAVE_IOAPIC
39/* 38/*
40 * -------------------------------------------------------------------- 39 * --------------------------------------------------------------------
41 * irqfd: Allows an fd to be used to inject an interrupt to the guest 40 * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -44,31 +43,6 @@
44 * -------------------------------------------------------------------- 43 * --------------------------------------------------------------------
45 */ 44 */
46 45
47/*
48 * Resampling irqfds are a special variety of irqfds used to emulate
49 * level triggered interrupts. The interrupt is asserted on eventfd
50 * trigger. On acknowledgement through the irq ack notifier, the
51 * interrupt is de-asserted and userspace is notified through the
52 * resamplefd. All resamplers on the same gsi are de-asserted
53 * together, so we don't need to track the state of each individual
54 * user. We can also therefore share the same irq source ID.
55 */
56struct _irqfd_resampler {
57 struct kvm *kvm;
58 /*
59 * List of resampling struct _irqfd objects sharing this gsi.
60 * RCU list modified under kvm->irqfds.resampler_lock
61 */
62 struct list_head list;
63 struct kvm_irq_ack_notifier notifier;
64 /*
65 * Entry in list of kvm->irqfd.resampler_list. Use for sharing
66 * resamplers among irqfds on the same gsi.
67 * Accessed and modified under kvm->irqfds.resampler_lock
68 */
69 struct list_head link;
70};
71
72struct _irqfd { 46struct _irqfd {
73 /* Used for MSI fast-path */ 47 /* Used for MSI fast-path */
74 struct kvm *kvm; 48 struct kvm *kvm;
@@ -78,12 +52,6 @@ struct _irqfd {
78 /* Used for level IRQ fast-path */ 52 /* Used for level IRQ fast-path */
79 int gsi; 53 int gsi;
80 struct work_struct inject; 54 struct work_struct inject;
81 /* The resampler used by this irqfd (resampler-only) */
82 struct _irqfd_resampler *resampler;
83 /* Eventfd notified on resample (resampler-only) */
84 struct eventfd_ctx *resamplefd;
85 /* Entry in list of irqfds for a resampler (resampler-only) */
86 struct list_head resampler_link;
87 /* Used for setup/shutdown */ 55 /* Used for setup/shutdown */
88 struct eventfd_ctx *eventfd; 56 struct eventfd_ctx *eventfd;
89 struct list_head list; 57 struct list_head list;
@@ -99,58 +67,8 @@ irqfd_inject(struct work_struct *work)
99 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 67 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
100 struct kvm *kvm = irqfd->kvm; 68 struct kvm *kvm = irqfd->kvm;
101 69
102 if (!irqfd->resampler) { 70 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
103 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 71 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
104 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
105 } else
106 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
107 irqfd->gsi, 1);
108}
109
110/*
111 * Since resampler irqfds share an IRQ source ID, we de-assert once
112 * then notify all of the resampler irqfds using this GSI. We can't
113 * do multiple de-asserts or we risk racing with incoming re-asserts.
114 */
115static void
116irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
117{
118 struct _irqfd_resampler *resampler;
119 struct _irqfd *irqfd;
120
121 resampler = container_of(kian, struct _irqfd_resampler, notifier);
122
123 kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
124 resampler->notifier.gsi, 0);
125
126 rcu_read_lock();
127
128 list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
129 eventfd_signal(irqfd->resamplefd, 1);
130
131 rcu_read_unlock();
132}
133
134static void
135irqfd_resampler_shutdown(struct _irqfd *irqfd)
136{
137 struct _irqfd_resampler *resampler = irqfd->resampler;
138 struct kvm *kvm = resampler->kvm;
139
140 mutex_lock(&kvm->irqfds.resampler_lock);
141
142 list_del_rcu(&irqfd->resampler_link);
143 synchronize_rcu();
144
145 if (list_empty(&resampler->list)) {
146 list_del(&resampler->link);
147 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
148 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
149 resampler->notifier.gsi, 0);
150 kfree(resampler);
151 }
152
153 mutex_unlock(&kvm->irqfds.resampler_lock);
154} 72}
155 73
156/* 74/*
@@ -172,12 +90,7 @@ irqfd_shutdown(struct work_struct *work)
172 * We know no new events will be scheduled at this point, so block 90 * We know no new events will be scheduled at this point, so block
173 * until all previously outstanding events have completed 91 * until all previously outstanding events have completed
174 */ 92 */
175 flush_work(&irqfd->inject); 93 flush_work_sync(&irqfd->inject);
176
177 if (irqfd->resampler) {
178 irqfd_resampler_shutdown(irqfd);
179 eventfd_ctx_put(irqfd->resamplefd);
180 }
181 94
182 /* 95 /*
183 * It is now safe to release the object's resources 96 * It is now safe to release the object's resources
@@ -285,12 +198,12 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
285} 198}
286 199
287static int 200static int
288kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) 201kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
289{ 202{
290 struct kvm_irq_routing_table *irq_rt; 203 struct kvm_irq_routing_table *irq_rt;
291 struct _irqfd *irqfd, *tmp; 204 struct _irqfd *irqfd, *tmp;
292 struct file *file = NULL; 205 struct file *file = NULL;
293 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; 206 struct eventfd_ctx *eventfd = NULL;
294 int ret; 207 int ret;
295 unsigned int events; 208 unsigned int events;
296 209
@@ -299,12 +212,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
299 return -ENOMEM; 212 return -ENOMEM;
300 213
301 irqfd->kvm = kvm; 214 irqfd->kvm = kvm;
302 irqfd->gsi = args->gsi; 215 irqfd->gsi = gsi;
303 INIT_LIST_HEAD(&irqfd->list); 216 INIT_LIST_HEAD(&irqfd->list);
304 INIT_WORK(&irqfd->inject, irqfd_inject); 217 INIT_WORK(&irqfd->inject, irqfd_inject);
305 INIT_WORK(&irqfd->shutdown, irqfd_shutdown); 218 INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
306 219
307 file = eventfd_fget(args->fd); 220 file = eventfd_fget(fd);
308 if (IS_ERR(file)) { 221 if (IS_ERR(file)) {
309 ret = PTR_ERR(file); 222 ret = PTR_ERR(file);
310 goto fail; 223 goto fail;
@@ -318,54 +231,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
318 231
319 irqfd->eventfd = eventfd; 232 irqfd->eventfd = eventfd;
320 233
321 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
322 struct _irqfd_resampler *resampler;
323
324 resamplefd = eventfd_ctx_fdget(args->resamplefd);
325 if (IS_ERR(resamplefd)) {
326 ret = PTR_ERR(resamplefd);
327 goto fail;
328 }
329
330 irqfd->resamplefd = resamplefd;
331 INIT_LIST_HEAD(&irqfd->resampler_link);
332
333 mutex_lock(&kvm->irqfds.resampler_lock);
334
335 list_for_each_entry(resampler,
336 &kvm->irqfds.resampler_list, link) {
337 if (resampler->notifier.gsi == irqfd->gsi) {
338 irqfd->resampler = resampler;
339 break;
340 }
341 }
342
343 if (!irqfd->resampler) {
344 resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
345 if (!resampler) {
346 ret = -ENOMEM;
347 mutex_unlock(&kvm->irqfds.resampler_lock);
348 goto fail;
349 }
350
351 resampler->kvm = kvm;
352 INIT_LIST_HEAD(&resampler->list);
353 resampler->notifier.gsi = irqfd->gsi;
354 resampler->notifier.irq_acked = irqfd_resampler_ack;
355 INIT_LIST_HEAD(&resampler->link);
356
357 list_add(&resampler->link, &kvm->irqfds.resampler_list);
358 kvm_register_irq_ack_notifier(kvm,
359 &resampler->notifier);
360 irqfd->resampler = resampler;
361 }
362
363 list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
364 synchronize_rcu();
365
366 mutex_unlock(&kvm->irqfds.resampler_lock);
367 }
368
369 /* 234 /*
370 * Install our own custom wake-up handling so we are notified via 235 * Install our own custom wake-up handling so we are notified via
371 * a callback whenever someone signals the underlying eventfd 236 * a callback whenever someone signals the underlying eventfd
@@ -411,12 +276,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
411 return 0; 276 return 0;
412 277
413fail: 278fail:
414 if (irqfd->resampler)
415 irqfd_resampler_shutdown(irqfd);
416
417 if (resamplefd && !IS_ERR(resamplefd))
418 eventfd_ctx_put(resamplefd);
419
420 if (eventfd && !IS_ERR(eventfd)) 279 if (eventfd && !IS_ERR(eventfd))
421 eventfd_ctx_put(eventfd); 280 eventfd_ctx_put(eventfd);
422 281
@@ -426,38 +285,32 @@ fail:
426 kfree(irqfd); 285 kfree(irqfd);
427 return ret; 286 return ret;
428} 287}
429#endif
430 288
431void 289void
432kvm_eventfd_init(struct kvm *kvm) 290kvm_eventfd_init(struct kvm *kvm)
433{ 291{
434#ifdef __KVM_HAVE_IOAPIC
435 spin_lock_init(&kvm->irqfds.lock); 292 spin_lock_init(&kvm->irqfds.lock);
436 INIT_LIST_HEAD(&kvm->irqfds.items); 293 INIT_LIST_HEAD(&kvm->irqfds.items);
437 INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
438 mutex_init(&kvm->irqfds.resampler_lock);
439#endif
440 INIT_LIST_HEAD(&kvm->ioeventfds); 294 INIT_LIST_HEAD(&kvm->ioeventfds);
441} 295}
442 296
443#ifdef __KVM_HAVE_IOAPIC
444/* 297/*
445 * shutdown any irqfd's that match fd+gsi 298 * shutdown any irqfd's that match fd+gsi
446 */ 299 */
447static int 300static int
448kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) 301kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
449{ 302{
450 struct _irqfd *irqfd, *tmp; 303 struct _irqfd *irqfd, *tmp;
451 struct eventfd_ctx *eventfd; 304 struct eventfd_ctx *eventfd;
452 305
453 eventfd = eventfd_ctx_fdget(args->fd); 306 eventfd = eventfd_ctx_fdget(fd);
454 if (IS_ERR(eventfd)) 307 if (IS_ERR(eventfd))
455 return PTR_ERR(eventfd); 308 return PTR_ERR(eventfd);
456 309
457 spin_lock_irq(&kvm->irqfds.lock); 310 spin_lock_irq(&kvm->irqfds.lock);
458 311
459 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 312 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
460 if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { 313 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
461 /* 314 /*
462 * This rcu_assign_pointer is needed for when 315 * This rcu_assign_pointer is needed for when
463 * another thread calls kvm_irq_routing_update before 316 * another thread calls kvm_irq_routing_update before
@@ -485,15 +338,12 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
485} 338}
486 339
487int 340int
488kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) 341kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
489{ 342{
490 if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) 343 if (flags & KVM_IRQFD_FLAG_DEASSIGN)
491 return -EINVAL; 344 return kvm_irqfd_deassign(kvm, fd, gsi);
492
493 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
494 return kvm_irqfd_deassign(kvm, args);
495 345
496 return kvm_irqfd_assign(kvm, args); 346 return kvm_irqfd_assign(kvm, fd, gsi);
497} 347}
498 348
499/* 349/*
@@ -560,7 +410,6 @@ static void __exit irqfd_module_exit(void)
560 410
561module_init(irqfd_module_init); 411module_init(irqfd_module_init);
562module_exit(irqfd_module_exit); 412module_exit(irqfd_module_exit);
563#endif
564 413
565/* 414/*
566 * -------------------------------------------------------------------- 415 * --------------------------------------------------------------------
@@ -737,8 +586,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
737 586
738 kvm_iodevice_init(&p->dev, &ioeventfd_ops); 587 kvm_iodevice_init(&p->dev, &ioeventfd_ops);
739 588
740 ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, 589 ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
741 &p->dev);
742 if (ret < 0) 590 if (ret < 0)
743 goto unlock_fail; 591 goto unlock_fail;
744 592
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index cfb7e4d52dc..8df1ca104a7 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -185,56 +185,42 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
185 irqe.dest_mode = 0; /* Physical mode. */ 185 irqe.dest_mode = 0; /* Physical mode. */
186 /* need to read apic_id from apic regiest since 186 /* need to read apic_id from apic regiest since
187 * it can be rewritten */ 187 * it can be rewritten */
188 irqe.dest_id = ioapic->kvm->bsp_vcpu_id; 188 irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
189 } 189 }
190#endif 190#endif
191 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); 191 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
192} 192}
193 193
194int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 194int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
195 int level)
196{ 195{
197 u32 old_irr; 196 u32 old_irr;
198 u32 mask = 1 << irq; 197 u32 mask = 1 << irq;
199 union kvm_ioapic_redirect_entry entry; 198 union kvm_ioapic_redirect_entry entry;
200 int ret, irq_level; 199 int ret = 1;
201
202 BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
203 200
204 spin_lock(&ioapic->lock); 201 spin_lock(&ioapic->lock);
205 old_irr = ioapic->irr; 202 old_irr = ioapic->irr;
206 irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], 203 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
207 irq_source_id, level); 204 entry = ioapic->redirtbl[irq];
208 entry = ioapic->redirtbl[irq]; 205 level ^= entry.fields.polarity;
209 irq_level ^= entry.fields.polarity; 206 if (!level)
210 if (!irq_level) { 207 ioapic->irr &= ~mask;
211 ioapic->irr &= ~mask; 208 else {
212 ret = 1; 209 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
213 } else { 210 ioapic->irr |= mask;
214 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); 211 if ((edge && old_irr != ioapic->irr) ||
215 ioapic->irr |= mask; 212 (!edge && !entry.fields.remote_irr))
216 if ((edge && old_irr != ioapic->irr) || 213 ret = ioapic_service(ioapic, irq);
217 (!edge && !entry.fields.remote_irr)) 214 else
218 ret = ioapic_service(ioapic, irq); 215 ret = 0; /* report coalesced interrupt */
219 else 216 }
220 ret = 0; /* report coalesced interrupt */ 217 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
221 } 218 }
222 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
223 spin_unlock(&ioapic->lock); 219 spin_unlock(&ioapic->lock);
224 220
225 return ret; 221 return ret;
226} 222}
227 223
228void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
229{
230 int i;
231
232 spin_lock(&ioapic->lock);
233 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
234 __clear_bit(irq_source_id, &ioapic->irq_states[i]);
235 spin_unlock(&ioapic->lock);
236}
237
238static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, 224static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
239 int trigger_mode) 225 int trigger_mode)
240{ 226{
@@ -268,17 +254,13 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
268 } 254 }
269} 255}
270 256
271bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
272{
273 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
274 smp_rmb();
275 return test_bit(vector, ioapic->handled_vectors);
276}
277
278void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) 257void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
279{ 258{
280 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 259 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
281 260
261 smp_rmb();
262 if (!test_bit(vector, ioapic->handled_vectors))
263 return;
282 spin_lock(&ioapic->lock); 264 spin_lock(&ioapic->lock);
283 __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); 265 __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
284 spin_unlock(&ioapic->lock); 266 spin_unlock(&ioapic->lock);
@@ -350,18 +332,9 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
350 (void*)addr, len, val); 332 (void*)addr, len, val);
351 ASSERT(!(addr & 0xf)); /* check alignment */ 333 ASSERT(!(addr & 0xf)); /* check alignment */
352 334
353 switch (len) { 335 if (len == 4 || len == 8)
354 case 8:
355 case 4:
356 data = *(u32 *) val; 336 data = *(u32 *) val;
357 break; 337 else {
358 case 2:
359 data = *(u16 *) val;
360 break;
361 case 1:
362 data = *(u8 *) val;
363 break;
364 default:
365 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); 338 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
366 return 0; 339 return 0;
367 } 340 }
@@ -370,7 +343,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
370 spin_lock(&ioapic->lock); 343 spin_lock(&ioapic->lock);
371 switch (addr) { 344 switch (addr) {
372 case IOAPIC_REG_SELECT: 345 case IOAPIC_REG_SELECT:
373 ioapic->ioregsel = data & 0xFF; /* 8-bit register */ 346 ioapic->ioregsel = data;
374 break; 347 break;
375 348
376 case IOAPIC_REG_WINDOW: 349 case IOAPIC_REG_WINDOW:
@@ -421,8 +394,7 @@ int kvm_ioapic_init(struct kvm *kvm)
421 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); 394 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
422 ioapic->kvm = kvm; 395 ioapic->kvm = kvm;
423 mutex_lock(&kvm->slots_lock); 396 mutex_lock(&kvm->slots_lock);
424 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, 397 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
425 IOAPIC_MEM_LENGTH, &ioapic->dev);
426 mutex_unlock(&kvm->slots_lock); 398 mutex_unlock(&kvm->slots_lock);
427 if (ret < 0) { 399 if (ret < 0) {
428 kvm->arch.vioapic = NULL; 400 kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a30abfe6ed1..0b190c34ccc 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,12 +71,9 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
71 int short_hand, int dest, int dest_mode); 71 int short_hand, int dest, int dest_mode);
72int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 72int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
73void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); 73void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
74bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
75int kvm_ioapic_init(struct kvm *kvm); 74int kvm_ioapic_init(struct kvm *kvm);
76void kvm_ioapic_destroy(struct kvm *kvm); 75void kvm_ioapic_destroy(struct kvm *kvm);
77int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 76int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
78 int level);
79void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
80void kvm_ioapic_reset(struct kvm_ioapic *ioapic); 77void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
81int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 78int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
82 struct kvm_lapic_irq *irq); 79 struct kvm_lapic_irq *irq);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4a340cb2301..511e160f706 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -25,14 +25,12 @@
25 25
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/module.h>
29#include <linux/pci.h> 28#include <linux/pci.h>
30#include <linux/stat.h>
31#include <linux/dmar.h> 29#include <linux/dmar.h>
32#include <linux/iommu.h> 30#include <linux/iommu.h>
33#include <linux/intel-iommu.h> 31#include <linux/intel-iommu.h>
34 32
35static bool allow_unsafe_assigned_interrupts; 33static int allow_unsafe_assigned_interrupts;
36module_param_named(allow_unsafe_assigned_interrupts, 34module_param_named(allow_unsafe_assigned_interrupts,
37 allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); 35 allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
38MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, 36MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
@@ -42,21 +40,21 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
42static void kvm_iommu_put_pages(struct kvm *kvm, 40static void kvm_iommu_put_pages(struct kvm *kvm,
43 gfn_t base_gfn, unsigned long npages); 41 gfn_t base_gfn, unsigned long npages);
44 42
45static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, 43static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
46 unsigned long size) 44 gfn_t gfn, unsigned long size)
47{ 45{
48 gfn_t end_gfn; 46 gfn_t end_gfn;
49 pfn_t pfn; 47 pfn_t pfn;
50 48
51 pfn = gfn_to_pfn_memslot(slot, gfn); 49 pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
52 end_gfn = gfn + (size >> PAGE_SHIFT); 50 end_gfn = gfn + (size >> PAGE_SHIFT);
53 gfn += 1; 51 gfn += 1;
54 52
55 if (is_error_noslot_pfn(pfn)) 53 if (is_error_pfn(pfn))
56 return pfn; 54 return pfn;
57 55
58 while (gfn < end_gfn) 56 while (gfn < end_gfn)
59 gfn_to_pfn_memslot(slot, gfn++); 57 gfn_to_pfn_memslot(kvm, slot, gfn++);
60 58
61 return pfn; 59 return pfn;
62} 60}
@@ -105,8 +103,8 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
105 * Pin all pages we are about to map in memory. This is 103 * Pin all pages we are about to map in memory. This is
106 * important because we unmap and unpin in 4kb steps later. 104 * important because we unmap and unpin in 4kb steps later.
107 */ 105 */
108 pfn = kvm_pin_pages(slot, gfn, page_size); 106 pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
109 if (is_error_noslot_pfn(pfn)) { 107 if (is_error_pfn(pfn)) {
110 gfn += 1; 108 gfn += 1;
111 continue; 109 continue;
112 } 110 }
@@ -134,15 +132,14 @@ unmap_pages:
134 132
135static int kvm_iommu_map_memslots(struct kvm *kvm) 133static int kvm_iommu_map_memslots(struct kvm *kvm)
136{ 134{
137 int idx, r = 0; 135 int i, idx, r = 0;
138 struct kvm_memslots *slots; 136 struct kvm_memslots *slots;
139 struct kvm_memory_slot *memslot;
140 137
141 idx = srcu_read_lock(&kvm->srcu); 138 idx = srcu_read_lock(&kvm->srcu);
142 slots = kvm_memslots(kvm); 139 slots = kvm_memslots(kvm);
143 140
144 kvm_for_each_memslot(memslot, slots) { 141 for (i = 0; i < slots->nmemslots; i++) {
145 r = kvm_iommu_map_pages(kvm, memslot); 142 r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
146 if (r) 143 if (r)
147 break; 144 break;
148 } 145 }
@@ -168,7 +165,11 @@ int kvm_assign_device(struct kvm *kvm,
168 165
169 r = iommu_attach_device(domain, &pdev->dev); 166 r = iommu_attach_device(domain, &pdev->dev);
170 if (r) { 167 if (r) {
171 dev_err(&pdev->dev, "kvm assign device failed ret %d", r); 168 printk(KERN_ERR "assign device %x:%x:%x.%x failed",
169 pci_domain_nr(pdev->bus),
170 pdev->bus->number,
171 PCI_SLOT(pdev->devfn),
172 PCI_FUNC(pdev->devfn));
172 return r; 173 return r;
173 } 174 }
174 175
@@ -186,8 +187,6 @@ int kvm_assign_device(struct kvm *kvm,
186 goto out_unmap; 187 goto out_unmap;
187 } 188 }
188 189
189 pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
190
191 printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", 190 printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
192 assigned_dev->host_segnr, 191 assigned_dev->host_segnr,
193 assigned_dev->host_busnr, 192 assigned_dev->host_busnr,
@@ -216,8 +215,6 @@ int kvm_deassign_device(struct kvm *kvm,
216 215
217 iommu_detach_device(domain, &pdev->dev); 216 iommu_detach_device(domain, &pdev->dev);
218 217
219 pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
220
221 printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", 218 printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
222 assigned_dev->host_segnr, 219 assigned_dev->host_segnr,
223 assigned_dev->host_busnr, 220 assigned_dev->host_busnr,
@@ -236,13 +233,9 @@ int kvm_iommu_map_guest(struct kvm *kvm)
236 return -ENODEV; 233 return -ENODEV;
237 } 234 }
238 235
239 mutex_lock(&kvm->slots_lock);
240
241 kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); 236 kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
242 if (!kvm->arch.iommu_domain) { 237 if (!kvm->arch.iommu_domain)
243 r = -ENOMEM; 238 return -ENOMEM;
244 goto out_unlock;
245 }
246 239
247 if (!allow_unsafe_assigned_interrupts && 240 if (!allow_unsafe_assigned_interrupts &&
248 !iommu_domain_has_cap(kvm->arch.iommu_domain, 241 !iommu_domain_has_cap(kvm->arch.iommu_domain,
@@ -253,16 +246,17 @@ int kvm_iommu_map_guest(struct kvm *kvm)
253 " module option.\n", __func__); 246 " module option.\n", __func__);
254 iommu_domain_free(kvm->arch.iommu_domain); 247 iommu_domain_free(kvm->arch.iommu_domain);
255 kvm->arch.iommu_domain = NULL; 248 kvm->arch.iommu_domain = NULL;
256 r = -EPERM; 249 return -EPERM;
257 goto out_unlock;
258 } 250 }
259 251
260 r = kvm_iommu_map_memslots(kvm); 252 r = kvm_iommu_map_memslots(kvm);
261 if (r) 253 if (r)
262 kvm_iommu_unmap_memslots(kvm); 254 goto out_unmap;
263 255
264out_unlock: 256 return 0;
265 mutex_unlock(&kvm->slots_lock); 257
258out_unmap:
259 kvm_iommu_unmap_memslots(kvm);
266 return r; 260 return r;
267} 261}
268 262
@@ -296,12 +290,6 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
296 290
297 /* Get physical address */ 291 /* Get physical address */
298 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); 292 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
299
300 if (!phys) {
301 gfn++;
302 continue;
303 }
304
305 pfn = phys >> PAGE_SHIFT; 293 pfn = phys >> PAGE_SHIFT;
306 294
307 /* Unmap address from IO address space */ 295 /* Unmap address from IO address space */
@@ -315,23 +303,18 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
315 } 303 }
316} 304}
317 305
318void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
319{
320 kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
321}
322
323static int kvm_iommu_unmap_memslots(struct kvm *kvm) 306static int kvm_iommu_unmap_memslots(struct kvm *kvm)
324{ 307{
325 int idx; 308 int i, idx;
326 struct kvm_memslots *slots; 309 struct kvm_memslots *slots;
327 struct kvm_memory_slot *memslot;
328 310
329 idx = srcu_read_lock(&kvm->srcu); 311 idx = srcu_read_lock(&kvm->srcu);
330 slots = kvm_memslots(kvm); 312 slots = kvm_memslots(kvm);
331 313
332 kvm_for_each_memslot(memslot, slots) 314 for (i = 0; i < slots->nmemslots; i++) {
333 kvm_iommu_unmap_pages(kvm, memslot); 315 kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
334 316 slots->memslots[i].npages);
317 }
335 srcu_read_unlock(&kvm->srcu, idx); 318 srcu_read_unlock(&kvm->srcu, idx);
336 319
337 return 0; 320 return 0;
@@ -345,11 +328,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
345 if (!domain) 328 if (!domain)
346 return 0; 329 return 0;
347 330
348 mutex_lock(&kvm->slots_lock);
349 kvm_iommu_unmap_memslots(kvm); 331 kvm_iommu_unmap_memslots(kvm);
350 kvm->arch.iommu_domain = NULL;
351 mutex_unlock(&kvm->slots_lock);
352
353 iommu_domain_free(domain); 332 iommu_domain_free(domain);
354 return 0; 333 return 0;
355} 334}
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 656fa455e15..9f614b4e365 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -33,12 +33,26 @@
33 33
34#include "ioapic.h" 34#include "ioapic.h"
35 35
36static inline int kvm_irq_line_state(unsigned long *irq_state,
37 int irq_source_id, int level)
38{
39 /* Logical OR for level trig interrupt */
40 if (level)
41 set_bit(irq_source_id, irq_state);
42 else
43 clear_bit(irq_source_id, irq_state);
44
45 return !!(*irq_state);
46}
47
36static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, 48static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
37 struct kvm *kvm, int irq_source_id, int level) 49 struct kvm *kvm, int irq_source_id, int level)
38{ 50{
39#ifdef CONFIG_X86 51#ifdef CONFIG_X86
40 struct kvm_pic *pic = pic_irqchip(kvm); 52 struct kvm_pic *pic = pic_irqchip(kvm);
41 return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); 53 level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
54 irq_source_id, level);
55 return kvm_pic_set_irq(pic, e->irqchip.pin, level);
42#else 56#else
43 return -1; 57 return -1;
44#endif 58#endif
@@ -48,7 +62,10 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
48 struct kvm *kvm, int irq_source_id, int level) 62 struct kvm *kvm, int irq_source_id, int level)
49{ 63{
50 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 64 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
51 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); 65 level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
66 irq_source_id, level);
67
68 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
52} 69}
53 70
54inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) 71inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -68,13 +85,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
68 struct kvm_vcpu *vcpu, *lowest = NULL; 85 struct kvm_vcpu *vcpu, *lowest = NULL;
69 86
70 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 87 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
71 kvm_is_dm_lowest_prio(irq)) { 88 kvm_is_dm_lowest_prio(irq))
72 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 89 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
73 irq->delivery_mode = APIC_DM_FIXED;
74 }
75
76 if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
77 return r;
78 90
79 kvm_for_each_vcpu(i, vcpu, kvm) { 91 kvm_for_each_vcpu(i, vcpu, kvm) {
80 if (!kvm_apic_present(vcpu)) 92 if (!kvm_apic_present(vcpu))
@@ -102,23 +114,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
102 return r; 114 return r;
103} 115}
104 116
105static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
106 struct kvm_lapic_irq *irq)
107{
108 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
109
110 irq->dest_id = (e->msi.address_lo &
111 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
112 irq->vector = (e->msi.data &
113 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
114 irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
115 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
116 irq->delivery_mode = e->msi.data & 0x700;
117 irq->level = 1;
118 irq->shorthand = 0;
119 /* TODO Deal with RH bit of MSI message address */
120}
121
122int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 117int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
123 struct kvm *kvm, int irq_source_id, int level) 118 struct kvm *kvm, int irq_source_id, int level)
124{ 119{
@@ -127,38 +122,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
127 if (!level) 122 if (!level)
128 return -1; 123 return -1;
129 124
130 kvm_set_msi_irq(e, &irq); 125 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
131
132 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
133}
134
135
136static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
137 struct kvm *kvm)
138{
139 struct kvm_lapic_irq irq;
140 int r;
141
142 kvm_set_msi_irq(e, &irq);
143
144 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r))
145 return r;
146 else
147 return -EWOULDBLOCK;
148}
149
150int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
151{
152 struct kvm_kernel_irq_routing_entry route;
153
154 if (!irqchip_in_kernel(kvm) || msi->flags != 0)
155 return -EINVAL;
156 126
157 route.msi.address_lo = msi->address_lo; 127 irq.dest_id = (e->msi.address_lo &
158 route.msi.address_hi = msi->address_hi; 128 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
159 route.msi.data = msi->data; 129 irq.vector = (e->msi.data &
130 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
131 irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
132 irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
133 irq.delivery_mode = e->msi.data & 0x700;
134 irq.level = 1;
135 irq.shorthand = 0;
160 136
161 return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); 137 /* TODO Deal with RH bit of MSI message address */
138 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
162} 139}
163 140
164/* 141/*
@@ -199,44 +176,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
199 return ret; 176 return ret;
200} 177}
201 178
202/*
203 * Deliver an IRQ in an atomic context if we can, or return a failure,
204 * user can retry in a process context.
205 * Return value:
206 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
207 * Other values - No need to retry.
208 */
209int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
210{
211 struct kvm_kernel_irq_routing_entry *e;
212 int ret = -EINVAL;
213 struct kvm_irq_routing_table *irq_rt;
214 struct hlist_node *n;
215
216 trace_kvm_set_irq(irq, level, irq_source_id);
217
218 /*
219 * Injection into either PIC or IOAPIC might need to scan all CPUs,
220 * which would need to be retried from thread context; when same GSI
221 * is connected to both PIC and IOAPIC, we'd have to report a
222 * partial failure here.
223 * Since there's no easy way to do this, we only support injecting MSI
224 * which is limited to 1:1 GSI mapping.
225 */
226 rcu_read_lock();
227 irq_rt = rcu_dereference(kvm->irq_routing);
228 if (irq < irq_rt->nr_rt_entries)
229 hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
230 if (likely(e->type == KVM_IRQ_ROUTING_MSI))
231 ret = kvm_set_msi_inatomic(e, kvm);
232 else
233 ret = -EWOULDBLOCK;
234 break;
235 }
236 rcu_read_unlock();
237 return ret;
238}
239
240void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 179void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
241{ 180{
242 struct kvm_irq_ack_notifier *kian; 181 struct kvm_irq_ack_notifier *kian;
@@ -287,9 +226,6 @@ int kvm_request_irq_source_id(struct kvm *kvm)
287 } 226 }
288 227
289 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 228 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
290#ifdef CONFIG_X86
291 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
292#endif
293 set_bit(irq_source_id, bitmap); 229 set_bit(irq_source_id, bitmap);
294unlock: 230unlock:
295 mutex_unlock(&kvm->irq_lock); 231 mutex_unlock(&kvm->irq_lock);
@@ -299,10 +235,9 @@ unlock:
299 235
300void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) 236void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
301{ 237{
238 int i;
239
302 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 240 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
303#ifdef CONFIG_X86
304 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
305#endif
306 241
307 mutex_lock(&kvm->irq_lock); 242 mutex_lock(&kvm->irq_lock);
308 if (irq_source_id < 0 || 243 if (irq_source_id < 0 ||
@@ -314,10 +249,14 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
314 if (!irqchip_in_kernel(kvm)) 249 if (!irqchip_in_kernel(kvm))
315 goto unlock; 250 goto unlock;
316 251
317 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); 252 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
253 clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
254 if (i >= 16)
255 continue;
318#ifdef CONFIG_X86 256#ifdef CONFIG_X86
319 kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); 257 clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
320#endif 258#endif
259 }
321unlock: 260unlock:
322 mutex_unlock(&kvm->irq_lock); 261 mutex_unlock(&kvm->irq_lock);
323} 262}
@@ -379,7 +318,6 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
379 */ 318 */
380 hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) 319 hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
381 if (ei->type == KVM_IRQ_ROUTING_MSI || 320 if (ei->type == KVM_IRQ_ROUTING_MSI ||
382 ue->type == KVM_IRQ_ROUTING_MSI ||
383 ue->u.irqchip.irqchip == ei->irqchip.irqchip) 321 ue->u.irqchip.irqchip == ei->irqchip.irqchip)
384 return r; 322 return r;
385 323
@@ -391,11 +329,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
391 switch (ue->u.irqchip.irqchip) { 329 switch (ue->u.irqchip.irqchip) {
392 case KVM_IRQCHIP_PIC_MASTER: 330 case KVM_IRQCHIP_PIC_MASTER:
393 e->set = kvm_set_pic_irq; 331 e->set = kvm_set_pic_irq;
394 max_pin = PIC_NUM_PINS; 332 max_pin = 16;
395 break; 333 break;
396 case KVM_IRQCHIP_PIC_SLAVE: 334 case KVM_IRQCHIP_PIC_SLAVE:
397 e->set = kvm_set_pic_irq; 335 e->set = kvm_set_pic_irq;
398 max_pin = PIC_NUM_PINS; 336 max_pin = 16;
399 delta = 8; 337 delta = 8;
400 break; 338 break;
401 case KVM_IRQCHIP_IOAPIC: 339 case KVM_IRQCHIP_IOAPIC:
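
The kvm_irq_line_state() helper added near the top of this file's diff folds per-source assertions into a single level: each interrupt source owns one bit, and the pin counts as asserted while any bit is still set. A self-contained model of that behaviour, in userspace C with plain bit operations standing in for set_bit()/clear_bit():

#include <stdio.h>

/* Logical OR of all sources driving one level-triggered line. */
static int set_line_state(unsigned long *irq_state, int irq_source_id, int level)
{
	if (level)
		*irq_state |= 1UL << irq_source_id;
	else
		*irq_state &= ~(1UL << irq_source_id);

	return *irq_state != 0;	/* level actually presented to the pin */
}

int main(void)
{
	unsigned long state = 0;

	printf("%d\n", set_line_state(&state, 0, 1));	/* source 0 asserts  -> 1 */
	printf("%d\n", set_line_state(&state, 1, 1));	/* source 1 asserts  -> 1 */
	printf("%d\n", set_line_state(&state, 0, 0));	/* source 0 releases -> still 1 */
	printf("%d\n", set_line_state(&state, 1, 0));	/* source 1 releases -> 0 */
	return 0;
}
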
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1cd693a76a5..aefdda390f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,8 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/hugetlb.h> 48#include <linux/hugetlb.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/sort.h>
51#include <linux/bsearch.h>
52 50
53#include <asm/processor.h> 51#include <asm/processor.h>
54#include <asm/io.h> 52#include <asm/io.h>
@@ -100,7 +98,13 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
100 98
101static bool largepages_enabled = true; 99static bool largepages_enabled = true;
102 100
103bool kvm_is_mmio_pfn(pfn_t pfn) 101static struct page *hwpoison_page;
102static pfn_t hwpoison_pfn;
103
104struct page *fault_page;
105pfn_t fault_pfn;
106
107inline int kvm_is_mmio_pfn(pfn_t pfn)
104{ 108{
105 if (pfn_valid(pfn)) { 109 if (pfn_valid(pfn)) {
106 int reserved; 110 int reserved;
@@ -131,12 +135,11 @@ bool kvm_is_mmio_pfn(pfn_t pfn)
131/* 135/*
132 * Switches to specified vcpu, until a matching vcpu_put() 136 * Switches to specified vcpu, until a matching vcpu_put()
133 */ 137 */
134int vcpu_load(struct kvm_vcpu *vcpu) 138void vcpu_load(struct kvm_vcpu *vcpu)
135{ 139{
136 int cpu; 140 int cpu;
137 141
138 if (mutex_lock_killable(&vcpu->mutex)) 142 mutex_lock(&vcpu->mutex);
139 return -EINTR;
140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 143 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
141 /* The thread running this VCPU changed. */ 144 /* The thread running this VCPU changed. */
142 struct pid *oldpid = vcpu->pid; 145 struct pid *oldpid = vcpu->pid;
@@ -149,7 +152,6 @@ int vcpu_load(struct kvm_vcpu *vcpu)
149 preempt_notifier_register(&vcpu->preempt_notifier); 152 preempt_notifier_register(&vcpu->preempt_notifier);
150 kvm_arch_vcpu_load(vcpu, cpu); 153 kvm_arch_vcpu_load(vcpu, cpu);
151 put_cpu(); 154 put_cpu();
152 return 0;
153} 155}
154 156
155void vcpu_put(struct kvm_vcpu *vcpu) 157void vcpu_put(struct kvm_vcpu *vcpu)
@@ -199,7 +201,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
199 201
200void kvm_flush_remote_tlbs(struct kvm *kvm) 202void kvm_flush_remote_tlbs(struct kvm *kvm)
201{ 203{
202 long dirty_count = kvm->tlbs_dirty; 204 int dirty_count = kvm->tlbs_dirty;
203 205
204 smp_mb(); 206 smp_mb();
205 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 207 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
@@ -212,11 +214,6 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 214 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
213} 215}
214 216
215void kvm_make_mclock_inprogress_request(struct kvm *kvm)
216{
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218}
219
220int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 217int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
221{ 218{
222 struct page *page; 219 struct page *page;
@@ -237,9 +234,6 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
237 } 234 }
238 vcpu->run = page_address(page); 235 vcpu->run = page_address(page);
239 236
240 kvm_vcpu_set_in_spin_loop(vcpu, false);
241 kvm_vcpu_set_dy_eligible(vcpu, false);
242
243 r = kvm_arch_vcpu_init(vcpu); 237 r = kvm_arch_vcpu_init(vcpu);
244 if (r < 0) 238 if (r < 0)
245 goto fail_free_run; 239 goto fail_free_run;
@@ -293,15 +287,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
293 */ 287 */
294 idx = srcu_read_lock(&kvm->srcu); 288 idx = srcu_read_lock(&kvm->srcu);
295 spin_lock(&kvm->mmu_lock); 289 spin_lock(&kvm->mmu_lock);
296
297 kvm->mmu_notifier_seq++; 290 kvm->mmu_notifier_seq++;
298 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 291 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
292 spin_unlock(&kvm->mmu_lock);
293 srcu_read_unlock(&kvm->srcu, idx);
294
299 /* we've to flush the tlb before the pages can be freed */ 295 /* we've to flush the tlb before the pages can be freed */
300 if (need_tlb_flush) 296 if (need_tlb_flush)
301 kvm_flush_remote_tlbs(kvm); 297 kvm_flush_remote_tlbs(kvm);
302 298
303 spin_unlock(&kvm->mmu_lock);
304 srcu_read_unlock(&kvm->srcu, idx);
305} 299}
306 300
307static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 301static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -336,14 +330,15 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
336 * count is also read inside the mmu_lock critical section. 330 * count is also read inside the mmu_lock critical section.
337 */ 331 */
338 kvm->mmu_notifier_count++; 332 kvm->mmu_notifier_count++;
339 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 333 for (; start < end; start += PAGE_SIZE)
334 need_tlb_flush |= kvm_unmap_hva(kvm, start);
340 need_tlb_flush |= kvm->tlbs_dirty; 335 need_tlb_flush |= kvm->tlbs_dirty;
336 spin_unlock(&kvm->mmu_lock);
337 srcu_read_unlock(&kvm->srcu, idx);
338
341 /* we've to flush the tlb before the pages can be freed */ 339 /* we've to flush the tlb before the pages can be freed */
342 if (need_tlb_flush) 340 if (need_tlb_flush)
343 kvm_flush_remote_tlbs(kvm); 341 kvm_flush_remote_tlbs(kvm);
344
345 spin_unlock(&kvm->mmu_lock);
346 srcu_read_unlock(&kvm->srcu, idx);
347} 342}
348 343
349static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 344static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -360,11 +355,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
360 * been freed. 355 * been freed.
361 */ 356 */
362 kvm->mmu_notifier_seq++; 357 kvm->mmu_notifier_seq++;
363 smp_wmb();
364 /* 358 /*
365 * The above sequence increase must be visible before the 359 * The above sequence increase must be visible before the
366 * below count decrease, which is ensured by the smp_wmb above 360 * below count decrease but both values are read by the kvm
367 * in conjunction with the smp_rmb in mmu_notifier_retry(). 361 * page fault under mmu_lock spinlock so we don't need to add
362 * a smb_wmb() here in between the two.
368 */ 363 */
369 kvm->mmu_notifier_count--; 364 kvm->mmu_notifier_count--;
370 spin_unlock(&kvm->mmu_lock); 365 spin_unlock(&kvm->mmu_lock);
@@ -381,14 +376,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
381 376
382 idx = srcu_read_lock(&kvm->srcu); 377 idx = srcu_read_lock(&kvm->srcu);
383 spin_lock(&kvm->mmu_lock); 378 spin_lock(&kvm->mmu_lock);
384
385 young = kvm_age_hva(kvm, address); 379 young = kvm_age_hva(kvm, address);
386 if (young)
387 kvm_flush_remote_tlbs(kvm);
388
389 spin_unlock(&kvm->mmu_lock); 380 spin_unlock(&kvm->mmu_lock);
390 srcu_read_unlock(&kvm->srcu, idx); 381 srcu_read_unlock(&kvm->srcu, idx);
391 382
383 if (young)
384 kvm_flush_remote_tlbs(kvm);
385
392 return young; 386 return young;
393} 387}
394 388
@@ -415,7 +409,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
415 int idx; 409 int idx;
416 410
417 idx = srcu_read_lock(&kvm->srcu); 411 idx = srcu_read_lock(&kvm->srcu);
418 kvm_arch_flush_shadow_all(kvm); 412 kvm_arch_flush_shadow(kvm);
419 srcu_read_unlock(&kvm->srcu, idx); 413 srcu_read_unlock(&kvm->srcu, idx);
420} 414}
421 415
@@ -444,16 +438,7 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
444 438
445#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 439#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
446 440
447static void kvm_init_memslots_id(struct kvm *kvm) 441static struct kvm *kvm_create_vm(void)
448{
449 int i;
450 struct kvm_memslots *slots = kvm->memslots;
451
452 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
453 slots->id_to_index[i] = slots->memslots[i].id = i;
454}
455
456static struct kvm *kvm_create_vm(unsigned long type)
457{ 442{
458 int r, i; 443 int r, i;
459 struct kvm *kvm = kvm_arch_alloc_vm(); 444 struct kvm *kvm = kvm_arch_alloc_vm();
@@ -461,7 +446,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
461 if (!kvm) 446 if (!kvm)
462 return ERR_PTR(-ENOMEM); 447 return ERR_PTR(-ENOMEM);
463 448
464 r = kvm_arch_init_vm(kvm, type); 449 r = kvm_arch_init_vm(kvm);
465 if (r) 450 if (r)
466 goto out_err_nodisable; 451 goto out_err_nodisable;
467 452
@@ -478,7 +463,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
478 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 463 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
479 if (!kvm->memslots) 464 if (!kvm->memslots)
480 goto out_err_nosrcu; 465 goto out_err_nosrcu;
481 kvm_init_memslots_id(kvm);
482 if (init_srcu_struct(&kvm->srcu)) 466 if (init_srcu_struct(&kvm->srcu))
483 goto out_err_nosrcu; 467 goto out_err_nosrcu;
484 for (i = 0; i < KVM_NR_BUSES; i++) { 468 for (i = 0; i < KVM_NR_BUSES; i++) {
@@ -519,33 +503,18 @@ out_err_nodisable:
519 return ERR_PTR(r); 503 return ERR_PTR(r);
520} 504}
521 505
522/*
523 * Avoid using vmalloc for a small buffer.
524 * Should not be used when the size is statically known.
525 */
526void *kvm_kvzalloc(unsigned long size)
527{
528 if (size > PAGE_SIZE)
529 return vzalloc(size);
530 else
531 return kzalloc(size, GFP_KERNEL);
532}
533
534void kvm_kvfree(const void *addr)
535{
536 if (is_vmalloc_addr(addr))
537 vfree(addr);
538 else
539 kfree(addr);
540}
541
542static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 506static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
543{ 507{
544 if (!memslot->dirty_bitmap) 508 if (!memslot->dirty_bitmap)
545 return; 509 return;
546 510
547 kvm_kvfree(memslot->dirty_bitmap); 511 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
512 vfree(memslot->dirty_bitmap_head);
513 else
514 kfree(memslot->dirty_bitmap_head);
515
548 memslot->dirty_bitmap = NULL; 516 memslot->dirty_bitmap = NULL;
517 memslot->dirty_bitmap_head = NULL;
549} 518}
550 519
551/* 520/*
@@ -554,21 +523,33 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
554static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 523static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
555 struct kvm_memory_slot *dont) 524 struct kvm_memory_slot *dont)
556{ 525{
526 int i;
527
528 if (!dont || free->rmap != dont->rmap)
529 vfree(free->rmap);
530
557 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 531 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
558 kvm_destroy_dirty_bitmap(free); 532 kvm_destroy_dirty_bitmap(free);
559 533
560 kvm_arch_free_memslot(free, dont); 534
535 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
536 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
537 vfree(free->lpage_info[i]);
538 free->lpage_info[i] = NULL;
539 }
540 }
561 541
562 free->npages = 0; 542 free->npages = 0;
543 free->rmap = NULL;
563} 544}
564 545
565void kvm_free_physmem(struct kvm *kvm) 546void kvm_free_physmem(struct kvm *kvm)
566{ 547{
548 int i;
567 struct kvm_memslots *slots = kvm->memslots; 549 struct kvm_memslots *slots = kvm->memslots;
568 struct kvm_memory_slot *memslot;
569 550
570 kvm_for_each_memslot(memslot, slots) 551 for (i = 0; i < slots->nmemslots; ++i)
571 kvm_free_physmem_slot(memslot, NULL); 552 kvm_free_physmem_slot(&slots->memslots[i], NULL);
572 553
573 kfree(kvm->memslots); 554 kfree(kvm->memslots);
574} 555}
@@ -589,7 +570,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
589#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 570#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
590 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 571 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
591#else 572#else
592 kvm_arch_flush_shadow_all(kvm); 573 kvm_arch_flush_shadow(kvm);
593#endif 574#endif
594 kvm_arch_destroy_vm(kvm); 575 kvm_arch_destroy_vm(kvm);
595 kvm_free_physmem(kvm); 576 kvm_free_physmem(kvm);
@@ -623,81 +604,28 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
623 return 0; 604 return 0;
624} 605}
625 606
607#ifndef CONFIG_S390
626/* 608/*
627 * Allocation size is twice as large as the actual dirty bitmap size. 609 * Allocation size is twice as large as the actual dirty bitmap size.
628 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 610 * This makes it possible to do double buffering: see x86's
611 * kvm_vm_ioctl_get_dirty_log().
629 */ 612 */
630static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 613static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
631{ 614{
632#ifndef CONFIG_S390
633 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 615 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
634 616
635 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 617 if (dirty_bytes > PAGE_SIZE)
618 memslot->dirty_bitmap = vzalloc(dirty_bytes);
619 else
620 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
621
636 if (!memslot->dirty_bitmap) 622 if (!memslot->dirty_bitmap)
637 return -ENOMEM; 623 return -ENOMEM;
638 624
639#endif /* !CONFIG_S390 */ 625 memslot->dirty_bitmap_head = memslot->dirty_bitmap;
640 return 0;
641}
642
643static int cmp_memslot(const void *slot1, const void *slot2)
644{
645 struct kvm_memory_slot *s1, *s2;
646
647 s1 = (struct kvm_memory_slot *)slot1;
648 s2 = (struct kvm_memory_slot *)slot2;
649
650 if (s1->npages < s2->npages)
651 return 1;
652 if (s1->npages > s2->npages)
653 return -1;
654
655 return 0;
656}
657
658/*
659 * Sort the memslots base on its size, so the larger slots
660 * will get better fit.
661 */
662static void sort_memslots(struct kvm_memslots *slots)
663{
664 int i;
665
666 sort(slots->memslots, KVM_MEM_SLOTS_NUM,
667 sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
668
669 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
670 slots->id_to_index[slots->memslots[i].id] = i;
671}
672
673void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
674{
675 if (new) {
676 int id = new->id;
677 struct kvm_memory_slot *old = id_to_memslot(slots, id);
678 unsigned long npages = old->npages;
679
680 *old = *new;
681 if (new->npages != npages)
682 sort_memslots(slots);
683 }
684
685 slots->generation++;
686}
687
688static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
689{
690 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
691
692#ifdef KVM_CAP_READONLY_MEM
693 valid_flags |= KVM_MEM_READONLY;
694#endif
695
696 if (mem->flags & ~valid_flags)
697 return -EINVAL;
698
699 return 0; 626 return 0;
700} 627}
628#endif /* !CONFIG_S390 */
701 629
702/* 630/*
703 * Allocate some memory and give it an address in the guest physical address 631 * Allocate some memory and give it an address in the guest physical address
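
The dirty-bitmap allocation above open-codes the same size check that the kvm_kvzalloc()/kvm_kvfree() helpers removed later in this file provided: buffers larger than a page come from vmalloc, smaller ones from the slab. A rough kernel-style sketch of that pattern, with hypothetical big_zalloc()/big_free() names that are not part of the patched source:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *big_zalloc(unsigned long size)
{
	/* Avoid vmalloc for small buffers, fall back to it for large ones. */
	if (size > PAGE_SIZE)
		return vzalloc(size);
	return kzalloc(size, GFP_KERNEL);
}

static void big_free(const void *addr)
{
	/* Free through whichever allocator handed the buffer out. */
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
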
@@ -714,14 +642,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
714 int r; 642 int r;
715 gfn_t base_gfn; 643 gfn_t base_gfn;
716 unsigned long npages; 644 unsigned long npages;
717 struct kvm_memory_slot *memslot, *slot; 645 unsigned long i;
646 struct kvm_memory_slot *memslot;
718 struct kvm_memory_slot old, new; 647 struct kvm_memory_slot old, new;
719 struct kvm_memslots *slots, *old_memslots; 648 struct kvm_memslots *slots, *old_memslots;
720 649
721 r = check_memory_region_flags(mem);
722 if (r)
723 goto out;
724
725 r = -EINVAL; 650 r = -EINVAL;
726 /* General sanity checks */ 651 /* General sanity checks */
727 if (mem->memory_size & (PAGE_SIZE - 1)) 652 if (mem->memory_size & (PAGE_SIZE - 1))
@@ -735,12 +660,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
735 (void __user *)(unsigned long)mem->userspace_addr, 660 (void __user *)(unsigned long)mem->userspace_addr,
736 mem->memory_size))) 661 mem->memory_size)))
737 goto out; 662 goto out;
738 if (mem->slot >= KVM_MEM_SLOTS_NUM) 663 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
739 goto out; 664 goto out;
740 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 665 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
741 goto out; 666 goto out;
742 667
743 memslot = id_to_memslot(kvm->memslots, mem->slot); 668 memslot = &kvm->memslots->memslots[mem->slot];
744 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 669 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
745 npages = mem->memory_size >> PAGE_SHIFT; 670 npages = mem->memory_size >> PAGE_SHIFT;
746 671
@@ -765,11 +690,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
765 690
766 /* Check for overlaps */ 691 /* Check for overlaps */
767 r = -EEXIST; 692 r = -EEXIST;
768 kvm_for_each_memslot(slot, kvm->memslots) { 693 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
769 if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot) 694 struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
695
696 if (s == memslot || !s->npages)
770 continue; 697 continue;
771 if (!((base_gfn + npages <= slot->base_gfn) || 698 if (!((base_gfn + npages <= s->base_gfn) ||
772 (base_gfn >= slot->base_gfn + slot->npages))) 699 (base_gfn >= s->base_gfn + s->npages)))
773 goto out_free; 700 goto out_free;
774 } 701 }
775 702
@@ -780,45 +707,92 @@ int __kvm_set_memory_region(struct kvm *kvm,
780 r = -ENOMEM; 707 r = -ENOMEM;
781 708
782 /* Allocate if a slot is being created */ 709 /* Allocate if a slot is being created */
783 if (npages && !old.npages) { 710#ifndef CONFIG_S390
711 if (npages && !new.rmap) {
712 new.rmap = vzalloc(npages * sizeof(*new.rmap));
713
714 if (!new.rmap)
715 goto out_free;
716
784 new.user_alloc = user_alloc; 717 new.user_alloc = user_alloc;
785 new.userspace_addr = mem->userspace_addr; 718 new.userspace_addr = mem->userspace_addr;
719 }
720 if (!npages)
721 goto skip_lpage;
722
723 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
724 unsigned long ugfn;
725 unsigned long j;
726 int lpages;
727 int level = i + 2;
728
729 /* Avoid unused variable warning if no large pages */
730 (void)level;
731
732 if (new.lpage_info[i])
733 continue;
786 734
787 if (kvm_arch_create_memslot(&new, npages)) 735 lpages = 1 + ((base_gfn + npages - 1)
736 >> KVM_HPAGE_GFN_SHIFT(level));
737 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
738
739 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
740
741 if (!new.lpage_info[i])
788 goto out_free; 742 goto out_free;
743
744 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
745 new.lpage_info[i][0].write_count = 1;
746 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
747 new.lpage_info[i][lpages - 1].write_count = 1;
748 ugfn = new.userspace_addr >> PAGE_SHIFT;
749 /*
750 * If the gfn and userspace address are not aligned wrt each
751 * other, or if explicitly asked to, disable large page
752 * support for this slot
753 */
754 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
755 !largepages_enabled)
756 for (j = 0; j < lpages; ++j)
757 new.lpage_info[i][j].write_count = 1;
789 } 758 }
790 759
760skip_lpage:
761
791 /* Allocate page dirty bitmap if needed */ 762 /* Allocate page dirty bitmap if needed */
792 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 763 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
793 if (kvm_create_dirty_bitmap(&new) < 0) 764 if (kvm_create_dirty_bitmap(&new) < 0)
794 goto out_free; 765 goto out_free;
795 /* destroy any largepage mappings for dirty tracking */ 766 /* destroy any largepage mappings for dirty tracking */
796 } 767 }
768#else /* not defined CONFIG_S390 */
769 new.user_alloc = user_alloc;
770 if (user_alloc)
771 new.userspace_addr = mem->userspace_addr;
772#endif /* not defined CONFIG_S390 */
797 773
798 if (!npages || base_gfn != old.base_gfn) { 774 if (!npages) {
799 struct kvm_memory_slot *slot;
800
801 r = -ENOMEM; 775 r = -ENOMEM;
802 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 776 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
803 GFP_KERNEL);
804 if (!slots) 777 if (!slots)
805 goto out_free; 778 goto out_free;
806 slot = id_to_memslot(slots, mem->slot); 779 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
807 slot->flags |= KVM_MEMSLOT_INVALID; 780 if (mem->slot >= slots->nmemslots)
808 781 slots->nmemslots = mem->slot + 1;
809 update_memslots(slots, NULL); 782 slots->generation++;
783 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
810 784
811 old_memslots = kvm->memslots; 785 old_memslots = kvm->memslots;
812 rcu_assign_pointer(kvm->memslots, slots); 786 rcu_assign_pointer(kvm->memslots, slots);
813 synchronize_srcu_expedited(&kvm->srcu); 787 synchronize_srcu_expedited(&kvm->srcu);
814 /* From this point no new shadow pages pointing to a deleted, 788 /* From this point no new shadow pages pointing to a deleted
815 * or moved, memslot will be created. 789 * memslot will be created.
816 * 790 *
817 * validation of sp->gfn happens in: 791 * validation of sp->gfn happens in:
818 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 792 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
819 * - kvm_is_visible_gfn (mmu_check_roots) 793 * - kvm_is_visible_gfn (mmu_check_roots)
820 */ 794 */
821 kvm_arch_flush_shadow_memslot(kvm, slot); 795 kvm_arch_flush_shadow(kvm);
822 kfree(old_memslots); 796 kfree(old_memslots);
823 } 797 }
824 798
@@ -826,33 +800,44 @@ int __kvm_set_memory_region(struct kvm *kvm,
826 if (r) 800 if (r)
827 goto out_free; 801 goto out_free;
828 802
829 /* map/unmap the pages in iommu page table */ 803 /* map the pages in iommu page table */
830 if (npages) { 804 if (npages) {
831 r = kvm_iommu_map_pages(kvm, &new); 805 r = kvm_iommu_map_pages(kvm, &new);
832 if (r) 806 if (r)
833 goto out_free; 807 goto out_free;
834 } else 808 }
835 kvm_iommu_unmap_pages(kvm, &old);
836 809
837 r = -ENOMEM; 810 r = -ENOMEM;
838 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 811 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
839 GFP_KERNEL);
840 if (!slots) 812 if (!slots)
841 goto out_free; 813 goto out_free;
814 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
815 if (mem->slot >= slots->nmemslots)
816 slots->nmemslots = mem->slot + 1;
817 slots->generation++;
842 818
843 /* actual memory is freed via old in kvm_free_physmem_slot below */ 819 /* actual memory is freed via old in kvm_free_physmem_slot below */
844 if (!npages) { 820 if (!npages) {
821 new.rmap = NULL;
845 new.dirty_bitmap = NULL; 822 new.dirty_bitmap = NULL;
846 memset(&new.arch, 0, sizeof(new.arch)); 823 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
824 new.lpage_info[i] = NULL;
847 } 825 }
848 826
849 update_memslots(slots, &new); 827 slots->memslots[mem->slot] = new;
850 old_memslots = kvm->memslots; 828 old_memslots = kvm->memslots;
851 rcu_assign_pointer(kvm->memslots, slots); 829 rcu_assign_pointer(kvm->memslots, slots);
852 synchronize_srcu_expedited(&kvm->srcu); 830 synchronize_srcu_expedited(&kvm->srcu);
853 831
854 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 832 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
855 833
834 /*
835 * If the new memory slot is created, we need to clear all
836 * mmio sptes.
837 */
838 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
839 kvm_arch_flush_shadow(kvm);
840
856 kvm_free_physmem_slot(&old, &new); 841 kvm_free_physmem_slot(&old, &new);
857 kfree(old_memslots); 842 kfree(old_memslots);
858 843
@@ -901,7 +886,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
901 if (log->slot >= KVM_MEMORY_SLOTS) 886 if (log->slot >= KVM_MEMORY_SLOTS)
902 goto out; 887 goto out;
903 888
904 memslot = id_to_memslot(kvm->memslots, log->slot); 889 memslot = &kvm->memslots->memslots[log->slot];
905 r = -ENOENT; 890 r = -ENOENT;
906 if (!memslot->dirty_bitmap) 891 if (!memslot->dirty_bitmap)
907 goto out; 892 goto out;
@@ -923,17 +908,74 @@ out:
923 return r; 908 return r;
924} 909}
925 910
926bool kvm_largepages_enabled(void)
927{
928 return largepages_enabled;
929}
930
931void kvm_disable_largepages(void) 911void kvm_disable_largepages(void)
932{ 912{
933 largepages_enabled = false; 913 largepages_enabled = false;
934} 914}
935EXPORT_SYMBOL_GPL(kvm_disable_largepages); 915EXPORT_SYMBOL_GPL(kvm_disable_largepages);
936 916
917int is_error_page(struct page *page)
918{
919 return page == bad_page || page == hwpoison_page || page == fault_page;
920}
921EXPORT_SYMBOL_GPL(is_error_page);
922
923int is_error_pfn(pfn_t pfn)
924{
925 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
926}
927EXPORT_SYMBOL_GPL(is_error_pfn);
928
929int is_hwpoison_pfn(pfn_t pfn)
930{
931 return pfn == hwpoison_pfn;
932}
933EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
934
935int is_fault_pfn(pfn_t pfn)
936{
937 return pfn == fault_pfn;
938}
939EXPORT_SYMBOL_GPL(is_fault_pfn);
940
941int is_noslot_pfn(pfn_t pfn)
942{
943 return pfn == bad_pfn;
944}
945EXPORT_SYMBOL_GPL(is_noslot_pfn);
946
947int is_invalid_pfn(pfn_t pfn)
948{
949 return pfn == hwpoison_pfn || pfn == fault_pfn;
950}
951EXPORT_SYMBOL_GPL(is_invalid_pfn);
952
953static inline unsigned long bad_hva(void)
954{
955 return PAGE_OFFSET;
956}
957
958int kvm_is_error_hva(unsigned long addr)
959{
960 return addr == bad_hva();
961}
962EXPORT_SYMBOL_GPL(kvm_is_error_hva);
963
964static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
965 gfn_t gfn)
966{
967 int i;
968
969 for (i = 0; i < slots->nmemslots; ++i) {
970 struct kvm_memory_slot *memslot = &slots->memslots[i];
971
972 if (gfn >= memslot->base_gfn
973 && gfn < memslot->base_gfn + memslot->npages)
974 return memslot;
975 }
976 return NULL;
977}
978
937struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 979struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
938{ 980{
939 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 981 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -942,13 +984,20 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot);
942 984
943int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 985int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
944{ 986{
945 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 987 int i;
988 struct kvm_memslots *slots = kvm_memslots(kvm);
946 989
947 if (!memslot || memslot->id >= KVM_MEMORY_SLOTS || 990 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
948 memslot->flags & KVM_MEMSLOT_INVALID) 991 struct kvm_memory_slot *memslot = &slots->memslots[i];
949 return 0; 992
993 if (memslot->flags & KVM_MEMSLOT_INVALID)
994 continue;
950 995
951 return 1; 996 if (gfn >= memslot->base_gfn
997 && gfn < memslot->base_gfn + memslot->npages)
998 return 1;
999 }
1000 return 0;
952} 1001}
953EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1002EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
954 1003
@@ -976,38 +1025,17 @@ out:
976 return size; 1025 return size;
977} 1026}
978 1027
979static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1028static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
980{ 1029 gfn_t *nr_pages)
981 return slot->flags & KVM_MEM_READONLY;
982}
983
984static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
985 gfn_t *nr_pages, bool write)
986{ 1030{
987 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1031 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
988 return KVM_HVA_ERR_BAD; 1032 return bad_hva();
989
990 if (memslot_is_readonly(slot) && write)
991 return KVM_HVA_ERR_RO_BAD;
992 1033
993 if (nr_pages) 1034 if (nr_pages)
994 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1035 *nr_pages = slot->npages - (gfn - slot->base_gfn);
995 1036
996 return __gfn_to_hva_memslot(slot, gfn); 1037 return gfn_to_hva_memslot(slot, gfn);
997}
998
999static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1000 gfn_t *nr_pages)
1001{
1002 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1003}
1004
1005unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1006 gfn_t gfn)
1007{
1008 return gfn_to_hva_many(slot, gfn, NULL);
1009} 1038}
1010EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1011 1039
1012unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1040unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1013{ 1041{
@@ -1015,23 +1043,10 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1015} 1043}
1016EXPORT_SYMBOL_GPL(gfn_to_hva); 1044EXPORT_SYMBOL_GPL(gfn_to_hva);
1017 1045
1018/* 1046static pfn_t get_fault_pfn(void)
1019 * The hva returned by this function is only allowed to be read.
1020 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1021 */
1022static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1023{ 1047{
1024 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); 1048 get_page(fault_page);
1025} 1049 return fault_pfn;
1026
1027static int kvm_read_hva(void *data, void __user *hva, int len)
1028{
1029 return __copy_from_user(data, hva, len);
1030}
1031
1032static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1033{
1034 return __copy_from_user_inatomic(data, hva, len);
1035} 1050}
1036 1051
1037int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1052int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1054,186 +1069,108 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1054 return rc == -EHWPOISON; 1069 return rc == -EHWPOISON;
1055} 1070}
1056 1071
1057/* 1072static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1058 * The atomic path to get the writable pfn which will be stored in @pfn, 1073 bool *async, bool write_fault, bool *writable)
1059 * true indicates success, otherwise false is returned.
1060 */
1061static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1062 bool write_fault, bool *writable, pfn_t *pfn)
1063{ 1074{
1064 struct page *page[1]; 1075 struct page *page[1];
1065 int npages; 1076 int npages = 0;
1077 pfn_t pfn;
1066 1078
1067 if (!(async || atomic)) 1079 /* we can do it either atomically or asynchronously, not both */
1068 return false; 1080 BUG_ON(atomic && async);
1069 1081
1070 /* 1082 BUG_ON(!write_fault && !writable);
1071 * Fast pin a writable pfn only if it is a write fault request
1072 * or the caller allows to map a writable pfn for a read fault
1073 * request.
1074 */
1075 if (!(write_fault || writable))
1076 return false;
1077 1083
1078 npages = __get_user_pages_fast(addr, 1, 1, page); 1084 if (writable)
1079 if (npages == 1) { 1085 *writable = true;
1080 *pfn = page_to_pfn(page[0]);
1081 1086
1082 if (writable) 1087 if (atomic || async)
1083 *writable = true; 1088 npages = __get_user_pages_fast(addr, 1, 1, page);
1084 return true;
1085 }
1086 1089
1087 return false; 1090 if (unlikely(npages != 1) && !atomic) {
1088} 1091 might_sleep();
1089 1092
1090/* 1093 if (writable)
1091 * The slow path to get the pfn of the specified host virtual address, 1094 *writable = write_fault;
1092 * 1 indicates success, -errno is returned if error is detected.
1093 */
1094static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1095 bool *writable, pfn_t *pfn)
1096{
1097 struct page *page[1];
1098 int npages = 0;
1099
1100 might_sleep();
1101
1102 if (writable)
1103 *writable = write_fault;
1104 1095
1105 if (async) { 1096 if (async) {
1106 down_read(&current->mm->mmap_sem); 1097 down_read(&current->mm->mmap_sem);
1107 npages = get_user_page_nowait(current, current->mm, 1098 npages = get_user_page_nowait(current, current->mm,
1108 addr, write_fault, page); 1099 addr, write_fault, page);
1109 up_read(&current->mm->mmap_sem); 1100 up_read(&current->mm->mmap_sem);
1110 } else 1101 } else
1111 npages = get_user_pages_fast(addr, 1, write_fault, 1102 npages = get_user_pages_fast(addr, 1, write_fault,
1112 page); 1103 page);
1113 if (npages != 1) 1104
1114 return npages; 1105 /* map read fault as writable if possible */
1115 1106 if (unlikely(!write_fault) && npages == 1) {
1116 /* map read fault as writable if possible */ 1107 struct page *wpage[1];
1117 if (unlikely(!write_fault) && writable) { 1108
1118 struct page *wpage[1]; 1109 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1119 1110 if (npages == 1) {
1120 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1111 *writable = true;
1121 if (npages == 1) { 1112 put_page(page[0]);
1122 *writable = true; 1113 page[0] = wpage[0];
1123 put_page(page[0]); 1114 }
1124 page[0] = wpage[0]; 1115 npages = 1;
1125 } 1116 }
1126
1127 npages = 1;
1128 } 1117 }
1129 *pfn = page_to_pfn(page[0]);
1130 return npages;
1131}
1132 1118
1133static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1119 if (unlikely(npages != 1)) {
1134{ 1120 struct vm_area_struct *vma;
1135 if (unlikely(!(vma->vm_flags & VM_READ)))
1136 return false;
1137
1138 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1139 return false;
1140
1141 return true;
1142}
1143
1144/*
1145 * Pin guest page in memory and return its pfn.
1146 * @addr: host virtual address which maps memory to the guest
1147 * @atomic: whether this function can sleep
1148 * @async: whether this function need to wait IO complete if the
1149 * host page is not in the memory
1150 * @write_fault: whether we should get a writable host page
1151 * @writable: whether it allows to map a writable host page for !@write_fault
1152 *
1153 * The function will map a writable host page for these two cases:
1154 * 1): @write_fault = true
1155 * 2): @write_fault = false && @writable, @writable will tell the caller
1156 * whether the mapping is writable.
1157 */
1158static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1159 bool write_fault, bool *writable)
1160{
1161 struct vm_area_struct *vma;
1162 pfn_t pfn = 0;
1163 int npages;
1164 1121
1165 /* we can do it either atomically or asynchronously, not both */ 1122 if (atomic)
1166 BUG_ON(atomic && async); 1123 return get_fault_pfn();
1167 1124
1168 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1125 down_read(&current->mm->mmap_sem);
1169 return pfn; 1126 if (npages == -EHWPOISON ||
1170 1127 (!async && check_user_page_hwpoison(addr))) {
1171 if (atomic) 1128 up_read(&current->mm->mmap_sem);
1172 return KVM_PFN_ERR_FAULT; 1129 get_page(hwpoison_page);
1130 return page_to_pfn(hwpoison_page);
1131 }
1173 1132
1174 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1133 vma = find_vma_intersection(current->mm, addr, addr+1);
1175 if (npages == 1) 1134
1176 return pfn; 1135 if (vma == NULL)
1136 pfn = get_fault_pfn();
1137 else if ((vma->vm_flags & VM_PFNMAP)) {
1138 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1139 vma->vm_pgoff;
1140 BUG_ON(!kvm_is_mmio_pfn(pfn));
1141 } else {
1142 if (async && (vma->vm_flags & VM_WRITE))
1143 *async = true;
1144 pfn = get_fault_pfn();
1145 }
1146 up_read(&current->mm->mmap_sem);
1147 } else
1148 pfn = page_to_pfn(page[0]);
1177 1149
1178 down_read(&current->mm->mmap_sem);
1179 if (npages == -EHWPOISON ||
1180 (!async && check_user_page_hwpoison(addr))) {
1181 pfn = KVM_PFN_ERR_HWPOISON;
1182 goto exit;
1183 }
1184
1185 vma = find_vma_intersection(current->mm, addr, addr + 1);
1186
1187 if (vma == NULL)
1188 pfn = KVM_PFN_ERR_FAULT;
1189 else if ((vma->vm_flags & VM_PFNMAP)) {
1190 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1191 vma->vm_pgoff;
1192 BUG_ON(!kvm_is_mmio_pfn(pfn));
1193 } else {
1194 if (async && vma_is_valid(vma, write_fault))
1195 *async = true;
1196 pfn = KVM_PFN_ERR_FAULT;
1197 }
1198exit:
1199 up_read(&current->mm->mmap_sem);
1200 return pfn; 1150 return pfn;
1201} 1151}
1202 1152
1203static pfn_t 1153pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1204__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1205 bool *async, bool write_fault, bool *writable)
1206{ 1154{
1207 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1155 return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
1208
1209 if (addr == KVM_HVA_ERR_RO_BAD)
1210 return KVM_PFN_ERR_RO_FAULT;
1211
1212 if (kvm_is_error_hva(addr))
1213 return KVM_PFN_NOSLOT;
1214
1215 /* Do not map writable pfn in the readonly memslot. */
1216 if (writable && memslot_is_readonly(slot)) {
1217 *writable = false;
1218 writable = NULL;
1219 }
1220
1221 return hva_to_pfn(addr, atomic, async, write_fault,
1222 writable);
1223} 1156}
1157EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1224 1158
1225static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1159static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1226 bool write_fault, bool *writable) 1160 bool write_fault, bool *writable)
1227{ 1161{
1228 struct kvm_memory_slot *slot; 1162 unsigned long addr;
1229 1163
1230 if (async) 1164 if (async)
1231 *async = false; 1165 *async = false;
1232 1166
1233 slot = gfn_to_memslot(kvm, gfn); 1167 addr = gfn_to_hva(kvm, gfn);
1168 if (kvm_is_error_hva(addr)) {
1169 get_page(bad_page);
1170 return page_to_pfn(bad_page);
1171 }
1234 1172
1235 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, 1173 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1236 writable);
1237} 1174}
1238 1175
1239pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1176pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1262,17 +1199,13 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1262} 1199}
1263EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1200EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1264 1201
1265pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1202pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1203 struct kvm_memory_slot *slot, gfn_t gfn)
1266{ 1204{
1267 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1205 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1206 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1268} 1207}
1269 1208
1270pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1271{
1272 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1273}
1274EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1275
1276int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1209int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1277 int nr_pages) 1210 int nr_pages)
1278{ 1211{
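
Both variants of hva_to_pfn() in the hunk above follow a broadly similar flow: attempt a non-sleeping pin first where the caller allows it, and fall back to a path that may sleep only when the caller is not atomic. A standalone caricature of that control flow, with made-up pin_fast()/pin_slow() stand-ins rather than the real __get_user_pages_fast()/get_user_pages_fast() interfaces:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long pfn_t;

static bool pin_fast(unsigned long addr, pfn_t *pfn)
{
	(void)addr; (void)pfn;
	return false;		/* pretend the lock-free path missed */
}

static int pin_slow(unsigned long addr, pfn_t *pfn)
{
	*pfn = addr >> 12;	/* stand-in for really pinning the page */
	return 1;
}

static pfn_t addr_to_pfn(unsigned long addr, bool atomic)
{
	pfn_t pfn;

	if (pin_fast(addr, &pfn))
		return pfn;		/* fast path succeeded */
	if (atomic)
		return (pfn_t)-1;	/* cannot sleep here: report a fault */
	if (pin_slow(addr, &pfn) == 1)
		return pfn;
	return (pfn_t)-1;
}

int main(void)
{
	printf("pfn=%lu\n", addr_to_pfn(0x12345000UL, false));
	return 0;
}
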
@@ -1290,49 +1223,37 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1290} 1223}
1291EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1224EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1292 1225
1293static struct page *kvm_pfn_to_page(pfn_t pfn)
1294{
1295 if (is_error_noslot_pfn(pfn))
1296 return KVM_ERR_PTR_BAD_PAGE;
1297
1298 if (kvm_is_mmio_pfn(pfn)) {
1299 WARN_ON(1);
1300 return KVM_ERR_PTR_BAD_PAGE;
1301 }
1302
1303 return pfn_to_page(pfn);
1304}
1305
1306struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1226struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1307{ 1227{
1308 pfn_t pfn; 1228 pfn_t pfn;
1309 1229
1310 pfn = gfn_to_pfn(kvm, gfn); 1230 pfn = gfn_to_pfn(kvm, gfn);
1231 if (!kvm_is_mmio_pfn(pfn))
1232 return pfn_to_page(pfn);
1233
1234 WARN_ON(kvm_is_mmio_pfn(pfn));
1311 1235
1312 return kvm_pfn_to_page(pfn); 1236 get_page(bad_page);
1237 return bad_page;
1313} 1238}
1314 1239
1315EXPORT_SYMBOL_GPL(gfn_to_page); 1240EXPORT_SYMBOL_GPL(gfn_to_page);
1316 1241
1317void kvm_release_page_clean(struct page *page) 1242void kvm_release_page_clean(struct page *page)
1318{ 1243{
1319 WARN_ON(is_error_page(page));
1320
1321 kvm_release_pfn_clean(page_to_pfn(page)); 1244 kvm_release_pfn_clean(page_to_pfn(page));
1322} 1245}
1323EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1246EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1324 1247
1325void kvm_release_pfn_clean(pfn_t pfn) 1248void kvm_release_pfn_clean(pfn_t pfn)
1326{ 1249{
1327 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1250 if (!kvm_is_mmio_pfn(pfn))
1328 put_page(pfn_to_page(pfn)); 1251 put_page(pfn_to_page(pfn));
1329} 1252}
1330EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1253EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1331 1254
1332void kvm_release_page_dirty(struct page *page) 1255void kvm_release_page_dirty(struct page *page)
1333{ 1256{
1334 WARN_ON(is_error_page(page));
1335
1336 kvm_release_pfn_dirty(page_to_pfn(page)); 1257 kvm_release_pfn_dirty(page_to_pfn(page));
1337} 1258}
1338EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1259EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1388,10 +1309,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1388 int r; 1309 int r;
1389 unsigned long addr; 1310 unsigned long addr;
1390 1311
1391 addr = gfn_to_hva_read(kvm, gfn); 1312 addr = gfn_to_hva(kvm, gfn);
1392 if (kvm_is_error_hva(addr)) 1313 if (kvm_is_error_hva(addr))
1393 return -EFAULT; 1314 return -EFAULT;
1394 r = kvm_read_hva(data, (void __user *)addr + offset, len); 1315 r = __copy_from_user(data, (void __user *)addr + offset, len);
1395 if (r) 1316 if (r)
1396 return -EFAULT; 1317 return -EFAULT;
1397 return 0; 1318 return 0;
@@ -1426,11 +1347,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1426 gfn_t gfn = gpa >> PAGE_SHIFT; 1347 gfn_t gfn = gpa >> PAGE_SHIFT;
1427 int offset = offset_in_page(gpa); 1348 int offset = offset_in_page(gpa);
1428 1349
1429 addr = gfn_to_hva_read(kvm, gfn); 1350 addr = gfn_to_hva(kvm, gfn);
1430 if (kvm_is_error_hva(addr)) 1351 if (kvm_is_error_hva(addr))
1431 return -EFAULT; 1352 return -EFAULT;
1432 pagefault_disable(); 1353 pagefault_disable();
1433 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); 1354 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1434 pagefault_enable(); 1355 pagefault_enable();
1435 if (r) 1356 if (r)
1436 return -EFAULT; 1357 return -EFAULT;
@@ -1484,7 +1405,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1484 1405
1485 ghc->gpa = gpa; 1406 ghc->gpa = gpa;
1486 ghc->generation = slots->generation; 1407 ghc->generation = slots->generation;
1487 ghc->memslot = gfn_to_memslot(kvm, gfn); 1408 ghc->memslot = __gfn_to_memslot(slots, gfn);
1488 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1409 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1489 if (!kvm_is_error_hva(ghc->hva)) 1410 if (!kvm_is_error_hva(ghc->hva))
1490 ghc->hva += offset; 1411 ghc->hva += offset;
@@ -1568,7 +1489,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1568 if (memslot && memslot->dirty_bitmap) { 1489 if (memslot && memslot->dirty_bitmap) {
1569 unsigned long rel_gfn = gfn - memslot->base_gfn; 1490 unsigned long rel_gfn = gfn - memslot->base_gfn;
1570 1491
1571 set_bit_le(rel_gfn, memslot->dirty_bitmap); 1492 __set_bit_le(rel_gfn, memslot->dirty_bitmap);
1572 } 1493 }
1573} 1494}
1574 1495
@@ -1605,30 +1526,6 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1605 finish_wait(&vcpu->wq, &wait); 1526 finish_wait(&vcpu->wq, &wait);
1606} 1527}
1607 1528
1608#ifndef CONFIG_S390
1609/*
1610 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
1611 */
1612void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
1613{
1614 int me;
1615 int cpu = vcpu->cpu;
1616 wait_queue_head_t *wqp;
1617
1618 wqp = kvm_arch_vcpu_wq(vcpu);
1619 if (waitqueue_active(wqp)) {
1620 wake_up_interruptible(wqp);
1621 ++vcpu->stat.halt_wakeup;
1622 }
1623
1624 me = get_cpu();
1625 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
1626 if (kvm_arch_vcpu_should_kick(vcpu))
1627 smp_send_reschedule(cpu);
1628 put_cpu();
1629}
1630#endif /* !CONFIG_S390 */
1631
1632void kvm_resched(struct kvm_vcpu *vcpu) 1529void kvm_resched(struct kvm_vcpu *vcpu)
1633{ 1530{
1634 if (!need_resched()) 1531 if (!need_resched())
@@ -1637,68 +1534,6 @@ void kvm_resched(struct kvm_vcpu *vcpu)
1637} 1534}
1638EXPORT_SYMBOL_GPL(kvm_resched); 1535EXPORT_SYMBOL_GPL(kvm_resched);
1639 1536
1640bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1641{
1642 struct pid *pid;
1643 struct task_struct *task = NULL;
1644
1645 rcu_read_lock();
1646 pid = rcu_dereference(target->pid);
1647 if (pid)
1648 task = get_pid_task(target->pid, PIDTYPE_PID);
1649 rcu_read_unlock();
1650 if (!task)
1651 return false;
1652 if (task->flags & PF_VCPU) {
1653 put_task_struct(task);
1654 return false;
1655 }
1656 if (yield_to(task, 1)) {
1657 put_task_struct(task);
1658 return true;
1659 }
1660 put_task_struct(task);
1661 return false;
1662}
1663EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1664
1665#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1666/*
1667 * Helper that checks whether a VCPU is eligible for directed yield.
1668 * Most eligible candidate to yield is decided by following heuristics:
1669 *
1670 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
1671 * (preempted lock holder), indicated by @in_spin_loop.
1672 * Set at the beiginning and cleared at the end of interception/PLE handler.
1673 *
1674 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
1675 * chance last time (mostly it has become eligible now since we have probably
1676 * yielded to lockholder in last iteration. This is done by toggling
1677 * @dy_eligible each time a VCPU checked for eligibility.)
1678 *
1679 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
1680 * to preempted lock-holder could result in wrong VCPU selection and CPU
1681 * burning. Giving priority for a potential lock-holder increases lock
1682 * progress.
1683 *
1684 * Since algorithm is based on heuristics, accessing another VCPU data without
1685 * locking does not harm. It may result in trying to yield to same VCPU, fail
1686 * and continue with next VCPU and so on.
1687 */
1688bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1689{
1690 bool eligible;
1691
1692 eligible = !vcpu->spin_loop.in_spin_loop ||
1693 (vcpu->spin_loop.in_spin_loop &&
1694 vcpu->spin_loop.dy_eligible);
1695
1696 if (vcpu->spin_loop.in_spin_loop)
1697 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1698
1699 return eligible;
1700}
1701#endif
1702void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1537void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1703{ 1538{
1704 struct kvm *kvm = me->kvm; 1539 struct kvm *kvm = me->kvm;
@@ -1708,7 +1543,6 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1708 int pass; 1543 int pass;
1709 int i; 1544 int i;
1710 1545
1711 kvm_vcpu_set_in_spin_loop(me, true);
1712 /* 1546 /*
1713 * We boost the priority of a VCPU that is runnable but not 1547 * We boost the priority of a VCPU that is runnable but not
1714 * currently running, because it got preempted by something 1548 * currently running, because it got preempted by something
@@ -1718,7 +1552,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1718 */ 1552 */
1719 for (pass = 0; pass < 2 && !yielded; pass++) { 1553 for (pass = 0; pass < 2 && !yielded; pass++) {
1720 kvm_for_each_vcpu(i, vcpu, kvm) { 1554 kvm_for_each_vcpu(i, vcpu, kvm) {
1721 if (!pass && i <= last_boosted_vcpu) { 1555 struct task_struct *task = NULL;
1556 struct pid *pid;
1557 if (!pass && i < last_boosted_vcpu) {
1722 i = last_boosted_vcpu; 1558 i = last_boosted_vcpu;
1723 continue; 1559 continue;
1724 } else if (pass && i > last_boosted_vcpu) 1560 } else if (pass && i > last_boosted_vcpu)
@@ -1727,19 +1563,26 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1727 continue; 1563 continue;
1728 if (waitqueue_active(&vcpu->wq)) 1564 if (waitqueue_active(&vcpu->wq))
1729 continue; 1565 continue;
1730 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1566 rcu_read_lock();
1567 pid = rcu_dereference(vcpu->pid);
1568 if (pid)
1569 task = get_pid_task(vcpu->pid, PIDTYPE_PID);
1570 rcu_read_unlock();
1571 if (!task)
1731 continue; 1572 continue;
1732 if (kvm_vcpu_yield_to(vcpu)) { 1573 if (task->flags & PF_VCPU) {
1574 put_task_struct(task);
1575 continue;
1576 }
1577 if (yield_to(task, 1)) {
1578 put_task_struct(task);
1733 kvm->last_boosted_vcpu = i; 1579 kvm->last_boosted_vcpu = i;
1734 yielded = 1; 1580 yielded = 1;
1735 break; 1581 break;
1736 } 1582 }
1583 put_task_struct(task);
1737 } 1584 }
1738 } 1585 }
1739 kvm_vcpu_set_in_spin_loop(me, false);
1740
1741 /* Ensure vcpu is not eligible during next spinloop */
1742 kvm_vcpu_set_dy_eligible(me, false);
1743} 1586}
1744EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1587EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1745 1588
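
The kvm_vcpu_eligible_for_directed_yield() helper, whose definition and caller are removed in the hunks above (left column), skips a PLE-spinning vcpu on alternate eligibility checks so that a probable lock holder gets priority. The heuristic itself can be modelled in isolation as standalone C with simplified field names:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_model {
	bool in_spin_loop;	/* set while the vcpu is handling a pause-loop exit */
	bool dy_eligible;	/* flipped on every eligibility check */
};

static bool eligible_for_yield(struct vcpu_model *v)
{
	bool eligible = !v->in_spin_loop || v->dy_eligible;

	if (v->in_spin_loop)
		v->dy_eligible = !v->dy_eligible;	/* give it a turn next time round */

	return eligible;
}

int main(void)
{
	struct vcpu_model v = { .in_spin_loop = true, .dy_eligible = false };

	for (int i = 0; i < 4; i++)
		printf("check %d: %s\n", i, eligible_for_yield(&v) ? "yield to it" : "skip");
	return 0;
}
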
@@ -1759,7 +1602,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1759 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1602 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1760#endif 1603#endif
1761 else 1604 else
1762 return kvm_arch_vcpu_fault(vcpu, vmf); 1605 return VM_FAULT_SIGBUS;
1763 get_page(page); 1606 get_page(page);
1764 vmf->page = page; 1607 vmf->page = page;
1765 return 0; 1608 return 0;
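
The fault handler above is what backs userspace's mmap() of a vcpu fd with the kvm_run page (and, where configured, the coalesced-MMIO ring). A hedged userspace sketch of that mapping, assuming kvmfd and vcpufd were obtained from /dev/kvm and KVM_CREATE_VCPU, with a hypothetical map_vcpu_run() helper:

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Map the shared kvm_run area of a vcpu fd; the size comes from the /dev/kvm fd. */
static struct kvm_run *map_vcpu_run(int kvmfd, int vcpufd)
{
	long size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, 0);
	void *p;

	if (size < 0)
		return NULL;

	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
	return p == MAP_FAILED ? NULL : (struct kvm_run *)p;
}
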
@@ -1820,10 +1663,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1820 goto vcpu_destroy; 1663 goto vcpu_destroy;
1821 1664
1822 mutex_lock(&kvm->lock); 1665 mutex_lock(&kvm->lock);
1823 if (!kvm_vcpu_compatible(vcpu)) {
1824 r = -EINVAL;
1825 goto unlock_vcpu_destroy;
1826 }
1827 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1666 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1828 r = -EINVAL; 1667 r = -EINVAL;
1829 goto unlock_vcpu_destroy; 1668 goto unlock_vcpu_destroy;
@@ -1849,8 +1688,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1849 smp_wmb(); 1688 smp_wmb();
1850 atomic_inc(&kvm->online_vcpus); 1689 atomic_inc(&kvm->online_vcpus);
1851 1690
1691#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1692 if (kvm->bsp_vcpu_id == id)
1693 kvm->bsp_vcpu = vcpu;
1694#endif
1852 mutex_unlock(&kvm->lock); 1695 mutex_unlock(&kvm->lock);
1853 kvm_arch_vcpu_postcreate(vcpu);
1854 return r; 1696 return r;
1855 1697
1856unlock_vcpu_destroy: 1698unlock_vcpu_destroy:
@@ -1893,9 +1735,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
1893#endif 1735#endif
1894 1736
1895 1737
1896 r = vcpu_load(vcpu); 1738 vcpu_load(vcpu);
1897 if (r)
1898 return r;
1899 switch (ioctl) { 1739 switch (ioctl) {
1900 case KVM_RUN: 1740 case KVM_RUN:
1901 r = -EINVAL; 1741 r = -EINVAL;
@@ -1926,12 +1766,17 @@ out_free1:
1926 struct kvm_regs *kvm_regs; 1766 struct kvm_regs *kvm_regs;
1927 1767
1928 r = -ENOMEM; 1768 r = -ENOMEM;
1929 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 1769 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1930 if (IS_ERR(kvm_regs)) { 1770 if (!kvm_regs)
1931 r = PTR_ERR(kvm_regs);
1932 goto out; 1771 goto out;
1933 } 1772 r = -EFAULT;
1773 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1774 goto out_free2;
1934 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1775 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1776 if (r)
1777 goto out_free2;
1778 r = 0;
1779out_free2:
1935 kfree(kvm_regs); 1780 kfree(kvm_regs);
1936 break; 1781 break;
1937 } 1782 }
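
The left-hand side of the hunk above uses memdup_user(), which folds the kzalloc()-plus-copy_from_user() sequence on the right into a single call that reports failure through ERR_PTR(). A kernel-side sketch of that pattern only, not a standalone program; handle_set_regs() is a hypothetical wrapper, not a function in this file:

/* Kernel-side sketch of the memdup_user() pattern. */
static int handle_set_regs(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm_regs *kvm_regs;
	int r;

	/* memdup_user() = allocate + copy_from_user(), ERR_PTR on failure. */
	kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
	if (IS_ERR(kvm_regs))
		return PTR_ERR(kvm_regs);

	r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
	kfree(kvm_regs);
	return r;
}
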
@@ -1950,13 +1795,17 @@ out_free1:
1950 break; 1795 break;
1951 } 1796 }
1952 case KVM_SET_SREGS: { 1797 case KVM_SET_SREGS: {
1953 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 1798 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1954 if (IS_ERR(kvm_sregs)) { 1799 r = -ENOMEM;
1955 r = PTR_ERR(kvm_sregs); 1800 if (!kvm_sregs)
1956 kvm_sregs = NULL; 1801 goto out;
1802 r = -EFAULT;
1803 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1957 goto out; 1804 goto out;
1958 }
1959 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1805 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1806 if (r)
1807 goto out;
1808 r = 0;
1960 break; 1809 break;
1961 } 1810 }
1962 case KVM_GET_MP_STATE: { 1811 case KVM_GET_MP_STATE: {
@@ -1978,6 +1827,9 @@ out_free1:
1978 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1827 if (copy_from_user(&mp_state, argp, sizeof mp_state))
1979 goto out; 1828 goto out;
1980 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1829 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1830 if (r)
1831 goto out;
1832 r = 0;
1981 break; 1833 break;
1982 } 1834 }
1983 case KVM_TRANSLATE: { 1835 case KVM_TRANSLATE: {
@@ -2002,6 +1854,9 @@ out_free1:
2002 if (copy_from_user(&dbg, argp, sizeof dbg)) 1854 if (copy_from_user(&dbg, argp, sizeof dbg))
2003 goto out; 1855 goto out;
2004 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1856 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1857 if (r)
1858 goto out;
1859 r = 0;
2005 break; 1860 break;
2006 } 1861 }
2007 case KVM_SET_SIGNAL_MASK: { 1862 case KVM_SET_SIGNAL_MASK: {
@@ -2042,13 +1897,17 @@ out_free1:
2042 break; 1897 break;
2043 } 1898 }
2044 case KVM_SET_FPU: { 1899 case KVM_SET_FPU: {
2045 fpu = memdup_user(argp, sizeof(*fpu)); 1900 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
2046 if (IS_ERR(fpu)) { 1901 r = -ENOMEM;
2047 r = PTR_ERR(fpu); 1902 if (!fpu)
2048 fpu = NULL; 1903 goto out;
1904 r = -EFAULT;
1905 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
2049 goto out; 1906 goto out;
2050 }
2051 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1907 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1908 if (r)
1909 goto out;
1910 r = 0;
2052 break; 1911 break;
2053 } 1912 }
2054 default: 1913 default:
@@ -2091,10 +1950,9 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
2091 if (copy_from_user(&csigset, sigmask_arg->sigset, 1950 if (copy_from_user(&csigset, sigmask_arg->sigset,
2092 sizeof csigset)) 1951 sizeof csigset))
2093 goto out; 1952 goto out;
2094 sigset_from_compat(&sigset, &csigset); 1953 }
2095 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 1954 sigset_from_compat(&sigset, &csigset);
2096 } else 1955 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2097 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
2098 break; 1956 break;
2099 } 1957 }
2100 default: 1958 default:
@@ -2118,6 +1976,8 @@ static long kvm_vm_ioctl(struct file *filp,
2118 switch (ioctl) { 1976 switch (ioctl) {
2119 case KVM_CREATE_VCPU: 1977 case KVM_CREATE_VCPU:
2120 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1978 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1979 if (r < 0)
1980 goto out;
2121 break; 1981 break;
2122 case KVM_SET_USER_MEMORY_REGION: { 1982 case KVM_SET_USER_MEMORY_REGION: {
2123 struct kvm_userspace_memory_region kvm_userspace_mem; 1983 struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -2128,6 +1988,8 @@ static long kvm_vm_ioctl(struct file *filp,
2128 goto out; 1988 goto out;
2129 1989
2130 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1990 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1991 if (r)
1992 goto out;
2131 break; 1993 break;
2132 } 1994 }
2133 case KVM_GET_DIRTY_LOG: { 1995 case KVM_GET_DIRTY_LOG: {
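
For reference, the KVM_SET_USER_MEMORY_REGION case above is driven from userspace roughly as in the following hedged sketch; add_memslot() and the slot 0 layout are illustrative assumptions, not part of this patch:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Map anonymous memory and register it as guest slot 0 (illustrative values). */
static int add_memslot(int vmfd, __u64 guest_phys, size_t size)
{
	void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region;

	if (host == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = 0;
	region.guest_phys_addr = guest_phys;
	region.memory_size = size;
	region.userspace_addr = (__u64)(unsigned long)host;

	/* The vm ioctl handler above copies this struct in and installs the slot. */
	return ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
}
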
@@ -2137,6 +1999,8 @@ static long kvm_vm_ioctl(struct file *filp,
2137 if (copy_from_user(&log, argp, sizeof log)) 1999 if (copy_from_user(&log, argp, sizeof log))
2138 goto out; 2000 goto out;
2139 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2001 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2002 if (r)
2003 goto out;
2140 break; 2004 break;
2141 } 2005 }
2142#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2006#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
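
KVM_GET_DIRTY_LOG, handled just above, expects userspace to supply a per-slot bitmap with one bit per 4 KiB guest page. A hedged sketch with a hypothetical fetch_dirty_log() helper and slot_bytes parameter:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch (and thereby reset) the dirty bitmap for memslot 0. */
static int fetch_dirty_log(int vmfd, size_t slot_bytes, uint64_t **bitmap)
{
	size_t pages = slot_bytes / 4096;
	struct kvm_dirty_log log;

	*bitmap = calloc((pages + 63) / 64, sizeof(uint64_t));
	if (!*bitmap)
		return -1;

	memset(&log, 0, sizeof(log));
	log.slot = 0;
	log.dirty_bitmap = *bitmap;

	return ioctl(vmfd, KVM_GET_DIRTY_LOG, &log);
}
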
@@ -2146,6 +2010,9 @@ static long kvm_vm_ioctl(struct file *filp,
2146 if (copy_from_user(&zone, argp, sizeof zone)) 2010 if (copy_from_user(&zone, argp, sizeof zone))
2147 goto out; 2011 goto out;
2148 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2012 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2013 if (r)
2014 goto out;
2015 r = 0;
2149 break; 2016 break;
2150 } 2017 }
2151 case KVM_UNREGISTER_COALESCED_MMIO: { 2018 case KVM_UNREGISTER_COALESCED_MMIO: {
@@ -2154,6 +2021,9 @@ static long kvm_vm_ioctl(struct file *filp,
2154 if (copy_from_user(&zone, argp, sizeof zone)) 2021 if (copy_from_user(&zone, argp, sizeof zone))
2155 goto out; 2022 goto out;
2156 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2023 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2024 if (r)
2025 goto out;
2026 r = 0;
2157 break; 2027 break;
2158 } 2028 }
2159#endif 2029#endif
@@ -2163,7 +2033,7 @@ static long kvm_vm_ioctl(struct file *filp,
2163 r = -EFAULT; 2033 r = -EFAULT;
2164 if (copy_from_user(&data, argp, sizeof data)) 2034 if (copy_from_user(&data, argp, sizeof data))
2165 goto out; 2035 goto out;
2166 r = kvm_irqfd(kvm, &data); 2036 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
2167 break; 2037 break;
2168 } 2038 }
2169 case KVM_IOEVENTFD: { 2039 case KVM_IOEVENTFD: {
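
Both sides of the KVM_IRQFD hunk consume the same userspace structure; only the in-kernel signature of kvm_irqfd() differs. From userspace the binding looks roughly like this sketch, where attach_irqfd() is a hypothetical helper:

#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Bind an eventfd to guest GSI `gsi`: signalling the eventfd injects the IRQ. */
static int attach_irqfd(int vmfd, unsigned int gsi)
{
	int efd = eventfd(0, 0);
	struct kvm_irqfd irqfd;

	if (efd < 0)
		return -1;

	memset(&irqfd, 0, sizeof(irqfd));
	irqfd.fd = efd;
	irqfd.gsi = gsi;
	/* irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN would detach it again. */

	return ioctl(vmfd, KVM_IRQFD, &irqfd);
}
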
@@ -2186,40 +2056,6 @@ static long kvm_vm_ioctl(struct file *filp,
2186 mutex_unlock(&kvm->lock); 2056 mutex_unlock(&kvm->lock);
2187 break; 2057 break;
2188#endif 2058#endif
2189#ifdef CONFIG_HAVE_KVM_MSI
2190 case KVM_SIGNAL_MSI: {
2191 struct kvm_msi msi;
2192
2193 r = -EFAULT;
2194 if (copy_from_user(&msi, argp, sizeof msi))
2195 goto out;
2196 r = kvm_send_userspace_msi(kvm, &msi);
2197 break;
2198 }
2199#endif
2200#ifdef __KVM_HAVE_IRQ_LINE
2201 case KVM_IRQ_LINE_STATUS:
2202 case KVM_IRQ_LINE: {
2203 struct kvm_irq_level irq_event;
2204
2205 r = -EFAULT;
2206 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2207 goto out;
2208
2209 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2210 if (r)
2211 goto out;
2212
2213 r = -EFAULT;
2214 if (ioctl == KVM_IRQ_LINE_STATUS) {
2215 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2216 goto out;
2217 }
2218
2219 r = 0;
2220 break;
2221 }
2222#endif
2223 default: 2059 default:
2224 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2060 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2225 if (r == -ENOTTY) 2061 if (r == -ENOTTY)
@@ -2262,6 +2098,8 @@ static long kvm_vm_compat_ioctl(struct file *filp,
2262 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2098 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2263 2099
2264 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2100 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2101 if (r)
2102 goto out;
2265 break; 2103 break;
2266 } 2104 }
2267 default: 2105 default:
@@ -2314,12 +2152,12 @@ static struct file_operations kvm_vm_fops = {
2314 .llseek = noop_llseek, 2152 .llseek = noop_llseek,
2315}; 2153};
2316 2154
2317static int kvm_dev_ioctl_create_vm(unsigned long type) 2155static int kvm_dev_ioctl_create_vm(void)
2318{ 2156{
2319 int r; 2157 int r;
2320 struct kvm *kvm; 2158 struct kvm *kvm;
2321 2159
2322 kvm = kvm_create_vm(type); 2160 kvm = kvm_create_vm();
2323 if (IS_ERR(kvm)) 2161 if (IS_ERR(kvm))
2324 return PTR_ERR(kvm); 2162 return PTR_ERR(kvm);
2325#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2163#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2346,11 +2184,8 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
2346 case KVM_CAP_SET_BOOT_CPU_ID: 2184 case KVM_CAP_SET_BOOT_CPU_ID:
2347#endif 2185#endif
2348 case KVM_CAP_INTERNAL_ERROR_DATA: 2186 case KVM_CAP_INTERNAL_ERROR_DATA:
2349#ifdef CONFIG_HAVE_KVM_MSI
2350 case KVM_CAP_SIGNAL_MSI:
2351#endif
2352 return 1; 2187 return 1;
2353#ifdef KVM_CAP_IRQ_ROUTING 2188#ifdef CONFIG_HAVE_KVM_IRQCHIP
2354 case KVM_CAP_IRQ_ROUTING: 2189 case KVM_CAP_IRQ_ROUTING:
2355 return KVM_MAX_IRQ_ROUTES; 2190 return KVM_MAX_IRQ_ROUTES;
2356#endif 2191#endif
@@ -2373,7 +2208,10 @@ static long kvm_dev_ioctl(struct file *filp,
2373 r = KVM_API_VERSION; 2208 r = KVM_API_VERSION;
2374 break; 2209 break;
2375 case KVM_CREATE_VM: 2210 case KVM_CREATE_VM:
2376 r = kvm_dev_ioctl_create_vm(arg); 2211 r = -EINVAL;
2212 if (arg)
2213 goto out;
2214 r = kvm_dev_ioctl_create_vm();
2377 break; 2215 break;
2378 case KVM_CHECK_EXTENSION: 2216 case KVM_CHECK_EXTENSION:
2379 r = kvm_dev_ioctl_check_extension_generic(arg); 2217 r = kvm_dev_ioctl_check_extension_generic(arg);
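
The right-hand side of the KVM_CREATE_VM hunk reinstates the check that the ioctl takes no argument (the removed variant forwarded arg as a VM type). A minimal userspace sketch of the call sequence, with error handling reduced to early returns:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvmfd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vmfd, version;

	if (kvmfd < 0)
		return 1;

	version = ioctl(kvmfd, KVM_GET_API_VERSION, 0);
	if (version != KVM_API_VERSION)
		return 1;

	/* With this change the argument must be 0; a nonzero VM "type" is rejected. */
	vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
	if (vmfd < 0)
		return 1;

	printf("created VM fd %d\n", vmfd);
	return 0;
}
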
@@ -2553,89 +2391,24 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2553 int i; 2391 int i;
2554 2392
2555 for (i = 0; i < bus->dev_count; i++) { 2393 for (i = 0; i < bus->dev_count; i++) {
2556 struct kvm_io_device *pos = bus->range[i].dev; 2394 struct kvm_io_device *pos = bus->devs[i];
2557 2395
2558 kvm_iodevice_destructor(pos); 2396 kvm_iodevice_destructor(pos);
2559 } 2397 }
2560 kfree(bus); 2398 kfree(bus);
2561} 2399}
2562 2400
2563int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2564{
2565 const struct kvm_io_range *r1 = p1;
2566 const struct kvm_io_range *r2 = p2;
2567
2568 if (r1->addr < r2->addr)
2569 return -1;
2570 if (r1->addr + r1->len > r2->addr + r2->len)
2571 return 1;
2572 return 0;
2573}
2574
2575int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2576 gpa_t addr, int len)
2577{
2578 bus->range[bus->dev_count++] = (struct kvm_io_range) {
2579 .addr = addr,
2580 .len = len,
2581 .dev = dev,
2582 };
2583
2584 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
2585 kvm_io_bus_sort_cmp, NULL);
2586
2587 return 0;
2588}
2589
2590int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2591 gpa_t addr, int len)
2592{
2593 struct kvm_io_range *range, key;
2594 int off;
2595
2596 key = (struct kvm_io_range) {
2597 .addr = addr,
2598 .len = len,
2599 };
2600
2601 range = bsearch(&key, bus->range, bus->dev_count,
2602 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
2603 if (range == NULL)
2604 return -ENOENT;
2605
2606 off = range - bus->range;
2607
2608 while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
2609 off--;
2610
2611 return off;
2612}
2613
2614/* kvm_io_bus_write - called under kvm->slots_lock */ 2401/* kvm_io_bus_write - called under kvm->slots_lock */
2615int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2402int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2616 int len, const void *val) 2403 int len, const void *val)
2617{ 2404{
2618 int idx; 2405 int i;
2619 struct kvm_io_bus *bus; 2406 struct kvm_io_bus *bus;
2620 struct kvm_io_range range;
2621
2622 range = (struct kvm_io_range) {
2623 .addr = addr,
2624 .len = len,
2625 };
2626 2407
2627 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2408 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2628 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2409 for (i = 0; i < bus->dev_count; i++)
2629 if (idx < 0) 2410 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2630 return -EOPNOTSUPP;
2631
2632 while (idx < bus->dev_count &&
2633 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2634 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2635 return 0; 2411 return 0;
2636 idx++;
2637 }
2638
2639 return -EOPNOTSUPP; 2412 return -EOPNOTSUPP;
2640} 2413}
2641 2414
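
The functions removed above keep the bus ranges sorted on insert and locate a device with bsearch(); the replacement falls back to a linear scan over devs[]. A small standalone sketch of that sorted-range lookup, using simplified, hypothetical io_range types:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the removed kvm_io_range machinery. */
struct io_range { unsigned long addr; int len; void *dev; };

/* Same ordering as kvm_io_bus_sort_cmp(): a key contained in a range compares equal. */
static int range_cmp(const void *p1, const void *p2)
{
	const struct io_range *r1 = p1, *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

int main(void)
{
	struct io_range bus[3] = {
		{ 0x3f8, 8, NULL }, { 0x60, 1, NULL }, { 0xcf8, 4, NULL }
	};
	struct io_range key = { 0x60, 1, NULL };
	struct io_range *hit;

	/* Sort on insert, then bsearch() on lookup, mirroring
	 * kvm_io_bus_insert_dev() / kvm_io_bus_get_first_dev(). */
	qsort(bus, 3, sizeof(bus[0]), range_cmp);
	hit = bsearch(&key, bus, 3, sizeof(bus[0]), range_cmp);
	printf("found range at 0x%lx\n", hit ? hit->addr : 0UL);
	return 0;
}
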
@@ -2643,47 +2416,31 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2643int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2416int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2644 int len, void *val) 2417 int len, void *val)
2645{ 2418{
2646 int idx; 2419 int i;
2647 struct kvm_io_bus *bus; 2420 struct kvm_io_bus *bus;
2648 struct kvm_io_range range;
2649
2650 range = (struct kvm_io_range) {
2651 .addr = addr,
2652 .len = len,
2653 };
2654 2421
2655 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2422 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2656 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2423 for (i = 0; i < bus->dev_count; i++)
2657 if (idx < 0) 2424 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2658 return -EOPNOTSUPP;
2659
2660 while (idx < bus->dev_count &&
2661 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2662 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2663 return 0; 2425 return 0;
2664 idx++;
2665 }
2666
2667 return -EOPNOTSUPP; 2426 return -EOPNOTSUPP;
2668} 2427}
2669 2428
2670/* Caller must hold slots_lock. */ 2429/* Caller must hold slots_lock. */
2671int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2430int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2672 int len, struct kvm_io_device *dev) 2431 struct kvm_io_device *dev)
2673{ 2432{
2674 struct kvm_io_bus *new_bus, *bus; 2433 struct kvm_io_bus *new_bus, *bus;
2675 2434
2676 bus = kvm->buses[bus_idx]; 2435 bus = kvm->buses[bus_idx];
2677 if (bus->dev_count > NR_IOBUS_DEVS - 1) 2436 if (bus->dev_count > NR_IOBUS_DEVS-1)
2678 return -ENOSPC; 2437 return -ENOSPC;
2679 2438
2680 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 2439 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2681 sizeof(struct kvm_io_range)), GFP_KERNEL);
2682 if (!new_bus) 2440 if (!new_bus)
2683 return -ENOMEM; 2441 return -ENOMEM;
2684 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 2442 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2685 sizeof(struct kvm_io_range))); 2443 new_bus->devs[new_bus->dev_count++] = dev;
2686 kvm_io_bus_insert_dev(new_bus, dev, addr, len);
2687 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2444 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2688 synchronize_srcu_expedited(&kvm->srcu); 2445 synchronize_srcu_expedited(&kvm->srcu);
2689 kfree(bus); 2446 kfree(bus);
@@ -2698,26 +2455,25 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2698 int i, r; 2455 int i, r;
2699 struct kvm_io_bus *new_bus, *bus; 2456 struct kvm_io_bus *new_bus, *bus;
2700 2457
2458 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2459 if (!new_bus)
2460 return -ENOMEM;
2461
2701 bus = kvm->buses[bus_idx]; 2462 bus = kvm->buses[bus_idx];
2463 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2464
2702 r = -ENOENT; 2465 r = -ENOENT;
2703 for (i = 0; i < bus->dev_count; i++) 2466 for (i = 0; i < new_bus->dev_count; i++)
2704 if (bus->range[i].dev == dev) { 2467 if (new_bus->devs[i] == dev) {
2705 r = 0; 2468 r = 0;
2469 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
2706 break; 2470 break;
2707 } 2471 }
2708 2472
2709 if (r) 2473 if (r) {
2474 kfree(new_bus);
2710 return r; 2475 return r;
2711 2476 }
2712 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
2713 sizeof(struct kvm_io_range)), GFP_KERNEL);
2714 if (!new_bus)
2715 return -ENOMEM;
2716
2717 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
2718 new_bus->dev_count--;
2719 memcpy(new_bus->range + i, bus->range + i + 1,
2720 (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
2721 2477
2722 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2478 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2723 synchronize_srcu_expedited(&kvm->srcu); 2479 synchronize_srcu_expedited(&kvm->srcu);
@@ -2768,29 +2524,15 @@ static const struct file_operations *stat_fops[] = {
2768 [KVM_STAT_VM] = &vm_stat_fops, 2524 [KVM_STAT_VM] = &vm_stat_fops,
2769}; 2525};
2770 2526
2771static int kvm_init_debug(void) 2527static void kvm_init_debug(void)
2772{ 2528{
2773 int r = -EFAULT;
2774 struct kvm_stats_debugfs_item *p; 2529 struct kvm_stats_debugfs_item *p;
2775 2530
2776 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2531 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2777 if (kvm_debugfs_dir == NULL) 2532 for (p = debugfs_entries; p->name; ++p)
2778 goto out;
2779
2780 for (p = debugfs_entries; p->name; ++p) {
2781 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2533 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2782 (void *)(long)p->offset, 2534 (void *)(long)p->offset,
2783 stat_fops[p->kind]); 2535 stat_fops[p->kind]);
2784 if (p->dentry == NULL)
2785 goto out_dir;
2786 }
2787
2788 return 0;
2789
2790out_dir:
2791 debugfs_remove_recursive(kvm_debugfs_dir);
2792out:
2793 return r;
2794} 2536}
2795 2537
2796static void kvm_exit_debug(void) 2538static void kvm_exit_debug(void)
@@ -2822,6 +2564,9 @@ static struct syscore_ops kvm_syscore_ops = {
2822 .resume = kvm_resume, 2564 .resume = kvm_resume,
2823}; 2565};
2824 2566
2567struct page *bad_page;
2568pfn_t bad_pfn;
2569
2825static inline 2570static inline
2826struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2571struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2827{ 2572{
@@ -2853,6 +2598,33 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2853 if (r) 2598 if (r)
2854 goto out_fail; 2599 goto out_fail;
2855 2600
2601 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2602
2603 if (bad_page == NULL) {
2604 r = -ENOMEM;
2605 goto out;
2606 }
2607
2608 bad_pfn = page_to_pfn(bad_page);
2609
2610 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2611
2612 if (hwpoison_page == NULL) {
2613 r = -ENOMEM;
2614 goto out_free_0;
2615 }
2616
2617 hwpoison_pfn = page_to_pfn(hwpoison_page);
2618
2619 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2620
2621 if (fault_page == NULL) {
2622 r = -ENOMEM;
2623 goto out_free_0;
2624 }
2625
2626 fault_pfn = page_to_pfn(fault_page);
2627
2856 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2628 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2857 r = -ENOMEM; 2629 r = -ENOMEM;
2858 goto out_free_0; 2630 goto out_free_0;
@@ -2904,16 +2676,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2904 kvm_preempt_ops.sched_in = kvm_sched_in; 2676 kvm_preempt_ops.sched_in = kvm_sched_in;
2905 kvm_preempt_ops.sched_out = kvm_sched_out; 2677 kvm_preempt_ops.sched_out = kvm_sched_out;
2906 2678
2907 r = kvm_init_debug(); 2679 kvm_init_debug();
2908 if (r) {
2909 printk(KERN_ERR "kvm: create debugfs files failed\n");
2910 goto out_undebugfs;
2911 }
2912 2680
2913 return 0; 2681 return 0;
2914 2682
2915out_undebugfs:
2916 unregister_syscore_ops(&kvm_syscore_ops);
2917out_unreg: 2683out_unreg:
2918 kvm_async_pf_deinit(); 2684 kvm_async_pf_deinit();
2919out_free: 2685out_free:
@@ -2927,6 +2693,12 @@ out_free_1:
2927out_free_0a: 2693out_free_0a:
2928 free_cpumask_var(cpus_hardware_enabled); 2694 free_cpumask_var(cpus_hardware_enabled);
2929out_free_0: 2695out_free_0:
2696 if (fault_page)
2697 __free_page(fault_page);
2698 if (hwpoison_page)
2699 __free_page(hwpoison_page);
2700 __free_page(bad_page);
2701out:
2930 kvm_arch_exit(); 2702 kvm_arch_exit();
2931out_fail: 2703out_fail:
2932 return r; 2704 return r;
@@ -2946,5 +2718,7 @@ void kvm_exit(void)
2946 kvm_arch_hardware_unsetup(); 2718 kvm_arch_hardware_unsetup();
2947 kvm_arch_exit(); 2719 kvm_arch_exit();
2948 free_cpumask_var(cpus_hardware_enabled); 2720 free_cpumask_var(cpus_hardware_enabled);
2721 __free_page(hwpoison_page);
2722 __free_page(bad_page);
2949} 2723}
2950EXPORT_SYMBOL_GPL(kvm_exit); 2724EXPORT_SYMBOL_GPL(kvm_exit);