path: root/virt
author    Nicholas Bellinger <nab@linux-iscsi.org>    2011-01-16 16:21:04 -0500
committer Nicholas Bellinger <nab@linux-iscsi.org>    2011-01-16 16:21:04 -0500
commit    f652f6c5b7cfdf139f4155d78f397e99ae1c4acc (patch)
tree      71c6344688bf56ea6aaf18c586ab69ff4f077ade /virt
parent    140e3008e7fe1526cbb12f8f07dbc273ac713b75 (diff)
parent    c66ac9db8d4ad9994a02b3e933ea2ccc643e1fe5 (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/jejb/scsi-post-merge-2.6 into for-linus
Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig        |   3
-rw-r--r--  virt/kvm/assigned-dev.c | 125
-rw-r--r--  virt/kvm/async_pf.c     | 216
-rw-r--r--  virt/kvm/async_pf.h     |  36
-rw-r--r--  virt/kvm/eventfd.c      |  91
-rw-r--r--  virt/kvm/irq_comm.c     |   7
-rw-r--r--  virt/kvm/kvm_main.c     | 373
7 files changed, 666 insertions, 185 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 7f1178f6b839..f63ccb0a5982 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
15 15
16config KVM_MMIO 16config KVM_MMIO
17 bool 17 bool
18
19config KVM_ASYNC_PF
20 bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 7c98928b09d9..ae72ae604c89 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
55 return index; 55 return index;
56} 56}
57 57
58static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 58static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
59{ 59{
60 struct kvm_assigned_dev_kernel *assigned_dev; 60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 int i; 61 u32 vector;
62 int index;
62 63
63 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 64 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
64 interrupt_work); 65 spin_lock(&assigned_dev->intx_lock);
66 disable_irq_nosync(irq);
67 assigned_dev->host_irq_disabled = true;
68 spin_unlock(&assigned_dev->intx_lock);
69 }
65 70
66 spin_lock_irq(&assigned_dev->assigned_dev_lock);
67 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 71 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
68 struct kvm_guest_msix_entry *guest_entries = 72 index = find_index_from_host_irq(assigned_dev, irq);
69 assigned_dev->guest_msix_entries; 73 if (index >= 0) {
70 for (i = 0; i < assigned_dev->entries_nr; i++) { 74 vector = assigned_dev->
71 if (!(guest_entries[i].flags & 75 guest_msix_entries[index].vector;
72 KVM_ASSIGNED_MSIX_PENDING))
73 continue;
74 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
75 kvm_set_irq(assigned_dev->kvm, 76 kvm_set_irq(assigned_dev->kvm,
76 assigned_dev->irq_source_id, 77 assigned_dev->irq_source_id, vector, 1);
77 guest_entries[i].vector, 1);
78 } 78 }
79 } else 79 } else
80 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 80 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
81 assigned_dev->guest_irq, 1); 81 assigned_dev->guest_irq, 1);
82 82
83 spin_unlock_irq(&assigned_dev->assigned_dev_lock);
84}
85
86static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
87{
88 unsigned long flags;
89 struct kvm_assigned_dev_kernel *assigned_dev =
90 (struct kvm_assigned_dev_kernel *) dev_id;
91
92 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
93 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
94 int index = find_index_from_host_irq(assigned_dev, irq);
95 if (index < 0)
96 goto out;
97 assigned_dev->guest_msix_entries[index].flags |=
98 KVM_ASSIGNED_MSIX_PENDING;
99 }
100
101 schedule_work(&assigned_dev->interrupt_work);
102
103 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
104 disable_irq_nosync(irq);
105 assigned_dev->host_irq_disabled = true;
106 }
107
108out:
109 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
110 return IRQ_HANDLED; 83 return IRQ_HANDLED;
111} 84}
112 85
@@ -114,7 +87,6 @@ out:
114static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 87static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
115{ 88{
116 struct kvm_assigned_dev_kernel *dev; 89 struct kvm_assigned_dev_kernel *dev;
117 unsigned long flags;
118 90
119 if (kian->gsi == -1) 91 if (kian->gsi == -1)
120 return; 92 return;
@@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
127 /* The guest irq may be shared so this ack may be 99 /* The guest irq may be shared so this ack may be
128 * from another device. 100 * from another device.
129 */ 101 */
130 spin_lock_irqsave(&dev->assigned_dev_lock, flags); 102 spin_lock(&dev->intx_lock);
131 if (dev->host_irq_disabled) { 103 if (dev->host_irq_disabled) {
132 enable_irq(dev->host_irq); 104 enable_irq(dev->host_irq);
133 dev->host_irq_disabled = false; 105 dev->host_irq_disabled = false;
134 } 106 }
135 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); 107 spin_unlock(&dev->intx_lock);
136} 108}
137 109
138static void deassign_guest_irq(struct kvm *kvm, 110static void deassign_guest_irq(struct kvm *kvm,
@@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm,
141 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); 113 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
142 assigned_dev->ack_notifier.gsi = -1; 114 assigned_dev->ack_notifier.gsi = -1;
143 115
116 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
117 assigned_dev->guest_irq, 0);
118
144 if (assigned_dev->irq_source_id != -1) 119 if (assigned_dev->irq_source_id != -1)
145 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 120 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
146 assigned_dev->irq_source_id = -1; 121 assigned_dev->irq_source_id = -1;
@@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm,
152 struct kvm_assigned_dev_kernel *assigned_dev) 127 struct kvm_assigned_dev_kernel *assigned_dev)
153{ 128{
154 /* 129 /*
155 * In kvm_free_device_irq, cancel_work_sync return true if: 130 * We disable irq here to prevent further events.
156 * 1. work is scheduled, and then cancelled.
157 * 2. work callback is executed.
158 *
159 * The first one ensured that the irq is disabled and no more events
160 * would happen. But for the second one, the irq may be enabled (e.g.
161 * for MSI). So we disable irq here to prevent further events.
162 * 131 *
163 * Notice this maybe result in nested disable if the interrupt type is 132 * Notice this maybe result in nested disable if the interrupt type is
164 * INTx, but it's OK for we are going to free it. 133 * INTx, but it's OK for we are going to free it.
165 * 134 *
166 * If this function is a part of VM destroy, please ensure that till 135 * If this function is a part of VM destroy, please ensure that till
167 * now, the kvm state is still legal for probably we also have to wait 136 * now, the kvm state is still legal for probably we also have to wait
168 * interrupt_work done. 137 * on a currently running IRQ handler.
169 */ 138 */
170 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 139 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
171 int i; 140 int i;
172 for (i = 0; i < assigned_dev->entries_nr; i++) 141 for (i = 0; i < assigned_dev->entries_nr; i++)
173 disable_irq_nosync(assigned_dev-> 142 disable_irq(assigned_dev->host_msix_entries[i].vector);
174 host_msix_entries[i].vector);
175
176 cancel_work_sync(&assigned_dev->interrupt_work);
177 143
178 for (i = 0; i < assigned_dev->entries_nr; i++) 144 for (i = 0; i < assigned_dev->entries_nr; i++)
179 free_irq(assigned_dev->host_msix_entries[i].vector, 145 free_irq(assigned_dev->host_msix_entries[i].vector,
@@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm,
185 pci_disable_msix(assigned_dev->dev); 151 pci_disable_msix(assigned_dev->dev);
186 } else { 152 } else {
187 /* Deal with MSI and INTx */ 153 /* Deal with MSI and INTx */
188 disable_irq_nosync(assigned_dev->host_irq); 154 disable_irq(assigned_dev->host_irq);
189 cancel_work_sync(&assigned_dev->interrupt_work);
190 155
191 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 156 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
192 157
@@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
232{ 197{
233 kvm_free_assigned_irq(kvm, assigned_dev); 198 kvm_free_assigned_irq(kvm, assigned_dev);
234 199
235 pci_reset_function(assigned_dev->dev); 200 __pci_reset_function(assigned_dev->dev);
201 pci_restore_state(assigned_dev->dev);
236 202
237 pci_release_regions(assigned_dev->dev); 203 pci_release_regions(assigned_dev->dev);
238 pci_disable_device(assigned_dev->dev); 204 pci_disable_device(assigned_dev->dev);
@@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
265 * on the same interrupt line is not a happy situation: there 231 * on the same interrupt line is not a happy situation: there
266 * are going to be long delays in accepting, acking, etc. 232 * are going to be long delays in accepting, acking, etc.
267 */ 233 */
268 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 234 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
269 0, "kvm_assigned_intx_device", (void *)dev)) 235 IRQF_ONESHOT, dev->irq_name, (void *)dev))
270 return -EIO; 236 return -EIO;
271 return 0; 237 return 0;
272} 238}
@@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
284 } 250 }
285 251
286 dev->host_irq = dev->dev->irq; 252 dev->host_irq = dev->dev->irq;
287 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, 253 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
288 "kvm_assigned_msi_device", (void *)dev)) { 254 0, dev->irq_name, (void *)dev)) {
289 pci_disable_msi(dev->dev); 255 pci_disable_msi(dev->dev);
290 return -EIO; 256 return -EIO;
291 } 257 }
@@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
310 return r; 276 return r;
311 277
312 for (i = 0; i < dev->entries_nr; i++) { 278 for (i = 0; i < dev->entries_nr; i++) {
313 r = request_irq(dev->host_msix_entries[i].vector, 279 r = request_threaded_irq(dev->host_msix_entries[i].vector,
314 kvm_assigned_dev_intr, 0, 280 NULL, kvm_assigned_dev_thread,
315 "kvm_assigned_msix_device", 281 0, dev->irq_name, (void *)dev);
316 (void *)dev);
317 if (r) 282 if (r)
318 goto err; 283 goto err;
319 } 284 }
@@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm,
370 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) 335 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
371 return r; 336 return r;
372 337
338 snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
339 pci_name(dev->dev));
340
373 switch (host_irq_type) { 341 switch (host_irq_type) {
374 case KVM_DEV_IRQ_HOST_INTX: 342 case KVM_DEV_IRQ_HOST_INTX:
375 r = assigned_device_enable_host_intx(kvm, dev); 343 r = assigned_device_enable_host_intx(kvm, dev);
@@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
547 } 515 }
548 516
549 pci_reset_function(dev); 517 pci_reset_function(dev);
518 pci_save_state(dev);
550 519
551 match->assigned_dev_id = assigned_dev->assigned_dev_id; 520 match->assigned_dev_id = assigned_dev->assigned_dev_id;
552 match->host_segnr = assigned_dev->segnr; 521 match->host_segnr = assigned_dev->segnr;
@@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
554 match->host_devfn = assigned_dev->devfn; 523 match->host_devfn = assigned_dev->devfn;
555 match->flags = assigned_dev->flags; 524 match->flags = assigned_dev->flags;
556 match->dev = dev; 525 match->dev = dev;
557 spin_lock_init(&match->assigned_dev_lock); 526 spin_lock_init(&match->intx_lock);
558 match->irq_source_id = -1; 527 match->irq_source_id = -1;
559 match->kvm = kvm; 528 match->kvm = kvm;
560 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 529 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
561 INIT_WORK(&match->interrupt_work,
562 kvm_assigned_dev_interrupt_work_handler);
563 530
564 list_add(&match->list, &kvm->arch.assigned_dev_head); 531 list_add(&match->list, &kvm->arch.assigned_dev_head);
565 532
@@ -579,6 +546,7 @@ out:
579 mutex_unlock(&kvm->lock); 546 mutex_unlock(&kvm->lock);
580 return r; 547 return r;
581out_list_del: 548out_list_del:
549 pci_restore_state(dev);
582 list_del(&match->list); 550 list_del(&match->list);
583 pci_release_regions(dev); 551 pci_release_regions(dev);
584out_disable: 552out_disable:
@@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
651 r = -ENOMEM; 619 r = -ENOMEM;
652 goto msix_nr_out; 620 goto msix_nr_out;
653 } 621 }
654 adev->guest_msix_entries = kzalloc( 622 adev->guest_msix_entries =
655 sizeof(struct kvm_guest_msix_entry) * 623 kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
656 entry_nr->entry_nr, GFP_KERNEL); 624 GFP_KERNEL);
657 if (!adev->guest_msix_entries) { 625 if (!adev->guest_msix_entries) {
658 kfree(adev->host_msix_entries); 626 kfree(adev->host_msix_entries);
659 r = -ENOMEM; 627 r = -ENOMEM;
@@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
706 unsigned long arg) 674 unsigned long arg)
707{ 675{
708 void __user *argp = (void __user *)arg; 676 void __user *argp = (void __user *)arg;
709 int r = -ENOTTY; 677 int r;
710 678
711 switch (ioctl) { 679 switch (ioctl) {
712 case KVM_ASSIGN_PCI_DEVICE: { 680 case KVM_ASSIGN_PCI_DEVICE: {
@@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
724 r = -EOPNOTSUPP; 692 r = -EOPNOTSUPP;
725 break; 693 break;
726 } 694 }
727#ifdef KVM_CAP_ASSIGN_DEV_IRQ
728 case KVM_ASSIGN_DEV_IRQ: { 695 case KVM_ASSIGN_DEV_IRQ: {
729 struct kvm_assigned_irq assigned_irq; 696 struct kvm_assigned_irq assigned_irq;
730 697
@@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
747 goto out; 714 goto out;
748 break; 715 break;
749 } 716 }
750#endif
751#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
752 case KVM_DEASSIGN_PCI_DEVICE: { 717 case KVM_DEASSIGN_PCI_DEVICE: {
753 struct kvm_assigned_pci_dev assigned_dev; 718 struct kvm_assigned_pci_dev assigned_dev;
754 719
@@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
760 goto out; 725 goto out;
761 break; 726 break;
762 } 727 }
763#endif
764#ifdef KVM_CAP_IRQ_ROUTING 728#ifdef KVM_CAP_IRQ_ROUTING
765 case KVM_SET_GSI_ROUTING: { 729 case KVM_SET_GSI_ROUTING: {
766 struct kvm_irq_routing routing; 730 struct kvm_irq_routing routing;
@@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
813 break; 777 break;
814 } 778 }
815#endif 779#endif
780 default:
781 r = -ENOTTY;
782 break;
816 } 783 }
817out: 784out:
818 return r; 785 return r;
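
Note: the assigned-dev.c changes above replace the old interrupt_work/schedule_work() bottom half with a threaded interrupt handler registered via request_threaded_irq(). A minimal sketch of that pattern follows; the my_dev structure and my_dev_* names are hypothetical and only the kernel APIs (request_threaded_irq, disable_irq_nosync, IRQF_ONESHOT) are real.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

struct my_dev {
	spinlock_t intx_lock;
	bool host_irq_disabled;
	char irq_name[32];
};

/* Runs in process context, so it may sleep; mirrors kvm_assigned_dev_thread(). */
static irqreturn_t my_dev_thread(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;

	spin_lock(&dev->intx_lock);
	disable_irq_nosync(irq);	/* keep a shared INTx line quiet until acked */
	dev->host_irq_disabled = true;
	spin_unlock(&dev->intx_lock);

	/* ... deliver the event to the consumer (e.g. kvm_set_irq()) ... */
	return IRQ_HANDLED;
}

static int my_dev_request(struct my_dev *dev, unsigned int irq)
{
	/* NULL primary handler + IRQF_ONESHOT: the line stays masked until the
	 * threaded handler returns, which the INTx path above depends on. */
	return request_threaded_irq(irq, NULL, my_dev_thread,
				    IRQF_ONESHOT, dev->irq_name, dev);
}
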
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644
index 000000000000..74268b4c2ee1
--- /dev/null
+++ b/virt/kvm/async_pf.c
@@ -0,0 +1,216 @@
1/*
2 * kvm asynchronous fault support
3 *
4 * Copyright 2010 Red Hat, Inc.
5 *
6 * Author:
7 * Gleb Natapov <gleb@redhat.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#include <linux/kvm_host.h>
24#include <linux/slab.h>
25#include <linux/module.h>
26#include <linux/mmu_context.h>
27
28#include "async_pf.h"
29#include <trace/events/kvm.h>
30
31static struct kmem_cache *async_pf_cache;
32
33int kvm_async_pf_init(void)
34{
35 async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
36
37 if (!async_pf_cache)
38 return -ENOMEM;
39
40 return 0;
41}
42
43void kvm_async_pf_deinit(void)
44{
45 if (async_pf_cache)
46 kmem_cache_destroy(async_pf_cache);
47 async_pf_cache = NULL;
48}
49
50void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
51{
52 INIT_LIST_HEAD(&vcpu->async_pf.done);
53 INIT_LIST_HEAD(&vcpu->async_pf.queue);
54 spin_lock_init(&vcpu->async_pf.lock);
55}
56
57static void async_pf_execute(struct work_struct *work)
58{
59 struct page *page = NULL;
60 struct kvm_async_pf *apf =
61 container_of(work, struct kvm_async_pf, work);
62 struct mm_struct *mm = apf->mm;
63 struct kvm_vcpu *vcpu = apf->vcpu;
64 unsigned long addr = apf->addr;
65 gva_t gva = apf->gva;
66
67 might_sleep();
68
69 use_mm(mm);
70 down_read(&mm->mmap_sem);
71 get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
72 up_read(&mm->mmap_sem);
73 unuse_mm(mm);
74
75 spin_lock(&vcpu->async_pf.lock);
76 list_add_tail(&apf->link, &vcpu->async_pf.done);
77 apf->page = page;
78 apf->done = true;
79 spin_unlock(&vcpu->async_pf.lock);
80
81 /*
82 * apf may be freed by kvm_check_async_pf_completion() after
83 * this point
84 */
85
86 trace_kvm_async_pf_completed(addr, page, gva);
87
88 if (waitqueue_active(&vcpu->wq))
89 wake_up_interruptible(&vcpu->wq);
90
91 mmdrop(mm);
92 kvm_put_kvm(vcpu->kvm);
93}
94
95void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
96{
97 /* cancel outstanding work queue item */
98 while (!list_empty(&vcpu->async_pf.queue)) {
99 struct kvm_async_pf *work =
100 list_entry(vcpu->async_pf.queue.next,
101 typeof(*work), queue);
102 cancel_work_sync(&work->work);
103 list_del(&work->queue);
104 if (!work->done) /* work was canceled */
105 kmem_cache_free(async_pf_cache, work);
106 }
107
108 spin_lock(&vcpu->async_pf.lock);
109 while (!list_empty(&vcpu->async_pf.done)) {
110 struct kvm_async_pf *work =
111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link);
113 list_del(&work->link);
114 if (work->page)
115 put_page(work->page);
116 kmem_cache_free(async_pf_cache, work);
117 }
118 spin_unlock(&vcpu->async_pf.lock);
119
120 vcpu->async_pf.queued = 0;
121}
122
123void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
124{
125 struct kvm_async_pf *work;
126
127 while (!list_empty_careful(&vcpu->async_pf.done) &&
128 kvm_arch_can_inject_async_page_present(vcpu)) {
129 spin_lock(&vcpu->async_pf.lock);
130 work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
131 link);
132 list_del(&work->link);
133 spin_unlock(&vcpu->async_pf.lock);
134
135 if (work->page)
136 kvm_arch_async_page_ready(vcpu, work);
137 kvm_arch_async_page_present(vcpu, work);
138
139 list_del(&work->queue);
140 vcpu->async_pf.queued--;
141 if (work->page)
142 put_page(work->page);
143 kmem_cache_free(async_pf_cache, work);
144 }
145}
146
147int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
148 struct kvm_arch_async_pf *arch)
149{
150 struct kvm_async_pf *work;
151
152 if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
153 return 0;
154
155 /* setup delayed work */
156
157 /*
158 * do alloc nowait since if we are going to sleep anyway we
159 * may as well sleep faulting in page
160 */
161 work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
162 if (!work)
163 return 0;
164
165 work->page = NULL;
166 work->done = false;
167 work->vcpu = vcpu;
168 work->gva = gva;
169 work->addr = gfn_to_hva(vcpu->kvm, gfn);
170 work->arch = *arch;
171 work->mm = current->mm;
172 atomic_inc(&work->mm->mm_count);
173 kvm_get_kvm(work->vcpu->kvm);
174
175 /* this can't really happen otherwise gfn_to_pfn_async
176 would succeed */
177 if (unlikely(kvm_is_error_hva(work->addr)))
178 goto retry_sync;
179
180 INIT_WORK(&work->work, async_pf_execute);
181 if (!schedule_work(&work->work))
182 goto retry_sync;
183
184 list_add_tail(&work->queue, &vcpu->async_pf.queue);
185 vcpu->async_pf.queued++;
186 kvm_arch_async_page_not_present(vcpu, work);
187 return 1;
188retry_sync:
189 kvm_put_kvm(work->vcpu->kvm);
190 mmdrop(work->mm);
191 kmem_cache_free(async_pf_cache, work);
192 return 0;
193}
194
195int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
196{
197 struct kvm_async_pf *work;
198
199 if (!list_empty_careful(&vcpu->async_pf.done))
200 return 0;
201
202 work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
203 if (!work)
204 return -ENOMEM;
205
206 work->page = bad_page;
207 get_page(bad_page);
208 INIT_LIST_HEAD(&work->queue); /* for list_del to work */
209
210 spin_lock(&vcpu->async_pf.lock);
211 list_add_tail(&work->link, &vcpu->async_pf.done);
212 spin_unlock(&vcpu->async_pf.lock);
213
214 vcpu->async_pf.queued++;
215 return 0;
216}
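
Note: async_pf.c above is the generic half of asynchronous page faults: kvm_setup_async_pf() queues a work item that faults the page in via use_mm()/get_user_pages(), and kvm_check_async_pf_completion() later reinjects completed faults. A hedged sketch of how architecture code is expected to drive it; the arch_* function names and policy below are invented, only the kvm_*async_pf* calls come from this file.

#include <linux/kvm_host.h>

/* Called from a (hypothetical) arch fault path when the backing page is not
 * resident and resolving it synchronously would block the vcpu. */
static int arch_try_async_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
				struct kvm_arch_async_pf *arch)
{
	/* kvm_setup_async_pf() returns 1 if the fault was queued; the guest can
	 * be told the page is "not present" and keep running other tasks. */
	if (kvm_setup_async_pf(vcpu, gva, gfn, arch))
		return 0;

	return -EAGAIN;	/* queue full or allocation failed: take the slow path */
}

/* Called from the (hypothetical) vcpu run loop between guest entries. */
static void arch_service_async_faults(struct kvm_vcpu *vcpu)
{
	kvm_check_async_pf_completion(vcpu);
}
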
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644
index 000000000000..e7ef6447cb82
--- /dev/null
+++ b/virt/kvm/async_pf.h
@@ -0,0 +1,36 @@
1/*
2 * kvm asynchronous fault support
3 *
4 * Copyright 2010 Red Hat, Inc.
5 *
6 * Author:
7 * Gleb Natapov <gleb@redhat.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#ifndef __KVM_ASYNC_PF_H__
24#define __KVM_ASYNC_PF_H__
25
26#ifdef CONFIG_KVM_ASYNC_PF
27int kvm_async_pf_init(void);
28void kvm_async_pf_deinit(void);
29void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
30#else
31#define kvm_async_pf_init() (0)
32#define kvm_async_pf_deinit() do{}while(0)
33#define kvm_async_pf_vcpu_init(C) do{}while(0)
34#endif
35
36#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c1f1e3c62984..2ca4535f4fb7 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -44,14 +44,19 @@
44 */ 44 */
45 45
46struct _irqfd { 46struct _irqfd {
47 struct kvm *kvm; 47 /* Used for MSI fast-path */
48 struct eventfd_ctx *eventfd; 48 struct kvm *kvm;
49 int gsi; 49 wait_queue_t wait;
50 struct list_head list; 50 /* Update side is protected by irqfds.lock */
51 poll_table pt; 51 struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
52 wait_queue_t wait; 52 /* Used for level IRQ fast-path */
53 struct work_struct inject; 53 int gsi;
54 struct work_struct shutdown; 54 struct work_struct inject;
55 /* Used for setup/shutdown */
56 struct eventfd_ctx *eventfd;
57 struct list_head list;
58 poll_table pt;
59 struct work_struct shutdown;
55}; 60};
56 61
57static struct workqueue_struct *irqfd_cleanup_wq; 62static struct workqueue_struct *irqfd_cleanup_wq;
@@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
125{ 130{
126 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); 131 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
127 unsigned long flags = (unsigned long)key; 132 unsigned long flags = (unsigned long)key;
133 struct kvm_kernel_irq_routing_entry *irq;
134 struct kvm *kvm = irqfd->kvm;
128 135
129 if (flags & POLLIN) 136 if (flags & POLLIN) {
137 rcu_read_lock();
138 irq = rcu_dereference(irqfd->irq_entry);
130 /* An event has been signaled, inject an interrupt */ 139 /* An event has been signaled, inject an interrupt */
131 schedule_work(&irqfd->inject); 140 if (irq)
141 kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
142 else
143 schedule_work(&irqfd->inject);
144 rcu_read_unlock();
145 }
132 146
133 if (flags & POLLHUP) { 147 if (flags & POLLHUP) {
134 /* The eventfd is closing, detach from KVM */ 148 /* The eventfd is closing, detach from KVM */
135 struct kvm *kvm = irqfd->kvm;
136 unsigned long flags; 149 unsigned long flags;
137 150
138 spin_lock_irqsave(&kvm->irqfds.lock, flags); 151 spin_lock_irqsave(&kvm->irqfds.lock, flags);
@@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
163 add_wait_queue(wqh, &irqfd->wait); 176 add_wait_queue(wqh, &irqfd->wait);
164} 177}
165 178
179/* Must be called under irqfds.lock */
180static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
181 struct kvm_irq_routing_table *irq_rt)
182{
183 struct kvm_kernel_irq_routing_entry *e;
184 struct hlist_node *n;
185
186 if (irqfd->gsi >= irq_rt->nr_rt_entries) {
187 rcu_assign_pointer(irqfd->irq_entry, NULL);
188 return;
189 }
190
191 hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
192 /* Only fast-path MSI. */
193 if (e->type == KVM_IRQ_ROUTING_MSI)
194 rcu_assign_pointer(irqfd->irq_entry, e);
195 else
196 rcu_assign_pointer(irqfd->irq_entry, NULL);
197 }
198}
199
166static int 200static int
167kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) 201kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
168{ 202{
203 struct kvm_irq_routing_table *irq_rt;
169 struct _irqfd *irqfd, *tmp; 204 struct _irqfd *irqfd, *tmp;
170 struct file *file = NULL; 205 struct file *file = NULL;
171 struct eventfd_ctx *eventfd = NULL; 206 struct eventfd_ctx *eventfd = NULL;
@@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
215 goto fail; 250 goto fail;
216 } 251 }
217 252
253 irq_rt = rcu_dereference_protected(kvm->irq_routing,
254 lockdep_is_held(&kvm->irqfds.lock));
255 irqfd_update(kvm, irqfd, irq_rt);
256
218 events = file->f_op->poll(file, &irqfd->pt); 257 events = file->f_op->poll(file, &irqfd->pt);
219 258
220 list_add_tail(&irqfd->list, &kvm->irqfds.items); 259 list_add_tail(&irqfd->list, &kvm->irqfds.items);
@@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
271 spin_lock_irq(&kvm->irqfds.lock); 310 spin_lock_irq(&kvm->irqfds.lock);
272 311
273 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 312 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
274 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) 313 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
314 /*
315 * This rcu_assign_pointer is needed for when
316 * another thread calls kvm_irqfd_update before
317 * we flush workqueue below.
318 * It is paired with synchronize_rcu done by caller
319 * of that function.
320 */
321 rcu_assign_pointer(irqfd->irq_entry, NULL);
275 irqfd_deactivate(irqfd); 322 irqfd_deactivate(irqfd);
323 }
276 } 324 }
277 325
278 spin_unlock_irq(&kvm->irqfds.lock); 326 spin_unlock_irq(&kvm->irqfds.lock);
@@ -322,6 +370,25 @@ kvm_irqfd_release(struct kvm *kvm)
322} 370}
323 371
324/* 372/*
373 * Change irq_routing and irqfd.
374 * Caller must invoke synchronize_rcu afterwards.
375 */
376void kvm_irq_routing_update(struct kvm *kvm,
377 struct kvm_irq_routing_table *irq_rt)
378{
379 struct _irqfd *irqfd;
380
381 spin_lock_irq(&kvm->irqfds.lock);
382
383 rcu_assign_pointer(kvm->irq_routing, irq_rt);
384
385 list_for_each_entry(irqfd, &kvm->irqfds.items, list)
386 irqfd_update(kvm, irqfd, irq_rt);
387
388 spin_unlock_irq(&kvm->irqfds.lock);
389}
390
391/*
325 * create a host-wide workqueue for issuing deferred shutdown requests 392 * create a host-wide workqueue for issuing deferred shutdown requests
326 * aggregated from all vm* instances. We need our own isolated single-thread 393 * aggregated from all vm* instances. We need our own isolated single-thread
327 * queue to prevent deadlock against flushing the normal work-queue. 394 * queue to prevent deadlock against flushing the normal work-queue.
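
Note: the eventfd.c changes add an MSI fast path: irqfd_wakeup() dereferences an RCU-protected routing entry and injects the MSI directly, falling back to the workqueue when no entry is cached, while the update side publishes entries under irqfds.lock and relies on the caller's synchronize_rcu() before the old routing table is freed. A generic sketch of that read/update pairing; the cfg structure and names are illustrative, not taken from the patch.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg { int gsi; };
static struct cfg __rcu *cur_cfg;

static void reader(void)
{
	struct cfg *c;

	rcu_read_lock();
	c = rcu_dereference(cur_cfg);	/* may be NULL: take the slow path */
	if (c)
		pr_info("fast path for gsi %d\n", c->gsi);
	rcu_read_unlock();
}

static void publish(struct cfg *newc)
{
	struct cfg *old = rcu_dereference_protected(cur_cfg, 1);

	rcu_assign_pointer(cur_cfg, newc);	/* readers now see newc */
	synchronize_rcu();			/* wait out readers of old */
	kfree(old);
}
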
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 8edca9141b78..9f614b4e365f 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
114 return r; 114 return r;
115} 115}
116 116
117static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 117int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
118 struct kvm *kvm, int irq_source_id, int level) 118 struct kvm *kvm, int irq_source_id, int level)
119{ 119{
120 struct kvm_lapic_irq irq; 120 struct kvm_lapic_irq irq;
121 121
@@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
409 409
410 mutex_lock(&kvm->irq_lock); 410 mutex_lock(&kvm->irq_lock);
411 old = kvm->irq_routing; 411 old = kvm->irq_routing;
412 rcu_assign_pointer(kvm->irq_routing, new); 412 kvm_irq_routing_update(kvm, new);
413 mutex_unlock(&kvm->irq_lock); 413 mutex_unlock(&kvm->irq_lock);
414
414 synchronize_rcu(); 415 synchronize_rcu();
415 416
416 new = old; 417 new = old;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5225052aebc1..f29abeb6a912 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -55,6 +55,7 @@
55#include <asm-generic/bitops/le.h> 55#include <asm-generic/bitops/le.h>
56 56
57#include "coalesced_mmio.h" 57#include "coalesced_mmio.h"
58#include "async_pf.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/kvm.h> 61#include <trace/events/kvm.h>
@@ -89,7 +90,8 @@ static void hardware_disable_all(void);
89 90
90static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 91static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
91 92
92static bool kvm_rebooting; 93bool kvm_rebooting;
94EXPORT_SYMBOL_GPL(kvm_rebooting);
93 95
94static bool largepages_enabled = true; 96static bool largepages_enabled = true;
95 97
@@ -102,8 +104,26 @@ static pfn_t fault_pfn;
102inline int kvm_is_mmio_pfn(pfn_t pfn) 104inline int kvm_is_mmio_pfn(pfn_t pfn)
103{ 105{
104 if (pfn_valid(pfn)) { 106 if (pfn_valid(pfn)) {
105 struct page *page = compound_head(pfn_to_page(pfn)); 107 int reserved;
106 return PageReserved(page); 108 struct page *tail = pfn_to_page(pfn);
109 struct page *head = compound_trans_head(tail);
110 reserved = PageReserved(head);
111 if (head != tail) {
112 /*
113 * "head" is not a dangling pointer
114 * (compound_trans_head takes care of that)
115 * but the hugepage may have been splitted
116 * from under us (and we may not hold a
117 * reference count on the head page so it can
118 * be reused before we run PageReferenced), so
119 * we've to check PageTail before returning
120 * what we just read.
121 */
122 smp_rmb();
123 if (PageTail(tail))
124 return reserved;
125 }
126 return PageReserved(tail);
107 } 127 }
108 128
109 return true; 129 return true;
@@ -167,8 +187,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
167 187
168void kvm_flush_remote_tlbs(struct kvm *kvm) 188void kvm_flush_remote_tlbs(struct kvm *kvm)
169{ 189{
190 int dirty_count = kvm->tlbs_dirty;
191
192 smp_mb();
170 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 193 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
171 ++kvm->stat.remote_tlb_flush; 194 ++kvm->stat.remote_tlb_flush;
195 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
172} 196}
173 197
174void kvm_reload_remote_mmus(struct kvm *kvm) 198void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -186,6 +210,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
186 vcpu->kvm = kvm; 210 vcpu->kvm = kvm;
187 vcpu->vcpu_id = id; 211 vcpu->vcpu_id = id;
188 init_waitqueue_head(&vcpu->wq); 212 init_waitqueue_head(&vcpu->wq);
213 kvm_async_pf_vcpu_init(vcpu);
189 214
190 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 215 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
191 if (!page) { 216 if (!page) {
@@ -247,7 +272,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
247 idx = srcu_read_lock(&kvm->srcu); 272 idx = srcu_read_lock(&kvm->srcu);
248 spin_lock(&kvm->mmu_lock); 273 spin_lock(&kvm->mmu_lock);
249 kvm->mmu_notifier_seq++; 274 kvm->mmu_notifier_seq++;
250 need_tlb_flush = kvm_unmap_hva(kvm, address); 275 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
251 spin_unlock(&kvm->mmu_lock); 276 spin_unlock(&kvm->mmu_lock);
252 srcu_read_unlock(&kvm->srcu, idx); 277 srcu_read_unlock(&kvm->srcu, idx);
253 278
@@ -291,6 +316,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
291 kvm->mmu_notifier_count++; 316 kvm->mmu_notifier_count++;
292 for (; start < end; start += PAGE_SIZE) 317 for (; start < end; start += PAGE_SIZE)
293 need_tlb_flush |= kvm_unmap_hva(kvm, start); 318 need_tlb_flush |= kvm_unmap_hva(kvm, start);
319 need_tlb_flush |= kvm->tlbs_dirty;
294 spin_unlock(&kvm->mmu_lock); 320 spin_unlock(&kvm->mmu_lock);
295 srcu_read_unlock(&kvm->srcu, idx); 321 srcu_read_unlock(&kvm->srcu, idx);
296 322
@@ -344,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
344 return young; 370 return young;
345} 371}
346 372
373static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
374 struct mm_struct *mm,
375 unsigned long address)
376{
377 struct kvm *kvm = mmu_notifier_to_kvm(mn);
378 int young, idx;
379
380 idx = srcu_read_lock(&kvm->srcu);
381 spin_lock(&kvm->mmu_lock);
382 young = kvm_test_age_hva(kvm, address);
383 spin_unlock(&kvm->mmu_lock);
384 srcu_read_unlock(&kvm->srcu, idx);
385
386 return young;
387}
388
347static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 389static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
348 struct mm_struct *mm) 390 struct mm_struct *mm)
349{ 391{
@@ -360,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
360 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 402 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
361 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 403 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
362 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 404 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
405 .test_young = kvm_mmu_notifier_test_young,
363 .change_pte = kvm_mmu_notifier_change_pte, 406 .change_pte = kvm_mmu_notifier_change_pte,
364 .release = kvm_mmu_notifier_release, 407 .release = kvm_mmu_notifier_release,
365}; 408};
@@ -381,11 +424,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
381 424
382static struct kvm *kvm_create_vm(void) 425static struct kvm *kvm_create_vm(void)
383{ 426{
384 int r = 0, i; 427 int r, i;
385 struct kvm *kvm = kvm_arch_create_vm(); 428 struct kvm *kvm = kvm_arch_alloc_vm();
386 429
387 if (IS_ERR(kvm)) 430 if (!kvm)
388 goto out; 431 return ERR_PTR(-ENOMEM);
432
433 r = kvm_arch_init_vm(kvm);
434 if (r)
435 goto out_err_nodisable;
389 436
390 r = hardware_enable_all(); 437 r = hardware_enable_all();
391 if (r) 438 if (r)
@@ -399,23 +446,19 @@ static struct kvm *kvm_create_vm(void)
399 r = -ENOMEM; 446 r = -ENOMEM;
400 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 447 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
401 if (!kvm->memslots) 448 if (!kvm->memslots)
402 goto out_err; 449 goto out_err_nosrcu;
403 if (init_srcu_struct(&kvm->srcu)) 450 if (init_srcu_struct(&kvm->srcu))
404 goto out_err; 451 goto out_err_nosrcu;
405 for (i = 0; i < KVM_NR_BUSES; i++) { 452 for (i = 0; i < KVM_NR_BUSES; i++) {
406 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 453 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
407 GFP_KERNEL); 454 GFP_KERNEL);
408 if (!kvm->buses[i]) { 455 if (!kvm->buses[i])
409 cleanup_srcu_struct(&kvm->srcu);
410 goto out_err; 456 goto out_err;
411 }
412 } 457 }
413 458
414 r = kvm_init_mmu_notifier(kvm); 459 r = kvm_init_mmu_notifier(kvm);
415 if (r) { 460 if (r)
416 cleanup_srcu_struct(&kvm->srcu);
417 goto out_err; 461 goto out_err;
418 }
419 462
420 kvm->mm = current->mm; 463 kvm->mm = current->mm;
421 atomic_inc(&kvm->mm->mm_count); 464 atomic_inc(&kvm->mm->mm_count);
@@ -429,19 +472,35 @@ static struct kvm *kvm_create_vm(void)
429 spin_lock(&kvm_lock); 472 spin_lock(&kvm_lock);
430 list_add(&kvm->vm_list, &vm_list); 473 list_add(&kvm->vm_list, &vm_list);
431 spin_unlock(&kvm_lock); 474 spin_unlock(&kvm_lock);
432out: 475
433 return kvm; 476 return kvm;
434 477
435out_err: 478out_err:
479 cleanup_srcu_struct(&kvm->srcu);
480out_err_nosrcu:
436 hardware_disable_all(); 481 hardware_disable_all();
437out_err_nodisable: 482out_err_nodisable:
438 for (i = 0; i < KVM_NR_BUSES; i++) 483 for (i = 0; i < KVM_NR_BUSES; i++)
439 kfree(kvm->buses[i]); 484 kfree(kvm->buses[i]);
440 kfree(kvm->memslots); 485 kfree(kvm->memslots);
441 kfree(kvm); 486 kvm_arch_free_vm(kvm);
442 return ERR_PTR(r); 487 return ERR_PTR(r);
443} 488}
444 489
490static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
491{
492 if (!memslot->dirty_bitmap)
493 return;
494
495 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
496 vfree(memslot->dirty_bitmap_head);
497 else
498 kfree(memslot->dirty_bitmap_head);
499
500 memslot->dirty_bitmap = NULL;
501 memslot->dirty_bitmap_head = NULL;
502}
503
445/* 504/*
446 * Free any memory in @free but not in @dont. 505 * Free any memory in @free but not in @dont.
447 */ 506 */
@@ -454,7 +513,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
454 vfree(free->rmap); 513 vfree(free->rmap);
455 514
456 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 515 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
457 vfree(free->dirty_bitmap); 516 kvm_destroy_dirty_bitmap(free);
458 517
459 518
460 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 519 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@@ -465,7 +524,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
465 } 524 }
466 525
467 free->npages = 0; 526 free->npages = 0;
468 free->dirty_bitmap = NULL;
469 free->rmap = NULL; 527 free->rmap = NULL;
470} 528}
471 529
@@ -499,6 +557,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
499 kvm_arch_flush_shadow(kvm); 557 kvm_arch_flush_shadow(kvm);
500#endif 558#endif
501 kvm_arch_destroy_vm(kvm); 559 kvm_arch_destroy_vm(kvm);
560 kvm_free_physmem(kvm);
561 cleanup_srcu_struct(&kvm->srcu);
562 kvm_arch_free_vm(kvm);
502 hardware_disable_all(); 563 hardware_disable_all();
503 mmdrop(mm); 564 mmdrop(mm);
504} 565}
@@ -528,6 +589,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
528} 589}
529 590
530/* 591/*
592 * Allocation size is twice as large as the actual dirty bitmap size.
593 * This makes it possible to do double buffering: see x86's
594 * kvm_vm_ioctl_get_dirty_log().
595 */
596static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
597{
598 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
599
600 if (dirty_bytes > PAGE_SIZE)
601 memslot->dirty_bitmap = vzalloc(dirty_bytes);
602 else
603 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
604
605 if (!memslot->dirty_bitmap)
606 return -ENOMEM;
607
608 memslot->dirty_bitmap_head = memslot->dirty_bitmap;
609 return 0;
610}
611
612/*
531 * Allocate some memory and give it an address in the guest physical address 613 * Allocate some memory and give it an address in the guest physical address
532 * space. 614 * space.
533 * 615 *
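
Note: the new kvm_create_dirty_bitmap() above deliberately allocates twice the bitmap size so the unused half can serve as a scratch buffer for double buffering; the actual consumer is x86's kvm_vm_ioctl_get_dirty_log(), which is not part of this diff. Purely as an illustration of the idea, and not the real implementation, the halves could be flipped like this:

/* Illustrative only: swap between the two halves of the allocation so vcpus
 * keep logging into a clean bitmap while the old one is copied to userspace. */
static unsigned long *swap_dirty_bitmap(struct kvm_memory_slot *slot)
{
	unsigned long bytes = kvm_dirty_bitmap_bytes(slot);
	unsigned long *active = slot->dirty_bitmap;
	unsigned long *spare = (active == slot->dirty_bitmap_head) ?
			       slot->dirty_bitmap_head + bytes / sizeof(unsigned long) :
			       slot->dirty_bitmap_head;

	memset(spare, 0, bytes);
	slot->dirty_bitmap = spare;	/* new dirty bits land in the clean half */
	return active;			/* caller copies this half out */
}
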
@@ -604,13 +686,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
604 /* Allocate if a slot is being created */ 686 /* Allocate if a slot is being created */
605#ifndef CONFIG_S390 687#ifndef CONFIG_S390
606 if (npages && !new.rmap) { 688 if (npages && !new.rmap) {
607 new.rmap = vmalloc(npages * sizeof(*new.rmap)); 689 new.rmap = vzalloc(npages * sizeof(*new.rmap));
608 690
609 if (!new.rmap) 691 if (!new.rmap)
610 goto out_free; 692 goto out_free;
611 693
612 memset(new.rmap, 0, npages * sizeof(*new.rmap));
613
614 new.user_alloc = user_alloc; 694 new.user_alloc = user_alloc;
615 new.userspace_addr = mem->userspace_addr; 695 new.userspace_addr = mem->userspace_addr;
616 } 696 }
@@ -633,14 +713,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
633 >> KVM_HPAGE_GFN_SHIFT(level)); 713 >> KVM_HPAGE_GFN_SHIFT(level));
634 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); 714 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
635 715
636 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 716 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
637 717
638 if (!new.lpage_info[i]) 718 if (!new.lpage_info[i])
639 goto out_free; 719 goto out_free;
640 720
641 memset(new.lpage_info[i], 0,
642 lpages * sizeof(*new.lpage_info[i]));
643
644 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 721 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
645 new.lpage_info[i][0].write_count = 1; 722 new.lpage_info[i][0].write_count = 1;
646 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 723 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
@@ -661,12 +738,8 @@ skip_lpage:
661 738
662 /* Allocate page dirty bitmap if needed */ 739 /* Allocate page dirty bitmap if needed */
663 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 740 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
664 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); 741 if (kvm_create_dirty_bitmap(&new) < 0)
665
666 new.dirty_bitmap = vmalloc(dirty_bytes);
667 if (!new.dirty_bitmap)
668 goto out_free; 742 goto out_free;
669 memset(new.dirty_bitmap, 0, dirty_bytes);
670 /* destroy any largepage mappings for dirty tracking */ 743 /* destroy any largepage mappings for dirty tracking */
671 if (old.npages) 744 if (old.npages)
672 flush_shadow = 1; 745 flush_shadow = 1;
@@ -685,6 +758,7 @@ skip_lpage:
685 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 758 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
686 if (mem->slot >= slots->nmemslots) 759 if (mem->slot >= slots->nmemslots)
687 slots->nmemslots = mem->slot + 1; 760 slots->nmemslots = mem->slot + 1;
761 slots->generation++;
688 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; 762 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
689 763
690 old_memslots = kvm->memslots; 764 old_memslots = kvm->memslots;
@@ -719,6 +793,7 @@ skip_lpage:
719 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 793 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
720 if (mem->slot >= slots->nmemslots) 794 if (mem->slot >= slots->nmemslots)
721 slots->nmemslots = mem->slot + 1; 795 slots->nmemslots = mem->slot + 1;
796 slots->generation++;
722 797
723 /* actual memory is freed via old in kvm_free_physmem_slot below */ 798 /* actual memory is freed via old in kvm_free_physmem_slot below */
724 if (!npages) { 799 if (!npages) {
@@ -849,10 +924,10 @@ int kvm_is_error_hva(unsigned long addr)
849} 924}
850EXPORT_SYMBOL_GPL(kvm_is_error_hva); 925EXPORT_SYMBOL_GPL(kvm_is_error_hva);
851 926
852struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 927static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
928 gfn_t gfn)
853{ 929{
854 int i; 930 int i;
855 struct kvm_memslots *slots = kvm_memslots(kvm);
856 931
857 for (i = 0; i < slots->nmemslots; ++i) { 932 for (i = 0; i < slots->nmemslots; ++i) {
858 struct kvm_memory_slot *memslot = &slots->memslots[i]; 933 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -863,6 +938,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
863 } 938 }
864 return NULL; 939 return NULL;
865} 940}
941
942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
943{
944 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
945}
866EXPORT_SYMBOL_GPL(gfn_to_memslot); 946EXPORT_SYMBOL_GPL(gfn_to_memslot);
867 947
868int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 948int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -925,12 +1005,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
925 return memslot - slots->memslots; 1005 return memslot - slots->memslots;
926} 1006}
927 1007
928static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, 1008static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
929 gfn_t *nr_pages) 1009 gfn_t *nr_pages)
930{ 1010{
931 struct kvm_memory_slot *slot;
932
933 slot = gfn_to_memslot(kvm, gfn);
934 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1011 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
935 return bad_hva(); 1012 return bad_hva();
936 1013
@@ -942,28 +1019,61 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
942 1019
943unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1020unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
944{ 1021{
945 return gfn_to_hva_many(kvm, gfn, NULL); 1022 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
946} 1023}
947EXPORT_SYMBOL_GPL(gfn_to_hva); 1024EXPORT_SYMBOL_GPL(gfn_to_hva);
948 1025
949static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) 1026static pfn_t get_fault_pfn(void)
1027{
1028 get_page(fault_page);
1029 return fault_pfn;
1030}
1031
1032static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1033 bool *async, bool write_fault, bool *writable)
950{ 1034{
951 struct page *page[1]; 1035 struct page *page[1];
952 int npages; 1036 int npages = 0;
953 pfn_t pfn; 1037 pfn_t pfn;
954 1038
955 if (atomic) 1039 /* we can do it either atomically or asynchronously, not both */
1040 BUG_ON(atomic && async);
1041
1042 BUG_ON(!write_fault && !writable);
1043
1044 if (writable)
1045 *writable = true;
1046
1047 if (atomic || async)
956 npages = __get_user_pages_fast(addr, 1, 1, page); 1048 npages = __get_user_pages_fast(addr, 1, 1, page);
957 else { 1049
1050 if (unlikely(npages != 1) && !atomic) {
958 might_sleep(); 1051 might_sleep();
959 npages = get_user_pages_fast(addr, 1, 1, page); 1052
1053 if (writable)
1054 *writable = write_fault;
1055
1056 npages = get_user_pages_fast(addr, 1, write_fault, page);
1057
1058 /* map read fault as writable if possible */
1059 if (unlikely(!write_fault) && npages == 1) {
1060 struct page *wpage[1];
1061
1062 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1063 if (npages == 1) {
1064 *writable = true;
1065 put_page(page[0]);
1066 page[0] = wpage[0];
1067 }
1068 npages = 1;
1069 }
960 } 1070 }
961 1071
962 if (unlikely(npages != 1)) { 1072 if (unlikely(npages != 1)) {
963 struct vm_area_struct *vma; 1073 struct vm_area_struct *vma;
964 1074
965 if (atomic) 1075 if (atomic)
966 goto return_fault_page; 1076 return get_fault_pfn();
967 1077
968 down_read(&current->mm->mmap_sem); 1078 down_read(&current->mm->mmap_sem);
969 if (is_hwpoison_address(addr)) { 1079 if (is_hwpoison_address(addr)) {
@@ -972,19 +1082,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
972 return page_to_pfn(hwpoison_page); 1082 return page_to_pfn(hwpoison_page);
973 } 1083 }
974 1084
975 vma = find_vma(current->mm, addr); 1085 vma = find_vma_intersection(current->mm, addr, addr+1);
976 1086
977 if (vma == NULL || addr < vma->vm_start || 1087 if (vma == NULL)
978 !(vma->vm_flags & VM_PFNMAP)) { 1088 pfn = get_fault_pfn();
979 up_read(&current->mm->mmap_sem); 1089 else if ((vma->vm_flags & VM_PFNMAP)) {
980return_fault_page: 1090 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
981 get_page(fault_page); 1091 vma->vm_pgoff;
982 return page_to_pfn(fault_page); 1092 BUG_ON(!kvm_is_mmio_pfn(pfn));
1093 } else {
1094 if (async && (vma->vm_flags & VM_WRITE))
1095 *async = true;
1096 pfn = get_fault_pfn();
983 } 1097 }
984
985 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
986 up_read(&current->mm->mmap_sem); 1098 up_read(&current->mm->mmap_sem);
987 BUG_ON(!kvm_is_mmio_pfn(pfn));
988 } else 1099 } else
989 pfn = page_to_pfn(page[0]); 1100 pfn = page_to_pfn(page[0]);
990 1101
@@ -993,40 +1104,58 @@ return_fault_page:
993 1104
994pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1105pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
995{ 1106{
996 return hva_to_pfn(kvm, addr, true); 1107 return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
997} 1108}
998EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); 1109EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
999 1110
1000static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) 1111static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1112 bool write_fault, bool *writable)
1001{ 1113{
1002 unsigned long addr; 1114 unsigned long addr;
1003 1115
1116 if (async)
1117 *async = false;
1118
1004 addr = gfn_to_hva(kvm, gfn); 1119 addr = gfn_to_hva(kvm, gfn);
1005 if (kvm_is_error_hva(addr)) { 1120 if (kvm_is_error_hva(addr)) {
1006 get_page(bad_page); 1121 get_page(bad_page);
1007 return page_to_pfn(bad_page); 1122 return page_to_pfn(bad_page);
1008 } 1123 }
1009 1124
1010 return hva_to_pfn(kvm, addr, atomic); 1125 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1011} 1126}
1012 1127
1013pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1128pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1014{ 1129{
1015 return __gfn_to_pfn(kvm, gfn, true); 1130 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1016} 1131}
1017EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1132EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1018 1133
1134pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1135 bool write_fault, bool *writable)
1136{
1137 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1138}
1139EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1140
1019pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1141pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1020{ 1142{
1021 return __gfn_to_pfn(kvm, gfn, false); 1143 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1022} 1144}
1023EXPORT_SYMBOL_GPL(gfn_to_pfn); 1145EXPORT_SYMBOL_GPL(gfn_to_pfn);
1024 1146
1147pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1148 bool *writable)
1149{
1150 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1151}
1152EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1153
1025pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1154pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1026 struct kvm_memory_slot *slot, gfn_t gfn) 1155 struct kvm_memory_slot *slot, gfn_t gfn)
1027{ 1156{
1028 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1157 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1029 return hva_to_pfn(kvm, addr, false); 1158 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1030} 1159}
1031 1160
1032int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1161int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -1035,7 +1164,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1035 unsigned long addr; 1164 unsigned long addr;
1036 gfn_t entry; 1165 gfn_t entry;
1037 1166
1038 addr = gfn_to_hva_many(kvm, gfn, &entry); 1167 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1039 if (kvm_is_error_hva(addr)) 1168 if (kvm_is_error_hva(addr))
1040 return -1; 1169 return -1;
1041 1170
@@ -1219,9 +1348,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1219 return 0; 1348 return 0;
1220} 1349}
1221 1350
1351int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1352 gpa_t gpa)
1353{
1354 struct kvm_memslots *slots = kvm_memslots(kvm);
1355 int offset = offset_in_page(gpa);
1356 gfn_t gfn = gpa >> PAGE_SHIFT;
1357
1358 ghc->gpa = gpa;
1359 ghc->generation = slots->generation;
1360 ghc->memslot = __gfn_to_memslot(slots, gfn);
1361 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1362 if (!kvm_is_error_hva(ghc->hva))
1363 ghc->hva += offset;
1364 else
1365 return -EFAULT;
1366
1367 return 0;
1368}
1369EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1370
1371int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1372 void *data, unsigned long len)
1373{
1374 struct kvm_memslots *slots = kvm_memslots(kvm);
1375 int r;
1376
1377 if (slots->generation != ghc->generation)
1378 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1379
1380 if (kvm_is_error_hva(ghc->hva))
1381 return -EFAULT;
1382
1383 r = copy_to_user((void __user *)ghc->hva, data, len);
1384 if (r)
1385 return -EFAULT;
1386 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1387
1388 return 0;
1389}
1390EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1391
1222int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1392int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1223{ 1393{
1224 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1394 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1395 offset, len);
1225} 1396}
1226EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1397EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1227 1398
@@ -1244,11 +1415,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1244} 1415}
1245EXPORT_SYMBOL_GPL(kvm_clear_guest); 1416EXPORT_SYMBOL_GPL(kvm_clear_guest);
1246 1417
1247void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1418void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1419 gfn_t gfn)
1248{ 1420{
1249 struct kvm_memory_slot *memslot;
1250
1251 memslot = gfn_to_memslot(kvm, gfn);
1252 if (memslot && memslot->dirty_bitmap) { 1421 if (memslot && memslot->dirty_bitmap) {
1253 unsigned long rel_gfn = gfn - memslot->base_gfn; 1422 unsigned long rel_gfn = gfn - memslot->base_gfn;
1254 1423
@@ -1256,6 +1425,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1256 } 1425 }
1257} 1426}
1258 1427
1428void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1429{
1430 struct kvm_memory_slot *memslot;
1431
1432 memslot = gfn_to_memslot(kvm, gfn);
1433 mark_page_dirty_in_slot(kvm, memslot, gfn);
1434}
1435
1259/* 1436/*
1260 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1437 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1261 */ 1438 */
@@ -1457,6 +1634,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
1457 if (arg) 1634 if (arg)
1458 goto out; 1635 goto out;
1459 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1636 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1637 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1460 break; 1638 break;
1461 case KVM_GET_REGS: { 1639 case KVM_GET_REGS: {
1462 struct kvm_regs *kvm_regs; 1640 struct kvm_regs *kvm_regs;
@@ -1824,7 +2002,7 @@ static struct file_operations kvm_vm_fops = {
1824 2002
1825static int kvm_dev_ioctl_create_vm(void) 2003static int kvm_dev_ioctl_create_vm(void)
1826{ 2004{
1827 int fd, r; 2005 int r;
1828 struct kvm *kvm; 2006 struct kvm *kvm;
1829 2007
1830 kvm = kvm_create_vm(); 2008 kvm = kvm_create_vm();
@@ -1837,11 +2015,11 @@ static int kvm_dev_ioctl_create_vm(void)
1837 return r; 2015 return r;
1838 } 2016 }
1839#endif 2017#endif
1840 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 2018 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
1841 if (fd < 0) 2019 if (r < 0)
1842 kvm_put_kvm(kvm); 2020 kvm_put_kvm(kvm);
1843 2021
1844 return fd; 2022 return r;
1845} 2023}
1846 2024
1847static long kvm_dev_ioctl_check_extension_generic(long arg) 2025static long kvm_dev_ioctl_check_extension_generic(long arg)
@@ -1922,7 +2100,7 @@ static struct miscdevice kvm_dev = {
1922 &kvm_chardev_ops, 2100 &kvm_chardev_ops,
1923}; 2101};
1924 2102
1925static void hardware_enable(void *junk) 2103static void hardware_enable_nolock(void *junk)
1926{ 2104{
1927 int cpu = raw_smp_processor_id(); 2105 int cpu = raw_smp_processor_id();
1928 int r; 2106 int r;
@@ -1942,7 +2120,14 @@ static void hardware_enable(void *junk)
1942 } 2120 }
1943} 2121}
1944 2122
1945static void hardware_disable(void *junk) 2123static void hardware_enable(void *junk)
2124{
2125 spin_lock(&kvm_lock);
2126 hardware_enable_nolock(junk);
2127 spin_unlock(&kvm_lock);
2128}
2129
2130static void hardware_disable_nolock(void *junk)
1946{ 2131{
1947 int cpu = raw_smp_processor_id(); 2132 int cpu = raw_smp_processor_id();
1948 2133
@@ -1952,13 +2137,20 @@ static void hardware_disable(void *junk)
1952 kvm_arch_hardware_disable(NULL); 2137 kvm_arch_hardware_disable(NULL);
1953} 2138}
1954 2139
2140static void hardware_disable(void *junk)
2141{
2142 spin_lock(&kvm_lock);
2143 hardware_disable_nolock(junk);
2144 spin_unlock(&kvm_lock);
2145}
2146
1955static void hardware_disable_all_nolock(void) 2147static void hardware_disable_all_nolock(void)
1956{ 2148{
1957 BUG_ON(!kvm_usage_count); 2149 BUG_ON(!kvm_usage_count);
1958 2150
1959 kvm_usage_count--; 2151 kvm_usage_count--;
1960 if (!kvm_usage_count) 2152 if (!kvm_usage_count)
1961 on_each_cpu(hardware_disable, NULL, 1); 2153 on_each_cpu(hardware_disable_nolock, NULL, 1);
1962} 2154}
1963 2155
1964static void hardware_disable_all(void) 2156static void hardware_disable_all(void)
@@ -1977,7 +2169,7 @@ static int hardware_enable_all(void)
1977 kvm_usage_count++; 2169 kvm_usage_count++;
1978 if (kvm_usage_count == 1) { 2170 if (kvm_usage_count == 1) {
1979 atomic_set(&hardware_enable_failed, 0); 2171 atomic_set(&hardware_enable_failed, 0);
1980 on_each_cpu(hardware_enable, NULL, 1); 2172 on_each_cpu(hardware_enable_nolock, NULL, 1);
1981 2173
1982 if (atomic_read(&hardware_enable_failed)) { 2174 if (atomic_read(&hardware_enable_failed)) {
1983 hardware_disable_all_nolock(); 2175 hardware_disable_all_nolock();
@@ -2008,27 +2200,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2008 case CPU_STARTING: 2200 case CPU_STARTING:
2009 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2201 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2010 cpu); 2202 cpu);
2011 spin_lock(&kvm_lock);
2012 hardware_enable(NULL); 2203 hardware_enable(NULL);
2013 spin_unlock(&kvm_lock);
2014 break; 2204 break;
2015 } 2205 }
2016 return NOTIFY_OK; 2206 return NOTIFY_OK;
2017} 2207}
2018 2208
2019 2209
2020asmlinkage void kvm_handle_fault_on_reboot(void) 2210asmlinkage void kvm_spurious_fault(void)
2021{ 2211{
2022 if (kvm_rebooting) {
2023 /* spin while reset goes on */
2024 local_irq_enable();
2025 while (true)
2026 cpu_relax();
2027 }
2028 /* Fault while not rebooting. We want the trace. */ 2212 /* Fault while not rebooting. We want the trace. */
2029 BUG(); 2213 BUG();
2030} 2214}
2031EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); 2215EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2032 2216
2033static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2217static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2034 void *v) 2218 void *v)
@@ -2041,7 +2225,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2041 */ 2225 */
2042 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2226 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2043 kvm_rebooting = true; 2227 kvm_rebooting = true;
2044 on_each_cpu(hardware_disable, NULL, 1); 2228 on_each_cpu(hardware_disable_nolock, NULL, 1);
2045 return NOTIFY_OK; 2229 return NOTIFY_OK;
2046} 2230}
2047 2231
@@ -2211,7 +2395,7 @@ static void kvm_exit_debug(void)
2211static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2395static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2212{ 2396{
2213 if (kvm_usage_count) 2397 if (kvm_usage_count)
2214 hardware_disable(NULL); 2398 hardware_disable_nolock(NULL);
2215 return 0; 2399 return 0;
2216} 2400}
2217 2401
@@ -2219,7 +2403,7 @@ static int kvm_resume(struct sys_device *dev)
2219{ 2403{
2220 if (kvm_usage_count) { 2404 if (kvm_usage_count) {
2221 WARN_ON(spin_is_locked(&kvm_lock)); 2405 WARN_ON(spin_is_locked(&kvm_lock));
2222 hardware_enable(NULL); 2406 hardware_enable_nolock(NULL);
2223 } 2407 }
2224 return 0; 2408 return 0;
2225} 2409}
@@ -2336,6 +2520,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2336 goto out_free_5; 2520 goto out_free_5;
2337 } 2521 }
2338 2522
2523 r = kvm_async_pf_init();
2524 if (r)
2525 goto out_free;
2526
2339 kvm_chardev_ops.owner = module; 2527 kvm_chardev_ops.owner = module;
2340 kvm_vm_fops.owner = module; 2528 kvm_vm_fops.owner = module;
2341 kvm_vcpu_fops.owner = module; 2529 kvm_vcpu_fops.owner = module;
@@ -2343,7 +2531,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2343 r = misc_register(&kvm_dev); 2531 r = misc_register(&kvm_dev);
2344 if (r) { 2532 if (r) {
2345 printk(KERN_ERR "kvm: misc device register failed\n"); 2533 printk(KERN_ERR "kvm: misc device register failed\n");
2346 goto out_free; 2534 goto out_unreg;
2347 } 2535 }
2348 2536
2349 kvm_preempt_ops.sched_in = kvm_sched_in; 2537 kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2353,6 +2541,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2353 2541
2354 return 0; 2542 return 0;
2355 2543
2544out_unreg:
2545 kvm_async_pf_deinit();
2356out_free: 2546out_free:
2357 kmem_cache_destroy(kvm_vcpu_cache); 2547 kmem_cache_destroy(kvm_vcpu_cache);
2358out_free_5: 2548out_free_5:
@@ -2385,11 +2575,12 @@ void kvm_exit(void)
2385 kvm_exit_debug(); 2575 kvm_exit_debug();
2386 misc_deregister(&kvm_dev); 2576 misc_deregister(&kvm_dev);
2387 kmem_cache_destroy(kvm_vcpu_cache); 2577 kmem_cache_destroy(kvm_vcpu_cache);
2578 kvm_async_pf_deinit();
2388 sysdev_unregister(&kvm_sysdev); 2579 sysdev_unregister(&kvm_sysdev);
2389 sysdev_class_unregister(&kvm_sysdev_class); 2580 sysdev_class_unregister(&kvm_sysdev_class);
2390 unregister_reboot_notifier(&kvm_reboot_notifier); 2581 unregister_reboot_notifier(&kvm_reboot_notifier);
2391 unregister_cpu_notifier(&kvm_cpu_notifier); 2582 unregister_cpu_notifier(&kvm_cpu_notifier);
2392 on_each_cpu(hardware_disable, NULL, 1); 2583 on_each_cpu(hardware_disable_nolock, NULL, 1);
2393 kvm_arch_hardware_unsetup(); 2584 kvm_arch_hardware_unsetup();
2394 kvm_arch_exit(); 2585 kvm_arch_exit();
2395 free_cpumask_var(cpus_hardware_enabled); 2586 free_cpumask_var(cpus_hardware_enabled);