author     Jonathan Herman <hermanjl@cs.unc.edu>   2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>   2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /virt
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig              6
-rw-r--r--  virt/kvm/assigned-dev.c     296
-rw-r--r--  virt/kvm/async_pf.c          11
-rw-r--r--  virt/kvm/coalesced_mmio.c   137
-rw-r--r--  virt/kvm/coalesced_mmio.h     7
-rw-r--r--  virt/kvm/eventfd.c          182
-rw-r--r--  virt/kvm/ioapic.c            78
-rw-r--r--  virt/kvm/ioapic.h             5
-rw-r--r--  virt/kvm/iommu.c             79
-rw-r--r--  virt/kvm/irq_comm.c         146
-rw-r--r--  virt/kvm/kvm_main.c        1158
11 files changed, 717 insertions, 1388 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index d01b24b72c6..f63ccb0a598 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,9 +18,3 @@ config KVM_MMIO
 
 config KVM_ASYNC_PF
 	bool
-
-config HAVE_KVM_MSI
-	bool
-
-config HAVE_KVM_CPU_RELAX_INTERCEPT
-	bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 3642239252b..af7910228fb 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -49,157 +49,71 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
49 index = i; 49 index = i;
50 break; 50 break;
51 } 51 }
52 if (index < 0) 52 if (index < 0) {
53 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 53 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
54 return 0;
55 }
54 56
55 return index; 57 return index;
56} 58}
57 59
58static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) 60static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
59{ 61{
60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 62 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 int ret; 63 u32 vector;
64 int index;
62 65
63 spin_lock(&assigned_dev->intx_lock); 66 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
64 if (pci_check_and_mask_intx(assigned_dev->dev)) { 67 spin_lock(&assigned_dev->intx_lock);
68 disable_irq_nosync(irq);
65 assigned_dev->host_irq_disabled = true; 69 assigned_dev->host_irq_disabled = true;
66 ret = IRQ_WAKE_THREAD; 70 spin_unlock(&assigned_dev->intx_lock);
67 } else 71 }
68 ret = IRQ_NONE;
69 spin_unlock(&assigned_dev->intx_lock);
70
71 return ret;
72}
73 72
74static void 73 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
75kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, 74 index = find_index_from_host_irq(assigned_dev, irq);
76 int vector) 75 if (index >= 0) {
77{ 76 vector = assigned_dev->
78 if (unlikely(assigned_dev->irq_requested_type & 77 guest_msix_entries[index].vector;
79 KVM_DEV_IRQ_GUEST_INTX)) {
80 spin_lock(&assigned_dev->intx_mask_lock);
81 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
82 kvm_set_irq(assigned_dev->kvm, 78 kvm_set_irq(assigned_dev->kvm,
83 assigned_dev->irq_source_id, vector, 1); 79 assigned_dev->irq_source_id, vector, 1);
84 spin_unlock(&assigned_dev->intx_mask_lock); 80 }
85 } else 81 } else
86 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 82 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
87 vector, 1); 83 assigned_dev->guest_irq, 1);
88}
89
90static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
91{
92 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
93
94 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
95 spin_lock_irq(&assigned_dev->intx_lock);
96 disable_irq_nosync(irq);
97 assigned_dev->host_irq_disabled = true;
98 spin_unlock_irq(&assigned_dev->intx_lock);
99 }
100
101 kvm_assigned_dev_raise_guest_irq(assigned_dev,
102 assigned_dev->guest_irq);
103 84
104 return IRQ_HANDLED; 85 return IRQ_HANDLED;
105} 86}
106 87
107#ifdef __KVM_HAVE_MSI
108static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
109{
110 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
111 int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
112 assigned_dev->irq_source_id,
113 assigned_dev->guest_irq, 1);
114 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
115}
116
117static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
118{
119 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
120
121 kvm_assigned_dev_raise_guest_irq(assigned_dev,
122 assigned_dev->guest_irq);
123
124 return IRQ_HANDLED;
125}
126#endif
127
128#ifdef __KVM_HAVE_MSIX
129static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
130{
131 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
132 int index = find_index_from_host_irq(assigned_dev, irq);
133 u32 vector;
134 int ret = 0;
135
136 if (index >= 0) {
137 vector = assigned_dev->guest_msix_entries[index].vector;
138 ret = kvm_set_irq_inatomic(assigned_dev->kvm,
139 assigned_dev->irq_source_id,
140 vector, 1);
141 }
142
143 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
144}
145
146static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
147{
148 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
149 int index = find_index_from_host_irq(assigned_dev, irq);
150 u32 vector;
151
152 if (index >= 0) {
153 vector = assigned_dev->guest_msix_entries[index].vector;
154 kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
155 }
156
157 return IRQ_HANDLED;
158}
159#endif
160
161/* Ack the irq line for an assigned device */ 88/* Ack the irq line for an assigned device */
162static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 89static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
163{ 90{
164 struct kvm_assigned_dev_kernel *dev = 91 struct kvm_assigned_dev_kernel *dev;
165 container_of(kian, struct kvm_assigned_dev_kernel,
166 ack_notifier);
167 92
168 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 93 if (kian->gsi == -1)
94 return;
169 95
170 spin_lock(&dev->intx_mask_lock); 96 dev = container_of(kian, struct kvm_assigned_dev_kernel,
171 97 ack_notifier);
172 if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
173 bool reassert = false;
174
175 spin_lock_irq(&dev->intx_lock);
176 /*
177 * The guest IRQ may be shared so this ack can come from an
178 * IRQ for another guest device.
179 */
180 if (dev->host_irq_disabled) {
181 if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
182 enable_irq(dev->host_irq);
183 else if (!pci_check_and_unmask_intx(dev->dev))
184 reassert = true;
185 dev->host_irq_disabled = reassert;
186 }
187 spin_unlock_irq(&dev->intx_lock);
188 98
189 if (reassert) 99 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
190 kvm_set_irq(dev->kvm, dev->irq_source_id,
191 dev->guest_irq, 1);
192 }
193 100
194 spin_unlock(&dev->intx_mask_lock); 101 /* The guest irq may be shared so this ack may be
102 * from another device.
103 */
104 spin_lock(&dev->intx_lock);
105 if (dev->host_irq_disabled) {
106 enable_irq(dev->host_irq);
107 dev->host_irq_disabled = false;
108 }
109 spin_unlock(&dev->intx_lock);
195} 110}
196 111
197static void deassign_guest_irq(struct kvm *kvm, 112static void deassign_guest_irq(struct kvm *kvm,
198 struct kvm_assigned_dev_kernel *assigned_dev) 113 struct kvm_assigned_dev_kernel *assigned_dev)
199{ 114{
200 if (assigned_dev->ack_notifier.gsi != -1) 115 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
201 kvm_unregister_irq_ack_notifier(kvm, 116 assigned_dev->ack_notifier.gsi = -1;
202 &assigned_dev->ack_notifier);
203 117
204 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 118 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
205 assigned_dev->guest_irq, 0); 119 assigned_dev->guest_irq, 0);
@@ -231,7 +145,7 @@ static void deassign_host_irq(struct kvm *kvm,
231 145
232 for (i = 0; i < assigned_dev->entries_nr; i++) 146 for (i = 0; i < assigned_dev->entries_nr; i++)
233 free_irq(assigned_dev->host_msix_entries[i].vector, 147 free_irq(assigned_dev->host_msix_entries[i].vector,
234 assigned_dev); 148 (void *)assigned_dev);
235 149
236 assigned_dev->entries_nr = 0; 150 assigned_dev->entries_nr = 0;
237 kfree(assigned_dev->host_msix_entries); 151 kfree(assigned_dev->host_msix_entries);
@@ -239,17 +153,9 @@ static void deassign_host_irq(struct kvm *kvm,
239 pci_disable_msix(assigned_dev->dev); 153 pci_disable_msix(assigned_dev->dev);
240 } else { 154 } else {
241 /* Deal with MSI and INTx */ 155 /* Deal with MSI and INTx */
242 if ((assigned_dev->irq_requested_type & 156 disable_irq(assigned_dev->host_irq);
243 KVM_DEV_IRQ_HOST_INTX) && 157
244 (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { 158 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
245 spin_lock_irq(&assigned_dev->intx_lock);
246 pci_intx(assigned_dev->dev, false);
247 spin_unlock_irq(&assigned_dev->intx_lock);
248 synchronize_irq(assigned_dev->host_irq);
249 } else
250 disable_irq(assigned_dev->host_irq);
251
252 free_irq(assigned_dev->host_irq, assigned_dev);
253 159
254 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) 160 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
255 pci_disable_msi(assigned_dev->dev); 161 pci_disable_msi(assigned_dev->dev);
@@ -301,8 +207,6 @@ static void kvm_free_assigned_device(struct kvm *kvm,
301 else 207 else
302 pci_restore_state(assigned_dev->dev); 208 pci_restore_state(assigned_dev->dev);
303 209
304 assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
305
306 pci_release_regions(assigned_dev->dev); 210 pci_release_regions(assigned_dev->dev);
307 pci_disable_device(assigned_dev->dev); 211 pci_disable_device(assigned_dev->dev);
308 pci_dev_put(assigned_dev->dev); 212 pci_dev_put(assigned_dev->dev);
@@ -328,34 +232,15 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
328static int assigned_device_enable_host_intx(struct kvm *kvm, 232static int assigned_device_enable_host_intx(struct kvm *kvm,
329 struct kvm_assigned_dev_kernel *dev) 233 struct kvm_assigned_dev_kernel *dev)
330{ 234{
331 irq_handler_t irq_handler;
332 unsigned long flags;
333
334 dev->host_irq = dev->dev->irq; 235 dev->host_irq = dev->dev->irq;
335 236 /* Even though this is PCI, we don't want to use shared
336 /* 237 * interrupts. Sharing host devices with guest-assigned devices
337 * We can only share the IRQ line with other host devices if we are 238 * on the same interrupt line is not a happy situation: there
338 * able to disable the IRQ source at device-level - independently of 239 * are going to be long delays in accepting, acking, etc.
339 * the guest driver. Otherwise host devices may suffer from unbounded
340 * IRQ latencies when the guest keeps the line asserted.
341 */ 240 */
342 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { 241 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
343 irq_handler = kvm_assigned_dev_intx; 242 IRQF_ONESHOT, dev->irq_name, (void *)dev))
344 flags = IRQF_SHARED;
345 } else {
346 irq_handler = NULL;
347 flags = IRQF_ONESHOT;
348 }
349 if (request_threaded_irq(dev->host_irq, irq_handler,
350 kvm_assigned_dev_thread_intx, flags,
351 dev->irq_name, dev))
352 return -EIO; 243 return -EIO;
353
354 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
355 spin_lock_irq(&dev->intx_lock);
356 pci_intx(dev->dev, true);
357 spin_unlock_irq(&dev->intx_lock);
358 }
359 return 0; 244 return 0;
360} 245}
361 246
@@ -372,9 +257,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
372 } 257 }
373 258
374 dev->host_irq = dev->dev->irq; 259 dev->host_irq = dev->dev->irq;
375 if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, 260 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
376 kvm_assigned_dev_thread_msi, 0, 261 0, dev->irq_name, (void *)dev)) {
377 dev->irq_name, dev)) {
378 pci_disable_msi(dev->dev); 262 pci_disable_msi(dev->dev);
379 return -EIO; 263 return -EIO;
380 } 264 }
@@ -400,9 +284,8 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
400 284
401 for (i = 0; i < dev->entries_nr; i++) { 285 for (i = 0; i < dev->entries_nr; i++) {
402 r = request_threaded_irq(dev->host_msix_entries[i].vector, 286 r = request_threaded_irq(dev->host_msix_entries[i].vector,
403 kvm_assigned_dev_msix, 287 NULL, kvm_assigned_dev_thread,
404 kvm_assigned_dev_thread_msix, 288 0, dev->irq_name, (void *)dev);
405 0, dev->irq_name, dev);
406 if (r) 289 if (r)
407 goto err; 290 goto err;
408 } 291 }
@@ -410,7 +293,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
410 return 0; 293 return 0;
411err: 294err:
412 for (i -= 1; i >= 0; i--) 295 for (i -= 1; i >= 0; i--)
413 free_irq(dev->host_msix_entries[i].vector, dev); 296 free_irq(dev->host_msix_entries[i].vector, (void *)dev);
414 pci_disable_msix(dev->dev); 297 pci_disable_msix(dev->dev);
415 return r; 298 return r;
416} 299}
@@ -433,6 +316,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
433{ 316{
434 dev->guest_irq = irq->guest_irq; 317 dev->guest_irq = irq->guest_irq;
435 dev->ack_notifier.gsi = -1; 318 dev->ack_notifier.gsi = -1;
319 dev->host_irq_disabled = false;
436 return 0; 320 return 0;
437} 321}
438#endif 322#endif
@@ -444,6 +328,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
444{ 328{
445 dev->guest_irq = irq->guest_irq; 329 dev->guest_irq = irq->guest_irq;
446 dev->ack_notifier.gsi = -1; 330 dev->ack_notifier.gsi = -1;
331 dev->host_irq_disabled = false;
447 return 0; 332 return 0;
448} 333}
449#endif 334#endif
@@ -477,7 +362,6 @@ static int assign_host_irq(struct kvm *kvm,
477 default: 362 default:
478 r = -EINVAL; 363 r = -EINVAL;
479 } 364 }
480 dev->host_irq_disabled = false;
481 365
482 if (!r) 366 if (!r)
483 dev->irq_requested_type |= host_irq_type; 367 dev->irq_requested_type |= host_irq_type;
@@ -522,8 +406,7 @@ static int assign_guest_irq(struct kvm *kvm,
522 406
523 if (!r) { 407 if (!r) {
524 dev->irq_requested_type |= guest_irq_type; 408 dev->irq_requested_type |= guest_irq_type;
525 if (dev->ack_notifier.gsi != -1) 409 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
526 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
527 } else 410 } else
528 kvm_free_irq_source_id(kvm, dev->irq_source_id); 411 kvm_free_irq_source_id(kvm, dev->irq_source_id);
529 412
@@ -579,7 +462,6 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
579{ 462{
580 int r = -ENODEV; 463 int r = -ENODEV;
581 struct kvm_assigned_dev_kernel *match; 464 struct kvm_assigned_dev_kernel *match;
582 unsigned long irq_type;
583 465
584 mutex_lock(&kvm->lock); 466 mutex_lock(&kvm->lock);
585 467
@@ -588,9 +470,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
588 if (!match) 470 if (!match)
589 goto out; 471 goto out;
590 472
591 irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | 473 r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
592 KVM_DEV_IRQ_GUEST_MASK);
593 r = kvm_deassign_irq(kvm, match, irq_type);
594out: 474out:
595 mutex_unlock(&kvm->lock); 475 mutex_unlock(&kvm->lock);
596 return r; 476 return r;
@@ -662,6 +542,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
662 int r = 0, idx; 542 int r = 0, idx;
663 struct kvm_assigned_dev_kernel *match; 543 struct kvm_assigned_dev_kernel *match;
664 struct pci_dev *dev; 544 struct pci_dev *dev;
545 u8 header_type;
665 546
666 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) 547 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
667 return -EINVAL; 548 return -EINVAL;
@@ -694,7 +575,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
694 } 575 }
695 576
696 /* Don't allow bridges to be assigned */ 577 /* Don't allow bridges to be assigned */
697 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { 578 pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type);
579 if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) {
698 r = -EPERM; 580 r = -EPERM;
699 goto out_put; 581 goto out_put;
700 } 582 }
@@ -721,10 +603,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
721 if (!match->pci_saved_state) 603 if (!match->pci_saved_state)
722 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", 604 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
723 __func__, dev_name(&dev->dev)); 605 __func__, dev_name(&dev->dev));
724
725 if (!pci_intx_mask_supported(dev))
726 assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
727
728 match->assigned_dev_id = assigned_dev->assigned_dev_id; 606 match->assigned_dev_id = assigned_dev->assigned_dev_id;
729 match->host_segnr = assigned_dev->segnr; 607 match->host_segnr = assigned_dev->segnr;
730 match->host_busnr = assigned_dev->busnr; 608 match->host_busnr = assigned_dev->busnr;
@@ -732,7 +610,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
732 match->flags = assigned_dev->flags; 610 match->flags = assigned_dev->flags;
733 match->dev = dev; 611 match->dev = dev;
734 spin_lock_init(&match->intx_lock); 612 spin_lock_init(&match->intx_lock);
735 spin_lock_init(&match->intx_mask_lock);
736 match->irq_source_id = -1; 613 match->irq_source_id = -1;
737 match->kvm = kvm; 614 match->kvm = kvm;
738 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 615 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -878,55 +755,6 @@ msix_entry_out:
878} 755}
879#endif 756#endif
880 757
881static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
882 struct kvm_assigned_pci_dev *assigned_dev)
883{
884 int r = 0;
885 struct kvm_assigned_dev_kernel *match;
886
887 mutex_lock(&kvm->lock);
888
889 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
890 assigned_dev->assigned_dev_id);
891 if (!match) {
892 r = -ENODEV;
893 goto out;
894 }
895
896 spin_lock(&match->intx_mask_lock);
897
898 match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
899 match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
900
901 if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
902 if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
903 kvm_set_irq(match->kvm, match->irq_source_id,
904 match->guest_irq, 0);
905 /*
906 * Masking at hardware-level is performed on demand,
907 * i.e. when an IRQ actually arrives at the host.
908 */
909 } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
910 /*
911 * Unmask the IRQ line if required. Unmasking at
912 * device level will be performed by user space.
913 */
914 spin_lock_irq(&match->intx_lock);
915 if (match->host_irq_disabled) {
916 enable_irq(match->host_irq);
917 match->host_irq_disabled = false;
918 }
919 spin_unlock_irq(&match->intx_lock);
920 }
921 }
922
923 spin_unlock(&match->intx_mask_lock);
924
925out:
926 mutex_unlock(&kvm->lock);
927 return r;
928}
929
930long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 758long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
931 unsigned long arg) 759 unsigned long arg)
932{ 760{
@@ -1034,15 +862,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1034 break; 862 break;
1035 } 863 }
1036#endif 864#endif
1037 case KVM_ASSIGN_SET_INTX_MASK: {
1038 struct kvm_assigned_pci_dev assigned_dev;
1039
1040 r = -EFAULT;
1041 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1042 goto out;
1043 r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
1044 break;
1045 }
1046 default: 865 default:
1047 r = -ENOTTY; 866 r = -ENOTTY;
1048 break; 867 break;
@@ -1050,3 +869,4 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1050out: 869out:
1051 return r; 870 return r;
1052} 871}
872
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index ea475cd0351..74268b4c2ee 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
111 list_entry(vcpu->async_pf.done.next, 111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link); 112 typeof(*work), link);
113 list_del(&work->link); 113 list_del(&work->link);
114 if (!is_error_page(work->page)) 114 if (work->page)
115 kvm_release_page_clean(work->page); 115 put_page(work->page);
116 kmem_cache_free(async_pf_cache, work); 116 kmem_cache_free(async_pf_cache, work);
117 } 117 }
118 spin_unlock(&vcpu->async_pf.lock); 118 spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
138 138
139 list_del(&work->queue); 139 list_del(&work->queue);
140 vcpu->async_pf.queued--; 140 vcpu->async_pf.queued--;
141 if (!is_error_page(work->page)) 141 if (work->page)
142 kvm_release_page_clean(work->page); 142 put_page(work->page);
143 kmem_cache_free(async_pf_cache, work); 143 kmem_cache_free(async_pf_cache, work);
144 } 144 }
145} 145}
@@ -203,7 +203,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
203 if (!work) 203 if (!work)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
206 work->page = KVM_ERR_PTR_BAD_PAGE; 206 work->page = bad_page;
207 get_page(bad_page);
207 INIT_LIST_HEAD(&work->queue); /* for list_del to work */ 208 INIT_LIST_HEAD(&work->queue); /* for list_del to work */
208 209
209 spin_lock(&vcpu->async_pf.lock); 210 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 88b2fe3ddf4..fc8487564d1 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -24,25 +24,10 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
24static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, 24static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
25 gpa_t addr, int len) 25 gpa_t addr, int len)
26{ 26{
27 /* is it in a batchable area ? 27 struct kvm_coalesced_mmio_zone *zone;
28 * (addr,len) is fully included in
29 * (zone->addr, zone->size)
30 */
31 if (len < 0)
32 return 0;
33 if (addr + len < addr)
34 return 0;
35 if (addr < dev->zone.addr)
36 return 0;
37 if (addr + len > dev->zone.addr + dev->zone.size)
38 return 0;
39 return 1;
40}
41
42static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
43{
44 struct kvm_coalesced_mmio_ring *ring; 28 struct kvm_coalesced_mmio_ring *ring;
45 unsigned avail; 29 unsigned avail;
30 int i;
46 31
47 /* Are we able to batch it ? */ 32 /* Are we able to batch it ? */
48 33
@@ -52,12 +37,25 @@ static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
52 */ 37 */
53 ring = dev->kvm->coalesced_mmio_ring; 38 ring = dev->kvm->coalesced_mmio_ring;
54 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; 39 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
55 if (avail == 0) { 40 if (avail < KVM_MAX_VCPUS) {
56 /* full */ 41 /* full */
57 return 0; 42 return 0;
58 } 43 }
59 44
60 return 1; 45 /* is it in a batchable area ? */
46
47 for (i = 0; i < dev->nb_zones; i++) {
48 zone = &dev->zone[i];
49
50 /* (addr,len) is fully included in
51 * (zone->addr, zone->size)
52 */
53
54 if (zone->addr <= addr &&
55 addr + len <= zone->addr + zone->size)
56 return 1;
57 }
58 return 0;
61} 59}
62 60
63static int coalesced_mmio_write(struct kvm_io_device *this, 61static int coalesced_mmio_write(struct kvm_io_device *this,
@@ -65,16 +63,10 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
65{ 63{
66 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 64 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
67 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; 65 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
68
69 if (!coalesced_mmio_in_range(dev, addr, len)) 66 if (!coalesced_mmio_in_range(dev, addr, len))
70 return -EOPNOTSUPP; 67 return -EOPNOTSUPP;
71 68
72 spin_lock(&dev->kvm->ring_lock); 69 spin_lock(&dev->lock);
73
74 if (!coalesced_mmio_has_room(dev)) {
75 spin_unlock(&dev->kvm->ring_lock);
76 return -EOPNOTSUPP;
77 }
78 70
79 /* copy data in first free entry of the ring */ 71 /* copy data in first free entry of the ring */
80 72
@@ -83,7 +75,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
83 memcpy(ring->coalesced_mmio[ring->last].data, val, len); 75 memcpy(ring->coalesced_mmio[ring->last].data, val, len);
84 smp_wmb(); 76 smp_wmb();
85 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; 77 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
86 spin_unlock(&dev->kvm->ring_lock); 78 spin_unlock(&dev->lock);
87 return 0; 79 return 0;
88} 80}
89 81
@@ -91,8 +83,6 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
91{ 83{
92 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 84 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
93 85
94 list_del(&dev->list);
95
96 kfree(dev); 86 kfree(dev);
97} 87}
98 88
@@ -103,6 +93,7 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
103 93
104int kvm_coalesced_mmio_init(struct kvm *kvm) 94int kvm_coalesced_mmio_init(struct kvm *kvm)
105{ 95{
96 struct kvm_coalesced_mmio_dev *dev;
106 struct page *page; 97 struct page *page;
107 int ret; 98 int ret;
108 99
@@ -110,18 +101,31 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
110 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 101 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
111 if (!page) 102 if (!page)
112 goto out_err; 103 goto out_err;
113
114 ret = 0;
115 kvm->coalesced_mmio_ring = page_address(page); 104 kvm->coalesced_mmio_ring = page_address(page);
116 105
117 /* 106 ret = -ENOMEM;
118 * We're using this spinlock to sync access to the coalesced ring. 107 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
119 * The list doesn't need it's own lock since device registration and 108 if (!dev)
120 * unregistration should only happen when kvm->slots_lock is held. 109 goto out_free_page;
121 */ 110 spin_lock_init(&dev->lock);
122 spin_lock_init(&kvm->ring_lock); 111 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
123 INIT_LIST_HEAD(&kvm->coalesced_zones); 112 dev->kvm = kvm;
113 kvm->coalesced_mmio_dev = dev;
114
115 mutex_lock(&kvm->slots_lock);
116 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
117 mutex_unlock(&kvm->slots_lock);
118 if (ret < 0)
119 goto out_free_dev;
124 120
121 return ret;
122
123out_free_dev:
124 kvm->coalesced_mmio_dev = NULL;
125 kfree(dev);
126out_free_page:
127 kvm->coalesced_mmio_ring = NULL;
128 __free_page(page);
125out_err: 129out_err:
126 return ret; 130 return ret;
127} 131}
@@ -135,50 +139,51 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
135int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, 139int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
136 struct kvm_coalesced_mmio_zone *zone) 140 struct kvm_coalesced_mmio_zone *zone)
137{ 141{
138 int ret; 142 struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
139 struct kvm_coalesced_mmio_dev *dev;
140
141 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
142 if (!dev)
143 return -ENOMEM;
144 143
145 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); 144 if (dev == NULL)
146 dev->kvm = kvm; 145 return -ENXIO;
147 dev->zone = *zone;
148 146
149 mutex_lock(&kvm->slots_lock); 147 mutex_lock(&kvm->slots_lock);
150 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, 148 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
151 zone->size, &dev->dev); 149 mutex_unlock(&kvm->slots_lock);
152 if (ret < 0) 150 return -ENOBUFS;
153 goto out_free_dev; 151 }
154 list_add_tail(&dev->list, &kvm->coalesced_zones);
155 mutex_unlock(&kvm->slots_lock);
156 152
157 return ret; 153 dev->zone[dev->nb_zones] = *zone;
154 dev->nb_zones++;
158 155
159out_free_dev:
160 mutex_unlock(&kvm->slots_lock); 156 mutex_unlock(&kvm->slots_lock);
161
162 kfree(dev);
163
164 if (dev == NULL)
165 return -ENXIO;
166
167 return 0; 157 return 0;
168} 158}
169 159
170int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, 160int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
171 struct kvm_coalesced_mmio_zone *zone) 161 struct kvm_coalesced_mmio_zone *zone)
172{ 162{
173 struct kvm_coalesced_mmio_dev *dev, *tmp; 163 int i;
164 struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
165 struct kvm_coalesced_mmio_zone *z;
166
167 if (dev == NULL)
168 return -ENXIO;
174 169
175 mutex_lock(&kvm->slots_lock); 170 mutex_lock(&kvm->slots_lock);
176 171
177 list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) 172 i = dev->nb_zones;
178 if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { 173 while (i) {
179 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); 174 z = &dev->zone[i - 1];
180 kvm_iodevice_destructor(&dev->dev); 175
176 /* unregister all zones
177 * included in (zone->addr, zone->size)
178 */
179
180 if (zone->addr <= z->addr &&
181 z->addr + z->size <= zone->addr + zone->size) {
182 dev->nb_zones--;
183 *z = dev->zone[dev->nb_zones];
181 } 184 }
185 i--;
186 }
182 187
183 mutex_unlock(&kvm->slots_lock); 188 mutex_unlock(&kvm->slots_lock);
184 189
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index b280c20444d..8a5959e3535 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,13 +12,14 @@
12 12
13#ifdef CONFIG_KVM_MMIO 13#ifdef CONFIG_KVM_MMIO
14 14
15#include <linux/list.h> 15#define KVM_COALESCED_MMIO_ZONE_MAX 100
16 16
17struct kvm_coalesced_mmio_dev { 17struct kvm_coalesced_mmio_dev {
18 struct list_head list;
19 struct kvm_io_device dev; 18 struct kvm_io_device dev;
20 struct kvm *kvm; 19 struct kvm *kvm;
21 struct kvm_coalesced_mmio_zone zone; 20 spinlock_t lock;
21 int nb_zones;
22 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
22}; 23};
23 24
24int kvm_coalesced_mmio_init(struct kvm *kvm); 25int kvm_coalesced_mmio_init(struct kvm *kvm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b6eea5cc7b3..73358d256fa 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -35,7 +35,6 @@
35 35
36#include "iodev.h" 36#include "iodev.h"
37 37
38#ifdef __KVM_HAVE_IOAPIC
39/* 38/*
40 * -------------------------------------------------------------------- 39 * --------------------------------------------------------------------
41 * irqfd: Allows an fd to be used to inject an interrupt to the guest 40 * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -44,31 +43,6 @@
44 * -------------------------------------------------------------------- 43 * --------------------------------------------------------------------
45 */ 44 */
46 45
47/*
48 * Resampling irqfds are a special variety of irqfds used to emulate
49 * level triggered interrupts. The interrupt is asserted on eventfd
50 * trigger. On acknowledgement through the irq ack notifier, the
51 * interrupt is de-asserted and userspace is notified through the
52 * resamplefd. All resamplers on the same gsi are de-asserted
53 * together, so we don't need to track the state of each individual
54 * user. We can also therefore share the same irq source ID.
55 */
56struct _irqfd_resampler {
57 struct kvm *kvm;
58 /*
59 * List of resampling struct _irqfd objects sharing this gsi.
60 * RCU list modified under kvm->irqfds.resampler_lock
61 */
62 struct list_head list;
63 struct kvm_irq_ack_notifier notifier;
64 /*
65 * Entry in list of kvm->irqfd.resampler_list. Use for sharing
66 * resamplers among irqfds on the same gsi.
67 * Accessed and modified under kvm->irqfds.resampler_lock
68 */
69 struct list_head link;
70};
71
72struct _irqfd { 46struct _irqfd {
73 /* Used for MSI fast-path */ 47 /* Used for MSI fast-path */
74 struct kvm *kvm; 48 struct kvm *kvm;
@@ -78,12 +52,6 @@ struct _irqfd {
78 /* Used for level IRQ fast-path */ 52 /* Used for level IRQ fast-path */
79 int gsi; 53 int gsi;
80 struct work_struct inject; 54 struct work_struct inject;
81 /* The resampler used by this irqfd (resampler-only) */
82 struct _irqfd_resampler *resampler;
83 /* Eventfd notified on resample (resampler-only) */
84 struct eventfd_ctx *resamplefd;
85 /* Entry in list of irqfds for a resampler (resampler-only) */
86 struct list_head resampler_link;
87 /* Used for setup/shutdown */ 55 /* Used for setup/shutdown */
88 struct eventfd_ctx *eventfd; 56 struct eventfd_ctx *eventfd;
89 struct list_head list; 57 struct list_head list;
@@ -99,58 +67,8 @@ irqfd_inject(struct work_struct *work)
99 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 67 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
100 struct kvm *kvm = irqfd->kvm; 68 struct kvm *kvm = irqfd->kvm;
101 69
102 if (!irqfd->resampler) { 70 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
103 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 71 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
104 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
105 } else
106 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
107 irqfd->gsi, 1);
108}
109
110/*
111 * Since resampler irqfds share an IRQ source ID, we de-assert once
112 * then notify all of the resampler irqfds using this GSI. We can't
113 * do multiple de-asserts or we risk racing with incoming re-asserts.
114 */
115static void
116irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
117{
118 struct _irqfd_resampler *resampler;
119 struct _irqfd *irqfd;
120
121 resampler = container_of(kian, struct _irqfd_resampler, notifier);
122
123 kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
124 resampler->notifier.gsi, 0);
125
126 rcu_read_lock();
127
128 list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
129 eventfd_signal(irqfd->resamplefd, 1);
130
131 rcu_read_unlock();
132}
133
134static void
135irqfd_resampler_shutdown(struct _irqfd *irqfd)
136{
137 struct _irqfd_resampler *resampler = irqfd->resampler;
138 struct kvm *kvm = resampler->kvm;
139
140 mutex_lock(&kvm->irqfds.resampler_lock);
141
142 list_del_rcu(&irqfd->resampler_link);
143 synchronize_rcu();
144
145 if (list_empty(&resampler->list)) {
146 list_del(&resampler->link);
147 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
148 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
149 resampler->notifier.gsi, 0);
150 kfree(resampler);
151 }
152
153 mutex_unlock(&kvm->irqfds.resampler_lock);
154} 72}
155 73
156/* 74/*
@@ -172,12 +90,7 @@ irqfd_shutdown(struct work_struct *work)
172 * We know no new events will be scheduled at this point, so block 90 * We know no new events will be scheduled at this point, so block
173 * until all previously outstanding events have completed 91 * until all previously outstanding events have completed
174 */ 92 */
175 flush_work(&irqfd->inject); 93 flush_work_sync(&irqfd->inject);
176
177 if (irqfd->resampler) {
178 irqfd_resampler_shutdown(irqfd);
179 eventfd_ctx_put(irqfd->resamplefd);
180 }
181 94
182 /* 95 /*
183 * It is now safe to release the object's resources 96 * It is now safe to release the object's resources
@@ -285,12 +198,12 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
285} 198}
286 199
287static int 200static int
288kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) 201kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
289{ 202{
290 struct kvm_irq_routing_table *irq_rt; 203 struct kvm_irq_routing_table *irq_rt;
291 struct _irqfd *irqfd, *tmp; 204 struct _irqfd *irqfd, *tmp;
292 struct file *file = NULL; 205 struct file *file = NULL;
293 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; 206 struct eventfd_ctx *eventfd = NULL;
294 int ret; 207 int ret;
295 unsigned int events; 208 unsigned int events;
296 209
@@ -299,12 +212,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
299 return -ENOMEM; 212 return -ENOMEM;
300 213
301 irqfd->kvm = kvm; 214 irqfd->kvm = kvm;
302 irqfd->gsi = args->gsi; 215 irqfd->gsi = gsi;
303 INIT_LIST_HEAD(&irqfd->list); 216 INIT_LIST_HEAD(&irqfd->list);
304 INIT_WORK(&irqfd->inject, irqfd_inject); 217 INIT_WORK(&irqfd->inject, irqfd_inject);
305 INIT_WORK(&irqfd->shutdown, irqfd_shutdown); 218 INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
306 219
307 file = eventfd_fget(args->fd); 220 file = eventfd_fget(fd);
308 if (IS_ERR(file)) { 221 if (IS_ERR(file)) {
309 ret = PTR_ERR(file); 222 ret = PTR_ERR(file);
310 goto fail; 223 goto fail;
@@ -318,54 +231,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
318 231
319 irqfd->eventfd = eventfd; 232 irqfd->eventfd = eventfd;
320 233
321 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
322 struct _irqfd_resampler *resampler;
323
324 resamplefd = eventfd_ctx_fdget(args->resamplefd);
325 if (IS_ERR(resamplefd)) {
326 ret = PTR_ERR(resamplefd);
327 goto fail;
328 }
329
330 irqfd->resamplefd = resamplefd;
331 INIT_LIST_HEAD(&irqfd->resampler_link);
332
333 mutex_lock(&kvm->irqfds.resampler_lock);
334
335 list_for_each_entry(resampler,
336 &kvm->irqfds.resampler_list, link) {
337 if (resampler->notifier.gsi == irqfd->gsi) {
338 irqfd->resampler = resampler;
339 break;
340 }
341 }
342
343 if (!irqfd->resampler) {
344 resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
345 if (!resampler) {
346 ret = -ENOMEM;
347 mutex_unlock(&kvm->irqfds.resampler_lock);
348 goto fail;
349 }
350
351 resampler->kvm = kvm;
352 INIT_LIST_HEAD(&resampler->list);
353 resampler->notifier.gsi = irqfd->gsi;
354 resampler->notifier.irq_acked = irqfd_resampler_ack;
355 INIT_LIST_HEAD(&resampler->link);
356
357 list_add(&resampler->link, &kvm->irqfds.resampler_list);
358 kvm_register_irq_ack_notifier(kvm,
359 &resampler->notifier);
360 irqfd->resampler = resampler;
361 }
362
363 list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
364 synchronize_rcu();
365
366 mutex_unlock(&kvm->irqfds.resampler_lock);
367 }
368
369 /* 234 /*
370 * Install our own custom wake-up handling so we are notified via 235 * Install our own custom wake-up handling so we are notified via
371 * a callback whenever someone signals the underlying eventfd 236 * a callback whenever someone signals the underlying eventfd
@@ -411,12 +276,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
411 return 0; 276 return 0;
412 277
413fail: 278fail:
414 if (irqfd->resampler)
415 irqfd_resampler_shutdown(irqfd);
416
417 if (resamplefd && !IS_ERR(resamplefd))
418 eventfd_ctx_put(resamplefd);
419
420 if (eventfd && !IS_ERR(eventfd)) 279 if (eventfd && !IS_ERR(eventfd))
421 eventfd_ctx_put(eventfd); 280 eventfd_ctx_put(eventfd);
422 281
@@ -426,38 +285,32 @@ fail:
426 kfree(irqfd); 285 kfree(irqfd);
427 return ret; 286 return ret;
428} 287}
429#endif
430 288
431void 289void
432kvm_eventfd_init(struct kvm *kvm) 290kvm_eventfd_init(struct kvm *kvm)
433{ 291{
434#ifdef __KVM_HAVE_IOAPIC
435 spin_lock_init(&kvm->irqfds.lock); 292 spin_lock_init(&kvm->irqfds.lock);
436 INIT_LIST_HEAD(&kvm->irqfds.items); 293 INIT_LIST_HEAD(&kvm->irqfds.items);
437 INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
438 mutex_init(&kvm->irqfds.resampler_lock);
439#endif
440 INIT_LIST_HEAD(&kvm->ioeventfds); 294 INIT_LIST_HEAD(&kvm->ioeventfds);
441} 295}
442 296
443#ifdef __KVM_HAVE_IOAPIC
444/* 297/*
445 * shutdown any irqfd's that match fd+gsi 298 * shutdown any irqfd's that match fd+gsi
446 */ 299 */
447static int 300static int
448kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) 301kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
449{ 302{
450 struct _irqfd *irqfd, *tmp; 303 struct _irqfd *irqfd, *tmp;
451 struct eventfd_ctx *eventfd; 304 struct eventfd_ctx *eventfd;
452 305
453 eventfd = eventfd_ctx_fdget(args->fd); 306 eventfd = eventfd_ctx_fdget(fd);
454 if (IS_ERR(eventfd)) 307 if (IS_ERR(eventfd))
455 return PTR_ERR(eventfd); 308 return PTR_ERR(eventfd);
456 309
457 spin_lock_irq(&kvm->irqfds.lock); 310 spin_lock_irq(&kvm->irqfds.lock);
458 311
459 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 312 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
460 if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { 313 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
461 /* 314 /*
462 * This rcu_assign_pointer is needed for when 315 * This rcu_assign_pointer is needed for when
463 * another thread calls kvm_irq_routing_update before 316 * another thread calls kvm_irq_routing_update before
@@ -485,15 +338,12 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
485} 338}
486 339
487int 340int
488kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) 341kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
489{ 342{
490 if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) 343 if (flags & KVM_IRQFD_FLAG_DEASSIGN)
491 return -EINVAL; 344 return kvm_irqfd_deassign(kvm, fd, gsi);
492
493 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
494 return kvm_irqfd_deassign(kvm, args);
495 345
496 return kvm_irqfd_assign(kvm, args); 346 return kvm_irqfd_assign(kvm, fd, gsi);
497} 347}
498 348
499/* 349/*
@@ -560,7 +410,6 @@ static void __exit irqfd_module_exit(void)
560 410
561module_init(irqfd_module_init); 411module_init(irqfd_module_init);
562module_exit(irqfd_module_exit); 412module_exit(irqfd_module_exit);
563#endif
564 413
565/* 414/*
566 * -------------------------------------------------------------------- 415 * --------------------------------------------------------------------
@@ -737,8 +586,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
737 586
738 kvm_iodevice_init(&p->dev, &ioeventfd_ops); 587 kvm_iodevice_init(&p->dev, &ioeventfd_ops);
739 588
740 ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, 589 ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
741 &p->dev);
742 if (ret < 0) 590 if (ret < 0)
743 goto unlock_fail; 591 goto unlock_fail;
744 592
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index cfb7e4d52dc..8df1ca104a7 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -185,56 +185,42 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
185 irqe.dest_mode = 0; /* Physical mode. */ 185 irqe.dest_mode = 0; /* Physical mode. */
186 /* need to read apic_id from apic regiest since 186 /* need to read apic_id from apic regiest since
187 * it can be rewritten */ 187 * it can be rewritten */
188 irqe.dest_id = ioapic->kvm->bsp_vcpu_id; 188 irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
189 } 189 }
190#endif 190#endif
191 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); 191 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
192} 192}
193 193
194int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 194int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
195 int level)
196{ 195{
197 u32 old_irr; 196 u32 old_irr;
198 u32 mask = 1 << irq; 197 u32 mask = 1 << irq;
199 union kvm_ioapic_redirect_entry entry; 198 union kvm_ioapic_redirect_entry entry;
200 int ret, irq_level; 199 int ret = 1;
201
202 BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
203 200
204 spin_lock(&ioapic->lock); 201 spin_lock(&ioapic->lock);
205 old_irr = ioapic->irr; 202 old_irr = ioapic->irr;
206 irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], 203 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
207 irq_source_id, level); 204 entry = ioapic->redirtbl[irq];
208 entry = ioapic->redirtbl[irq]; 205 level ^= entry.fields.polarity;
209 irq_level ^= entry.fields.polarity; 206 if (!level)
210 if (!irq_level) { 207 ioapic->irr &= ~mask;
211 ioapic->irr &= ~mask; 208 else {
212 ret = 1; 209 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
213 } else { 210 ioapic->irr |= mask;
214 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); 211 if ((edge && old_irr != ioapic->irr) ||
215 ioapic->irr |= mask; 212 (!edge && !entry.fields.remote_irr))
216 if ((edge && old_irr != ioapic->irr) || 213 ret = ioapic_service(ioapic, irq);
217 (!edge && !entry.fields.remote_irr)) 214 else
218 ret = ioapic_service(ioapic, irq); 215 ret = 0; /* report coalesced interrupt */
219 else 216 }
220 ret = 0; /* report coalesced interrupt */ 217 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
221 } 218 }
222 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
223 spin_unlock(&ioapic->lock); 219 spin_unlock(&ioapic->lock);
224 220
225 return ret; 221 return ret;
226} 222}
227 223
228void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
229{
230 int i;
231
232 spin_lock(&ioapic->lock);
233 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
234 __clear_bit(irq_source_id, &ioapic->irq_states[i]);
235 spin_unlock(&ioapic->lock);
236}
237
238static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, 224static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
239 int trigger_mode) 225 int trigger_mode)
240{ 226{
@@ -268,17 +254,13 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
268 } 254 }
269} 255}
270 256
271bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
272{
273 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
274 smp_rmb();
275 return test_bit(vector, ioapic->handled_vectors);
276}
277
278void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) 257void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
279{ 258{
280 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 259 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
281 260
261 smp_rmb();
262 if (!test_bit(vector, ioapic->handled_vectors))
263 return;
282 spin_lock(&ioapic->lock); 264 spin_lock(&ioapic->lock);
283 __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); 265 __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
284 spin_unlock(&ioapic->lock); 266 spin_unlock(&ioapic->lock);
@@ -350,18 +332,9 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
350 (void*)addr, len, val); 332 (void*)addr, len, val);
351 ASSERT(!(addr & 0xf)); /* check alignment */ 333 ASSERT(!(addr & 0xf)); /* check alignment */
352 334
353 switch (len) { 335 if (len == 4 || len == 8)
354 case 8:
355 case 4:
356 data = *(u32 *) val; 336 data = *(u32 *) val;
357 break; 337 else {
358 case 2:
359 data = *(u16 *) val;
360 break;
361 case 1:
362 data = *(u8 *) val;
363 break;
364 default:
365 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); 338 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
366 return 0; 339 return 0;
367 } 340 }
@@ -370,7 +343,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
370 spin_lock(&ioapic->lock); 343 spin_lock(&ioapic->lock);
371 switch (addr) { 344 switch (addr) {
372 case IOAPIC_REG_SELECT: 345 case IOAPIC_REG_SELECT:
373 ioapic->ioregsel = data & 0xFF; /* 8-bit register */ 346 ioapic->ioregsel = data;
374 break; 347 break;
375 348
376 case IOAPIC_REG_WINDOW: 349 case IOAPIC_REG_WINDOW:
@@ -421,8 +394,7 @@ int kvm_ioapic_init(struct kvm *kvm)
421 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); 394 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
422 ioapic->kvm = kvm; 395 ioapic->kvm = kvm;
423 mutex_lock(&kvm->slots_lock); 396 mutex_lock(&kvm->slots_lock);
424 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, 397 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
425 IOAPIC_MEM_LENGTH, &ioapic->dev);
426 mutex_unlock(&kvm->slots_lock); 398 mutex_unlock(&kvm->slots_lock);
427 if (ret < 0) { 399 if (ret < 0) {
428 kvm->arch.vioapic = NULL; 400 kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a30abfe6ed1..0b190c34ccc 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,12 +71,9 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
71 int short_hand, int dest, int dest_mode); 71 int short_hand, int dest, int dest_mode);
72int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 72int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
73void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); 73void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
74bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
75int kvm_ioapic_init(struct kvm *kvm); 74int kvm_ioapic_init(struct kvm *kvm);
76void kvm_ioapic_destroy(struct kvm *kvm); 75void kvm_ioapic_destroy(struct kvm *kvm);
77int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 76int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
78 int level);
79void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
80void kvm_ioapic_reset(struct kvm_ioapic *ioapic); 77void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
81int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 78int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
82 struct kvm_lapic_irq *irq); 79 struct kvm_lapic_irq *irq);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4a340cb2301..511e160f706 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -25,14 +25,12 @@
25 25
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/module.h>
29#include <linux/pci.h> 28#include <linux/pci.h>
30#include <linux/stat.h>
31#include <linux/dmar.h> 29#include <linux/dmar.h>
32#include <linux/iommu.h> 30#include <linux/iommu.h>
33#include <linux/intel-iommu.h> 31#include <linux/intel-iommu.h>
34 32
35static bool allow_unsafe_assigned_interrupts; 33static int allow_unsafe_assigned_interrupts;
36module_param_named(allow_unsafe_assigned_interrupts, 34module_param_named(allow_unsafe_assigned_interrupts,
37 allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); 35 allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
38MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, 36MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
@@ -42,21 +40,21 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
42static void kvm_iommu_put_pages(struct kvm *kvm, 40static void kvm_iommu_put_pages(struct kvm *kvm,
43 gfn_t base_gfn, unsigned long npages); 41 gfn_t base_gfn, unsigned long npages);
44 42
45static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, 43static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
46 unsigned long size) 44 gfn_t gfn, unsigned long size)
47{ 45{
48 gfn_t end_gfn; 46 gfn_t end_gfn;
49 pfn_t pfn; 47 pfn_t pfn;
50 48
51 pfn = gfn_to_pfn_memslot(slot, gfn); 49 pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
52 end_gfn = gfn + (size >> PAGE_SHIFT); 50 end_gfn = gfn + (size >> PAGE_SHIFT);
53 gfn += 1; 51 gfn += 1;
54 52
55 if (is_error_noslot_pfn(pfn)) 53 if (is_error_pfn(pfn))
56 return pfn; 54 return pfn;
57 55
58 while (gfn < end_gfn) 56 while (gfn < end_gfn)
59 gfn_to_pfn_memslot(slot, gfn++); 57 gfn_to_pfn_memslot(kvm, slot, gfn++);
60 58
61 return pfn; 59 return pfn;
62} 60}
@@ -105,8 +103,8 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
105 * Pin all pages we are about to map in memory. This is 103 * Pin all pages we are about to map in memory. This is
106 * important because we unmap and unpin in 4kb steps later. 104 * important because we unmap and unpin in 4kb steps later.
107 */ 105 */
108 pfn = kvm_pin_pages(slot, gfn, page_size); 106 pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
109 if (is_error_noslot_pfn(pfn)) { 107 if (is_error_pfn(pfn)) {
110 gfn += 1; 108 gfn += 1;
111 continue; 109 continue;
112 } 110 }
@@ -134,15 +132,14 @@ unmap_pages:
134 132
135static int kvm_iommu_map_memslots(struct kvm *kvm) 133static int kvm_iommu_map_memslots(struct kvm *kvm)
136{ 134{
137 int idx, r = 0; 135 int i, idx, r = 0;
138 struct kvm_memslots *slots; 136 struct kvm_memslots *slots;
139 struct kvm_memory_slot *memslot;
140 137
141 idx = srcu_read_lock(&kvm->srcu); 138 idx = srcu_read_lock(&kvm->srcu);
142 slots = kvm_memslots(kvm); 139 slots = kvm_memslots(kvm);
143 140
144 kvm_for_each_memslot(memslot, slots) { 141 for (i = 0; i < slots->nmemslots; i++) {
145 r = kvm_iommu_map_pages(kvm, memslot); 142 r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
146 if (r) 143 if (r)
147 break; 144 break;
148 } 145 }
@@ -168,7 +165,11 @@ int kvm_assign_device(struct kvm *kvm,
168 165
169 r = iommu_attach_device(domain, &pdev->dev); 166 r = iommu_attach_device(domain, &pdev->dev);
170 if (r) { 167 if (r) {
171 dev_err(&pdev->dev, "kvm assign device failed ret %d", r); 168 printk(KERN_ERR "assign device %x:%x:%x.%x failed",
169 pci_domain_nr(pdev->bus),
170 pdev->bus->number,
171 PCI_SLOT(pdev->devfn),
172 PCI_FUNC(pdev->devfn));
172 return r; 173 return r;
173 } 174 }
174 175
@@ -186,8 +187,6 @@ int kvm_assign_device(struct kvm *kvm,
186 goto out_unmap; 187 goto out_unmap;
187 } 188 }
188 189
189 pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
190
191 printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", 190 printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
192 assigned_dev->host_segnr, 191 assigned_dev->host_segnr,
193 assigned_dev->host_busnr, 192 assigned_dev->host_busnr,
@@ -216,8 +215,6 @@ int kvm_deassign_device(struct kvm *kvm,
216 215
217 iommu_detach_device(domain, &pdev->dev); 216 iommu_detach_device(domain, &pdev->dev);
218 217
219 pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
220
221 printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", 218 printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
222 assigned_dev->host_segnr, 219 assigned_dev->host_segnr,
223 assigned_dev->host_busnr, 220 assigned_dev->host_busnr,
@@ -236,13 +233,9 @@ int kvm_iommu_map_guest(struct kvm *kvm)
236 return -ENODEV; 233 return -ENODEV;
237 } 234 }
238 235
239 mutex_lock(&kvm->slots_lock);
240
241 kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); 236 kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
242 if (!kvm->arch.iommu_domain) { 237 if (!kvm->arch.iommu_domain)
243 r = -ENOMEM; 238 return -ENOMEM;
244 goto out_unlock;
245 }
246 239
247 if (!allow_unsafe_assigned_interrupts && 240 if (!allow_unsafe_assigned_interrupts &&
248 !iommu_domain_has_cap(kvm->arch.iommu_domain, 241 !iommu_domain_has_cap(kvm->arch.iommu_domain,
@@ -253,16 +246,17 @@ int kvm_iommu_map_guest(struct kvm *kvm)
253 " module option.\n", __func__); 246 " module option.\n", __func__);
254 iommu_domain_free(kvm->arch.iommu_domain); 247 iommu_domain_free(kvm->arch.iommu_domain);
255 kvm->arch.iommu_domain = NULL; 248 kvm->arch.iommu_domain = NULL;
256 r = -EPERM; 249 return -EPERM;
257 goto out_unlock;
258 } 250 }
259 251
260 r = kvm_iommu_map_memslots(kvm); 252 r = kvm_iommu_map_memslots(kvm);
261 if (r) 253 if (r)
262 kvm_iommu_unmap_memslots(kvm); 254 goto out_unmap;
263 255
264out_unlock: 256 return 0;
265 mutex_unlock(&kvm->slots_lock); 257
258out_unmap:
259 kvm_iommu_unmap_memslots(kvm);
266 return r; 260 return r;
267} 261}
268 262
@@ -296,12 +290,6 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
296 290
297 /* Get physical address */ 291 /* Get physical address */
298 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); 292 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
299
300 if (!phys) {
301 gfn++;
302 continue;
303 }
304
305 pfn = phys >> PAGE_SHIFT; 293 pfn = phys >> PAGE_SHIFT;
306 294
307 /* Unmap address from IO address space */ 295 /* Unmap address from IO address space */
@@ -315,23 +303,18 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
315 } 303 }
316} 304}
317 305
318void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
319{
320 kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
321}
322
323static int kvm_iommu_unmap_memslots(struct kvm *kvm) 306static int kvm_iommu_unmap_memslots(struct kvm *kvm)
324{ 307{
325 int idx; 308 int i, idx;
326 struct kvm_memslots *slots; 309 struct kvm_memslots *slots;
327 struct kvm_memory_slot *memslot;
328 310
329 idx = srcu_read_lock(&kvm->srcu); 311 idx = srcu_read_lock(&kvm->srcu);
330 slots = kvm_memslots(kvm); 312 slots = kvm_memslots(kvm);
331 313
332 kvm_for_each_memslot(memslot, slots) 314 for (i = 0; i < slots->nmemslots; i++) {
333 kvm_iommu_unmap_pages(kvm, memslot); 315 kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
334 316 slots->memslots[i].npages);
317 }
335 srcu_read_unlock(&kvm->srcu, idx); 318 srcu_read_unlock(&kvm->srcu, idx);
336 319
337 return 0; 320 return 0;
@@ -345,11 +328,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
345 if (!domain) 328 if (!domain)
346 return 0; 329 return 0;
347 330
348 mutex_lock(&kvm->slots_lock);
349 kvm_iommu_unmap_memslots(kvm); 331 kvm_iommu_unmap_memslots(kvm);
350 kvm->arch.iommu_domain = NULL;
351 mutex_unlock(&kvm->slots_lock);
352
353 iommu_domain_free(domain); 332 iommu_domain_free(domain);
354 return 0; 333 return 0;
355} 334}
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 656fa455e15..9f614b4e365 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -33,12 +33,26 @@
33 33
34#include "ioapic.h" 34#include "ioapic.h"
35 35
36static inline int kvm_irq_line_state(unsigned long *irq_state,
37 int irq_source_id, int level)
38{
39 /* Logical OR for level trig interrupt */
40 if (level)
41 set_bit(irq_source_id, irq_state);
42 else
43 clear_bit(irq_source_id, irq_state);
44
45 return !!(*irq_state);
46}
47
36static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, 48static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
37 struct kvm *kvm, int irq_source_id, int level) 49 struct kvm *kvm, int irq_source_id, int level)
38{ 50{
39#ifdef CONFIG_X86 51#ifdef CONFIG_X86
40 struct kvm_pic *pic = pic_irqchip(kvm); 52 struct kvm_pic *pic = pic_irqchip(kvm);
41 return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); 53 level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
54 irq_source_id, level);
55 return kvm_pic_set_irq(pic, e->irqchip.pin, level);
42#else 56#else
43 return -1; 57 return -1;
44#endif 58#endif
@@ -48,7 +62,10 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
48 struct kvm *kvm, int irq_source_id, int level) 62 struct kvm *kvm, int irq_source_id, int level)
49{ 63{
50 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 64 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
51 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); 65 level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
66 irq_source_id, level);
67
68 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
52} 69}
53 70
54inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) 71inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -68,13 +85,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
68 struct kvm_vcpu *vcpu, *lowest = NULL; 85 struct kvm_vcpu *vcpu, *lowest = NULL;
69 86
70 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 87 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
71 kvm_is_dm_lowest_prio(irq)) { 88 kvm_is_dm_lowest_prio(irq))
72 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 89 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
73 irq->delivery_mode = APIC_DM_FIXED;
74 }
75
76 if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
77 return r;
78 90
79 kvm_for_each_vcpu(i, vcpu, kvm) { 91 kvm_for_each_vcpu(i, vcpu, kvm) {
80 if (!kvm_apic_present(vcpu)) 92 if (!kvm_apic_present(vcpu))
@@ -102,23 +114,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
102 return r; 114 return r;
103} 115}
104 116
105static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
106 struct kvm_lapic_irq *irq)
107{
108 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
109
110 irq->dest_id = (e->msi.address_lo &
111 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
112 irq->vector = (e->msi.data &
113 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
114 irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
115 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
116 irq->delivery_mode = e->msi.data & 0x700;
117 irq->level = 1;
118 irq->shorthand = 0;
119 /* TODO Deal with RH bit of MSI message address */
120}
121
122int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 117int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
123 struct kvm *kvm, int irq_source_id, int level) 118 struct kvm *kvm, int irq_source_id, int level)
124{ 119{
@@ -127,38 +122,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
127 if (!level) 122 if (!level)
128 return -1; 123 return -1;
129 124
130 kvm_set_msi_irq(e, &irq); 125 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
131
132 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
133}
134
135
136static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
137 struct kvm *kvm)
138{
139 struct kvm_lapic_irq irq;
140 int r;
141
142 kvm_set_msi_irq(e, &irq);
143
144 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r))
145 return r;
146 else
147 return -EWOULDBLOCK;
148}
149
150int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
151{
152 struct kvm_kernel_irq_routing_entry route;
153
154 if (!irqchip_in_kernel(kvm) || msi->flags != 0)
155 return -EINVAL;
156 126
157 route.msi.address_lo = msi->address_lo; 127 irq.dest_id = (e->msi.address_lo &
158 route.msi.address_hi = msi->address_hi; 128 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
159 route.msi.data = msi->data; 129 irq.vector = (e->msi.data &
130 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
131 irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
132 irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
133 irq.delivery_mode = e->msi.data & 0x700;
134 irq.level = 1;
135 irq.shorthand = 0;
160 136
161 return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); 137 /* TODO Deal with RH bit of MSI message address */
138 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
162} 139}
163 140
164/* 141/*
@@ -199,44 +176,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
199 return ret; 176 return ret;
200} 177}
201 178
202/*
203 * Deliver an IRQ in an atomic context if we can, or return a failure,
204 * user can retry in a process context.
205 * Return value:
206 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
207 * Other values - No need to retry.
208 */
209int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
210{
211 struct kvm_kernel_irq_routing_entry *e;
212 int ret = -EINVAL;
213 struct kvm_irq_routing_table *irq_rt;
214 struct hlist_node *n;
215
216 trace_kvm_set_irq(irq, level, irq_source_id);
217
218 /*
219 * Injection into either PIC or IOAPIC might need to scan all CPUs,
220 * which would need to be retried from thread context; when same GSI
221 * is connected to both PIC and IOAPIC, we'd have to report a
222 * partial failure here.
223 * Since there's no easy way to do this, we only support injecting MSI
224 * which is limited to 1:1 GSI mapping.
225 */
226 rcu_read_lock();
227 irq_rt = rcu_dereference(kvm->irq_routing);
228 if (irq < irq_rt->nr_rt_entries)
229 hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
230 if (likely(e->type == KVM_IRQ_ROUTING_MSI))
231 ret = kvm_set_msi_inatomic(e, kvm);
232 else
233 ret = -EWOULDBLOCK;
234 break;
235 }
236 rcu_read_unlock();
237 return ret;
238}
239
240void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 179void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
241{ 180{
242 struct kvm_irq_ack_notifier *kian; 181 struct kvm_irq_ack_notifier *kian;
@@ -287,9 +226,6 @@ int kvm_request_irq_source_id(struct kvm *kvm)
287 } 226 }
288 227
289 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 228 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
290#ifdef CONFIG_X86
291 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
292#endif
293 set_bit(irq_source_id, bitmap); 229 set_bit(irq_source_id, bitmap);
294unlock: 230unlock:
295 mutex_unlock(&kvm->irq_lock); 231 mutex_unlock(&kvm->irq_lock);
@@ -299,10 +235,9 @@ unlock:
299 235
300void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) 236void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
301{ 237{
238 int i;
239
302 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 240 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
303#ifdef CONFIG_X86
304 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
305#endif
306 241
307 mutex_lock(&kvm->irq_lock); 242 mutex_lock(&kvm->irq_lock);
308 if (irq_source_id < 0 || 243 if (irq_source_id < 0 ||
@@ -314,10 +249,14 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
314 if (!irqchip_in_kernel(kvm)) 249 if (!irqchip_in_kernel(kvm))
315 goto unlock; 250 goto unlock;
316 251
317 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); 252 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
253 clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
254 if (i >= 16)
255 continue;
318#ifdef CONFIG_X86 256#ifdef CONFIG_X86
319 kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); 257 clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
320#endif 258#endif
259 }
321unlock: 260unlock:
322 mutex_unlock(&kvm->irq_lock); 261 mutex_unlock(&kvm->irq_lock);
323} 262}
@@ -379,7 +318,6 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
379 */ 318 */
380 hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) 319 hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
381 if (ei->type == KVM_IRQ_ROUTING_MSI || 320 if (ei->type == KVM_IRQ_ROUTING_MSI ||
382 ue->type == KVM_IRQ_ROUTING_MSI ||
383 ue->u.irqchip.irqchip == ei->irqchip.irqchip) 321 ue->u.irqchip.irqchip == ei->irqchip.irqchip)
384 return r; 322 return r;
385 323
@@ -391,11 +329,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
391 switch (ue->u.irqchip.irqchip) { 329 switch (ue->u.irqchip.irqchip) {
392 case KVM_IRQCHIP_PIC_MASTER: 330 case KVM_IRQCHIP_PIC_MASTER:
393 e->set = kvm_set_pic_irq; 331 e->set = kvm_set_pic_irq;
394 max_pin = PIC_NUM_PINS; 332 max_pin = 16;
395 break; 333 break;
396 case KVM_IRQCHIP_PIC_SLAVE: 334 case KVM_IRQCHIP_PIC_SLAVE:
397 e->set = kvm_set_pic_irq; 335 e->set = kvm_set_pic_irq;
398 max_pin = PIC_NUM_PINS; 336 max_pin = 16;
399 delta = 8; 337 delta = 8;
400 break; 338 break;
401 case KVM_IRQCHIP_IOAPIC: 339 case KVM_IRQCHIP_IOAPIC:
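
The kvm_irq_line_state() helper added near the top of this file's diff folds per-source assertions into a single level: each interrupt source owns one bit, and the pin counts as asserted while any bit is still set. A self-contained model of that behaviour, in userspace C with plain bit operations standing in for set_bit()/clear_bit():

#include <stdio.h>

/* Logical OR of all sources driving one level-triggered line. */
static int set_line_state(unsigned long *irq_state, int irq_source_id, int level)
{
	if (level)
		*irq_state |= 1UL << irq_source_id;
	else
		*irq_state &= ~(1UL << irq_source_id);

	return *irq_state != 0;	/* level actually presented to the pin */
}

int main(void)
{
	unsigned long state = 0;

	printf("%d\n", set_line_state(&state, 0, 1));	/* source 0 asserts  -> 1 */
	printf("%d\n", set_line_state(&state, 1, 1));	/* source 1 asserts  -> 1 */
	printf("%d\n", set_line_state(&state, 0, 0));	/* source 0 releases -> still 1 */
	printf("%d\n", set_line_state(&state, 1, 0));	/* source 1 releases -> 0 */
	return 0;
}
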
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1cd693a76a5..aefdda390f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,8 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/hugetlb.h> 48#include <linux/hugetlb.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/sort.h>
51#include <linux/bsearch.h>
52 50
53#include <asm/processor.h> 51#include <asm/processor.h>
54#include <asm/io.h> 52#include <asm/io.h>
@@ -100,7 +98,13 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
100 98
101static bool largepages_enabled = true; 99static bool largepages_enabled = true;
102 100
103bool kvm_is_mmio_pfn(pfn_t pfn) 101static struct page *hwpoison_page;
102static pfn_t hwpoison_pfn;
103
104struct page *fault_page;
105pfn_t fault_pfn;
106
107inline int kvm_is_mmio_pfn(pfn_t pfn)
104{ 108{
105 if (pfn_valid(pfn)) { 109 if (pfn_valid(pfn)) {
106 int reserved; 110 int reserved;
@@ -131,12 +135,11 @@ bool kvm_is_mmio_pfn(pfn_t pfn)
131/* 135/*
132 * Switches to specified vcpu, until a matching vcpu_put() 136 * Switches to specified vcpu, until a matching vcpu_put()
133 */ 137 */
134int vcpu_load(struct kvm_vcpu *vcpu) 138void vcpu_load(struct kvm_vcpu *vcpu)
135{ 139{
136 int cpu; 140 int cpu;
137 141
138 if (mutex_lock_killable(&vcpu->mutex)) 142 mutex_lock(&vcpu->mutex);
139 return -EINTR;
140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 143 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
141 /* The thread running this VCPU changed. */ 144 /* The thread running this VCPU changed. */
142 struct pid *oldpid = vcpu->pid; 145 struct pid *oldpid = vcpu->pid;
@@ -149,7 +152,6 @@ int vcpu_load(struct kvm_vcpu *vcpu)
149 preempt_notifier_register(&vcpu->preempt_notifier); 152 preempt_notifier_register(&vcpu->preempt_notifier);
150 kvm_arch_vcpu_load(vcpu, cpu); 153 kvm_arch_vcpu_load(vcpu, cpu);
151 put_cpu(); 154 put_cpu();
152 return 0;
153} 155}
154 156
155void vcpu_put(struct kvm_vcpu *vcpu) 157void vcpu_put(struct kvm_vcpu *vcpu)
@@ -199,7 +201,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
199 201
200void kvm_flush_remote_tlbs(struct kvm *kvm) 202void kvm_flush_remote_tlbs(struct kvm *kvm)
201{ 203{
202 long dirty_count = kvm->tlbs_dirty; 204 int dirty_count = kvm->tlbs_dirty;
203 205
204 smp_mb(); 206 smp_mb();
205 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 207 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
@@ -212,11 +214,6 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 214 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
213} 215}
214 216
215void kvm_make_mclock_inprogress_request(struct kvm *kvm)
216{
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218}
219
220int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 217int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
221{ 218{
222 struct page *page; 219 struct page *page;
@@ -237,9 +234,6 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
237 } 234 }
238 vcpu->run = page_address(page); 235 vcpu->run = page_address(page);
239 236
240 kvm_vcpu_set_in_spin_loop(vcpu, false);
241 kvm_vcpu_set_dy_eligible(vcpu, false);
242
243 r = kvm_arch_vcpu_init(vcpu); 237 r = kvm_arch_vcpu_init(vcpu);
244 if (r < 0) 238 if (r < 0)
245 goto fail_free_run; 239 goto fail_free_run;
@@ -293,15 +287,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
293 */ 287 */
294 idx = srcu_read_lock(&kvm->srcu); 288 idx = srcu_read_lock(&kvm->srcu);
295 spin_lock(&kvm->mmu_lock); 289 spin_lock(&kvm->mmu_lock);
296
297 kvm->mmu_notifier_seq++; 290 kvm->mmu_notifier_seq++;
298 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 291 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
292 spin_unlock(&kvm->mmu_lock);
293 srcu_read_unlock(&kvm->srcu, idx);
294
299 /* we've to flush the tlb before the pages can be freed */ 295 /* we've to flush the tlb before the pages can be freed */
300 if (need_tlb_flush) 296 if (need_tlb_flush)
301 kvm_flush_remote_tlbs(kvm); 297 kvm_flush_remote_tlbs(kvm);
302 298
303 spin_unlock(&kvm->mmu_lock);
304 srcu_read_unlock(&kvm->srcu, idx);
305} 299}
306 300
307static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 301static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -336,14 +330,15 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
336 * count is also read inside the mmu_lock critical section. 330 * count is also read inside the mmu_lock critical section.
337 */ 331 */
338 kvm->mmu_notifier_count++; 332 kvm->mmu_notifier_count++;
339 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 333 for (; start < end; start += PAGE_SIZE)
334 need_tlb_flush |= kvm_unmap_hva(kvm, start);
340 need_tlb_flush |= kvm->tlbs_dirty; 335 need_tlb_flush |= kvm->tlbs_dirty;
336 spin_unlock(&kvm->mmu_lock);
337 srcu_read_unlock(&kvm->srcu, idx);
338
341 /* we've to flush the tlb before the pages can be freed */ 339 /* we've to flush the tlb before the pages can be freed */
342 if (need_tlb_flush) 340 if (need_tlb_flush)
343 kvm_flush_remote_tlbs(kvm); 341 kvm_flush_remote_tlbs(kvm);
344
345 spin_unlock(&kvm->mmu_lock);
346 srcu_read_unlock(&kvm->srcu, idx);
347} 342}
348 343
349static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 344static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -360,11 +355,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
360 * been freed. 355 * been freed.
361 */ 356 */
362 kvm->mmu_notifier_seq++; 357 kvm->mmu_notifier_seq++;
363 smp_wmb();
364 /* 358 /*
365 * The above sequence increase must be visible before the 359 * The above sequence increase must be visible before the
366 * below count decrease, which is ensured by the smp_wmb above 360 * below count decrease but both values are read by the kvm
367 * in conjunction with the smp_rmb in mmu_notifier_retry(). 361 * page fault under mmu_lock spinlock so we don't need to add
362 * a smb_wmb() here in between the two.
368 */ 363 */
369 kvm->mmu_notifier_count--; 364 kvm->mmu_notifier_count--;
370 spin_unlock(&kvm->mmu_lock); 365 spin_unlock(&kvm->mmu_lock);
@@ -381,14 +376,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
381 376
382 idx = srcu_read_lock(&kvm->srcu); 377 idx = srcu_read_lock(&kvm->srcu);
383 spin_lock(&kvm->mmu_lock); 378 spin_lock(&kvm->mmu_lock);
384
385 young = kvm_age_hva(kvm, address); 379 young = kvm_age_hva(kvm, address);
386 if (young)
387 kvm_flush_remote_tlbs(kvm);
388
389 spin_unlock(&kvm->mmu_lock); 380 spin_unlock(&kvm->mmu_lock);
390 srcu_read_unlock(&kvm->srcu, idx); 381 srcu_read_unlock(&kvm->srcu, idx);
391 382
383 if (young)
384 kvm_flush_remote_tlbs(kvm);
385
392 return young; 386 return young;
393} 387}
394 388
@@ -415,7 +409,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
415 int idx; 409 int idx;
416 410
417 idx = srcu_read_lock(&kvm->srcu); 411 idx = srcu_read_lock(&kvm->srcu);
418 kvm_arch_flush_shadow_all(kvm); 412 kvm_arch_flush_shadow(kvm);
419 srcu_read_unlock(&kvm->srcu, idx); 413 srcu_read_unlock(&kvm->srcu, idx);
420} 414}
421 415
@@ -444,16 +438,7 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
444 438
445#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 439#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
446 440
447static void kvm_init_memslots_id(struct kvm *kvm) 441static struct kvm *kvm_create_vm(void)
448{
449 int i;
450 struct kvm_memslots *slots = kvm->memslots;
451
452 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
453 slots->id_to_index[i] = slots->memslots[i].id = i;
454}
455
456static struct kvm *kvm_create_vm(unsigned long type)
457{ 442{
458 int r, i; 443 int r, i;
459 struct kvm *kvm = kvm_arch_alloc_vm(); 444 struct kvm *kvm = kvm_arch_alloc_vm();
@@ -461,7 +446,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
461 if (!kvm) 446 if (!kvm)
462 return ERR_PTR(-ENOMEM); 447 return ERR_PTR(-ENOMEM);
463 448
464 r = kvm_arch_init_vm(kvm, type); 449 r = kvm_arch_init_vm(kvm);
465 if (r) 450 if (r)
466 goto out_err_nodisable; 451 goto out_err_nodisable;
467 452
@@ -478,7 +463,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
478 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 463 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
479 if (!kvm->memslots) 464 if (!kvm->memslots)
480 goto out_err_nosrcu; 465 goto out_err_nosrcu;
481 kvm_init_memslots_id(kvm);
482 if (init_srcu_struct(&kvm->srcu)) 466 if (init_srcu_struct(&kvm->srcu))
483 goto out_err_nosrcu; 467 goto out_err_nosrcu;
484 for (i = 0; i < KVM_NR_BUSES; i++) { 468 for (i = 0; i < KVM_NR_BUSES; i++) {
@@ -519,33 +503,18 @@ out_err_nodisable:
519 return ERR_PTR(r); 503 return ERR_PTR(r);
520} 504}
521 505
522/*
523 * Avoid using vmalloc for a small buffer.
524 * Should not be used when the size is statically known.
525 */
526void *kvm_kvzalloc(unsigned long size)
527{
528 if (size > PAGE_SIZE)
529 return vzalloc(size);
530 else
531 return kzalloc(size, GFP_KERNEL);
532}
533
534void kvm_kvfree(const void *addr)
535{
536 if (is_vmalloc_addr(addr))
537 vfree(addr);
538 else
539 kfree(addr);
540}
541
542static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 506static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
543{ 507{
544 if (!memslot->dirty_bitmap) 508 if (!memslot->dirty_bitmap)
545 return; 509 return;
546 510
547 kvm_kvfree(memslot->dirty_bitmap); 511 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
512 vfree(memslot->dirty_bitmap_head);
513 else
514 kfree(memslot->dirty_bitmap_head);
515
548 memslot->dirty_bitmap = NULL; 516 memslot->dirty_bitmap = NULL;
517 memslot->dirty_bitmap_head = NULL;
549} 518}
550 519
551/* 520/*
@@ -554,21 +523,33 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
554static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 523static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
555 struct kvm_memory_slot *dont) 524 struct kvm_memory_slot *dont)
556{ 525{
526 int i;
527
528 if (!dont || free->rmap != dont->rmap)
529 vfree(free->rmap);
530
557 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 531 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
558 kvm_destroy_dirty_bitmap(free); 532 kvm_destroy_dirty_bitmap(free);
559 533
560 kvm_arch_free_memslot(free, dont); 534
535 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
536 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
537 vfree(free->lpage_info[i]);
538 free->lpage_info[i] = NULL;
539 }
540 }
561 541
562 free->npages = 0; 542 free->npages = 0;
543 free->rmap = NULL;
563} 544}
564 545
565void kvm_free_physmem(struct kvm *kvm) 546void kvm_free_physmem(struct kvm *kvm)
566{ 547{
548 int i;
567 struct kvm_memslots *slots = kvm->memslots; 549 struct kvm_memslots *slots = kvm->memslots;
568 struct kvm_memory_slot *memslot;
569 550
570 kvm_for_each_memslot(memslot, slots) 551 for (i = 0; i < slots->nmemslots; ++i)
571 kvm_free_physmem_slot(memslot, NULL); 552 kvm_free_physmem_slot(&slots->memslots[i], NULL);
572 553
573 kfree(kvm->memslots); 554 kfree(kvm->memslots);
574} 555}
@@ -589,7 +570,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
589#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 570#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
590 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 571 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
591#else 572#else
592 kvm_arch_flush_shadow_all(kvm); 573 kvm_arch_flush_shadow(kvm);
593#endif 574#endif
594 kvm_arch_destroy_vm(kvm); 575 kvm_arch_destroy_vm(kvm);
595 kvm_free_physmem(kvm); 576 kvm_free_physmem(kvm);
@@ -623,81 +604,28 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
623 return 0; 604 return 0;
624} 605}
625 606
607#ifndef CONFIG_S390
626/* 608/*
627 * Allocation size is twice as large as the actual dirty bitmap size. 609 * Allocation size is twice as large as the actual dirty bitmap size.
628 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 610 * This makes it possible to do double buffering: see x86's
611 * kvm_vm_ioctl_get_dirty_log().
629 */ 612 */
630static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 613static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
631{ 614{
632#ifndef CONFIG_S390
633 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 615 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
634 616
635 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 617 if (dirty_bytes > PAGE_SIZE)
618 memslot->dirty_bitmap = vzalloc(dirty_bytes);
619 else
620 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
621
636 if (!memslot->dirty_bitmap) 622 if (!memslot->dirty_bitmap)
637 return -ENOMEM; 623 return -ENOMEM;
638 624
639#endif /* !CONFIG_S390 */ 625 memslot->dirty_bitmap_head = memslot->dirty_bitmap;
640 return 0;
641}
642
643static int cmp_memslot(const void *slot1, const void *slot2)
644{
645 struct kvm_memory_slot *s1, *s2;
646
647 s1 = (struct kvm_memory_slot *)slot1;
648 s2 = (struct kvm_memory_slot *)slot2;
649
650 if (s1->npages < s2->npages)
651 return 1;
652 if (s1->npages > s2->npages)
653 return -1;
654
655 return 0;
656}
657
658/*
659 * Sort the memslots base on its size, so the larger slots
660 * will get better fit.
661 */
662static void sort_memslots(struct kvm_memslots *slots)
663{
664 int i;
665
666 sort(slots->memslots, KVM_MEM_SLOTS_NUM,
667 sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
668
669 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
670 slots->id_to_index[slots->memslots[i].id] = i;
671}
672
673void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
674{
675 if (new) {
676 int id = new->id;
677 struct kvm_memory_slot *old = id_to_memslot(slots, id);
678 unsigned long npages = old->npages;
679
680 *old = *new;
681 if (new->npages != npages)
682 sort_memslots(slots);
683 }
684
685 slots->generation++;
686}
687
688static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
689{
690 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
691
692#ifdef KVM_CAP_READONLY_MEM
693 valid_flags |= KVM_MEM_READONLY;
694#endif
695
696 if (mem->flags & ~valid_flags)
697 return -EINVAL;
698
699 return 0; 626 return 0;
700} 627}
628#endif /* !CONFIG_S390 */
701 629
702/* 630/*
703 * Allocate some memory and give it an address in the guest physical address 631 * Allocate some memory and give it an address in the guest physical address
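
The dirty-bitmap allocation above open-codes the same size check that the kvm_kvzalloc()/kvm_kvfree() helpers removed later in this file provided: buffers larger than a page come from vmalloc, smaller ones from the slab. A rough kernel-style sketch of that pattern, with hypothetical big_zalloc()/big_free() names that are not part of the patched source:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *big_zalloc(unsigned long size)
{
	/* Avoid vmalloc for small buffers, fall back to it for large ones. */
	if (size > PAGE_SIZE)
		return vzalloc(size);
	return kzalloc(size, GFP_KERNEL);
}

static void big_free(const void *addr)
{
	/* Free through whichever allocator handed the buffer out. */
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
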
@@ -714,14 +642,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
714 int r; 642 int r;
715 gfn_t base_gfn; 643 gfn_t base_gfn;
716 unsigned long npages; 644 unsigned long npages;
717 struct kvm_memory_slot *memslot, *slot; 645 unsigned long i;
646 struct kvm_memory_slot *memslot;
718 struct kvm_memory_slot old, new; 647 struct kvm_memory_slot old, new;
719 struct kvm_memslots *slots, *old_memslots; 648 struct kvm_memslots *slots, *old_memslots;
720 649
721 r = check_memory_region_flags(mem);
722 if (r)
723 goto out;
724
725 r = -EINVAL; 650 r = -EINVAL;
726 /* General sanity checks */ 651 /* General sanity checks */
727 if (mem->memory_size & (PAGE_SIZE - 1)) 652 if (mem->memory_size & (PAGE_SIZE - 1))
@@ -735,12 +660,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
735 (void __user *)(unsigned long)mem->userspace_addr, 660 (void __user *)(unsigned long)mem->userspace_addr,
736 mem->memory_size))) 661 mem->memory_size)))
737 goto out; 662 goto out;
738 if (mem->slot >= KVM_MEM_SLOTS_NUM) 663 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
739 goto out; 664 goto out;
740 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 665 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
741 goto out; 666 goto out;
742 667
743 memslot = id_to_memslot(kvm->memslots, mem->slot); 668 memslot = &kvm->memslots->memslots[mem->slot];
744 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 669 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
745 npages = mem->memory_size >> PAGE_SHIFT; 670 npages = mem->memory_size >> PAGE_SHIFT;
746 671
@@ -765,11 +690,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
765 690
766 /* Check for overlaps */ 691 /* Check for overlaps */
767 r = -EEXIST; 692 r = -EEXIST;
768 kvm_for_each_memslot(slot, kvm->memslots) { 693 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
769 if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot) 694 struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
695
696 if (s == memslot || !s->npages)
770 continue; 697 continue;
771 if (!((base_gfn + npages <= slot->base_gfn) || 698 if (!((base_gfn + npages <= s->base_gfn) ||
772 (base_gfn >= slot->base_gfn + slot->npages))) 699 (base_gfn >= s->base_gfn + s->npages)))
773 goto out_free; 700 goto out_free;
774 } 701 }
775 702
@@ -780,45 +707,92 @@ int __kvm_set_memory_region(struct kvm *kvm,
780 r = -ENOMEM; 707 r = -ENOMEM;
781 708
782 /* Allocate if a slot is being created */ 709 /* Allocate if a slot is being created */
783 if (npages && !old.npages) { 710#ifndef CONFIG_S390
711 if (npages && !new.rmap) {
712 new.rmap = vzalloc(npages * sizeof(*new.rmap));
713
714 if (!new.rmap)
715 goto out_free;
716
784 new.user_alloc = user_alloc; 717 new.user_alloc = user_alloc;
785 new.userspace_addr = mem->userspace_addr; 718 new.userspace_addr = mem->userspace_addr;
719 }
720 if (!npages)
721 goto skip_lpage;
722
723 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
724 unsigned long ugfn;
725 unsigned long j;
726 int lpages;
727 int level = i + 2;
728
729 /* Avoid unused variable warning if no large pages */
730 (void)level;
731
732 if (new.lpage_info[i])
733 continue;
786 734
787 if (kvm_arch_create_memslot(&new, npages)) 735 lpages = 1 + ((base_gfn + npages - 1)
736 >> KVM_HPAGE_GFN_SHIFT(level));
737 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
738
739 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
740
741 if (!new.lpage_info[i])
788 goto out_free; 742 goto out_free;
743
744 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
745 new.lpage_info[i][0].write_count = 1;
746 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
747 new.lpage_info[i][lpages - 1].write_count = 1;
748 ugfn = new.userspace_addr >> PAGE_SHIFT;
749 /*
750 * If the gfn and userspace address are not aligned wrt each
751 * other, or if explicitly asked to, disable large page
752 * support for this slot
753 */
754 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
755 !largepages_enabled)
756 for (j = 0; j < lpages; ++j)
757 new.lpage_info[i][j].write_count = 1;
789 } 758 }
790 759
760skip_lpage:
761
791 /* Allocate page dirty bitmap if needed */ 762 /* Allocate page dirty bitmap if needed */
792 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 763 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
793 if (kvm_create_dirty_bitmap(&new) < 0) 764 if (kvm_create_dirty_bitmap(&new) < 0)
794 goto out_free; 765 goto out_free;
795 /* destroy any largepage mappings for dirty tracking */ 766 /* destroy any largepage mappings for dirty tracking */
796 } 767 }
768#else /* not defined CONFIG_S390 */
769 new.user_alloc = user_alloc;
770 if (user_alloc)
771 new.userspace_addr = mem->userspace_addr;
772#endif /* not defined CONFIG_S390 */
797 773
798 if (!npages || base_gfn != old.base_gfn) { 774 if (!npages) {
799 struct kvm_memory_slot *slot;
800
801 r = -ENOMEM; 775 r = -ENOMEM;
802 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 776 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
803 GFP_KERNEL);
804 if (!slots) 777 if (!slots)
805 goto out_free; 778 goto out_free;
806 slot = id_to_memslot(slots, mem->slot); 779 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
807 slot->flags |= KVM_MEMSLOT_INVALID; 780 if (mem->slot >= slots->nmemslots)
808 781 slots->nmemslots = mem->slot + 1;
809 update_memslots(slots, NULL); 782 slots->generation++;
783 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
810 784
811 old_memslots = kvm->memslots; 785 old_memslots = kvm->memslots;
812 rcu_assign_pointer(kvm->memslots, slots); 786 rcu_assign_pointer(kvm->memslots, slots);
813 synchronize_srcu_expedited(&kvm->srcu); 787 synchronize_srcu_expedited(&kvm->srcu);
814 /* From this point no new shadow pages pointing to a deleted, 788 /* From this point no new shadow pages pointing to a deleted
815 * or moved, memslot will be created. 789 * memslot will be created.
816 * 790 *
817 * validation of sp->gfn happens in: 791 * validation of sp->gfn happens in:
818 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 792 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
819 * - kvm_is_visible_gfn (mmu_check_roots) 793 * - kvm_is_visible_gfn (mmu_check_roots)
820 */ 794 */
821 kvm_arch_flush_shadow_memslot(kvm, slot); 795 kvm_arch_flush_shadow(kvm);
822 kfree(old_memslots); 796 kfree(old_memslots);
823 } 797 }
824 798
@@ -826,33 +800,44 @@ int __kvm_set_memory_region(struct kvm *kvm,
826 if (r) 800 if (r)
827 goto out_free; 801 goto out_free;
828 802
829 /* map/unmap the pages in iommu page table */ 803 /* map the pages in iommu page table */
830 if (npages) { 804 if (npages) {
831 r = kvm_iommu_map_pages(kvm, &new); 805 r = kvm_iommu_map_pages(kvm, &new);
832 if (r) 806 if (r)
833 goto out_free; 807 goto out_free;
834 } else 808 }
835 kvm_iommu_unmap_pages(kvm, &old);
836 809
837 r = -ENOMEM; 810 r = -ENOMEM;
838 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 811 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
839 GFP_KERNEL);
840 if (!slots) 812 if (!slots)
841 goto out_free; 813 goto out_free;
814 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
815 if (mem->slot >= slots->nmemslots)
816 slots->nmemslots = mem->slot + 1;
817 slots->generation++;
842 818
843 /* actual memory is freed via old in kvm_free_physmem_slot below */ 819 /* actual memory is freed via old in kvm_free_physmem_slot below */
844 if (!npages) { 820 if (!npages) {
821 new.rmap = NULL;
845 new.dirty_bitmap = NULL; 822 new.dirty_bitmap = NULL;
846 memset(&new.arch, 0, sizeof(new.arch)); 823 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
824 new.lpage_info[i] = NULL;
847 } 825 }
848 826
849 update_memslots(slots, &new); 827 slots->memslots[mem->slot] = new;
850 old_memslots = kvm->memslots; 828 old_memslots = kvm->memslots;
851 rcu_assign_pointer(kvm->memslots, slots); 829 rcu_assign_pointer(kvm->memslots, slots);
852 synchronize_srcu_expedited(&kvm->srcu); 830 synchronize_srcu_expedited(&kvm->srcu);
853 831
854 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 832 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
855 833
834 /*
835 * If the new memory slot is created, we need to clear all
836 * mmio sptes.
837 */
838 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
839 kvm_arch_flush_shadow(kvm);
840
856 kvm_free_physmem_slot(&old, &new); 841 kvm_free_physmem_slot(&old, &new);
857 kfree(old_memslots); 842 kfree(old_memslots);
858 843
@@ -901,7 +886,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
901 if (log->slot >= KVM_MEMORY_SLOTS) 886 if (log->slot >= KVM_MEMORY_SLOTS)
902 goto out; 887 goto out;
903 888
904 memslot = id_to_memslot(kvm->memslots, log->slot); 889 memslot = &kvm->memslots->memslots[log->slot];
905 r = -ENOENT; 890 r = -ENOENT;
906 if (!memslot->dirty_bitmap) 891 if (!memslot->dirty_bitmap)
907 goto out; 892 goto out;
@@ -923,17 +908,74 @@ out:
923 return r; 908 return r;
924} 909}
925 910
926bool kvm_largepages_enabled(void)
927{
928 return largepages_enabled;
929}
930
931void kvm_disable_largepages(void) 911void kvm_disable_largepages(void)
932{ 912{
933 largepages_enabled = false; 913 largepages_enabled = false;
934} 914}
935EXPORT_SYMBOL_GPL(kvm_disable_largepages); 915EXPORT_SYMBOL_GPL(kvm_disable_largepages);
936 916
917int is_error_page(struct page *page)
918{
919 return page == bad_page || page == hwpoison_page || page == fault_page;
920}
921EXPORT_SYMBOL_GPL(is_error_page);
922
923int is_error_pfn(pfn_t pfn)
924{
925 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
926}
927EXPORT_SYMBOL_GPL(is_error_pfn);
928
929int is_hwpoison_pfn(pfn_t pfn)
930{
931 return pfn == hwpoison_pfn;
932}
933EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
934
935int is_fault_pfn(pfn_t pfn)
936{
937 return pfn == fault_pfn;
938}
939EXPORT_SYMBOL_GPL(is_fault_pfn);
940
941int is_noslot_pfn(pfn_t pfn)
942{
943 return pfn == bad_pfn;
944}
945EXPORT_SYMBOL_GPL(is_noslot_pfn);
946
947int is_invalid_pfn(pfn_t pfn)
948{
949 return pfn == hwpoison_pfn || pfn == fault_pfn;
950}
951EXPORT_SYMBOL_GPL(is_invalid_pfn);
952
953static inline unsigned long bad_hva(void)
954{
955 return PAGE_OFFSET;
956}
957
958int kvm_is_error_hva(unsigned long addr)
959{
960 return addr == bad_hva();
961}
962EXPORT_SYMBOL_GPL(kvm_is_error_hva);
963
964static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
965 gfn_t gfn)
966{
967 int i;
968
969 for (i = 0; i < slots->nmemslots; ++i) {
970 struct kvm_memory_slot *memslot = &slots->memslots[i];
971
972 if (gfn >= memslot->base_gfn
973 && gfn < memslot->base_gfn + memslot->npages)
974 return memslot;
975 }
976 return NULL;
977}
978
937struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 979struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
938{ 980{
939 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 981 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -942,13 +984,20 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot);
942 984
943int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 985int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
944{ 986{
945 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 987 int i;
988 struct kvm_memslots *slots = kvm_memslots(kvm);
946 989
947 if (!memslot || memslot->id >= KVM_MEMORY_SLOTS || 990 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
948 memslot->flags & KVM_MEMSLOT_INVALID) 991 struct kvm_memory_slot *memslot = &slots->memslots[i];
949 return 0; 992
993 if (memslot->flags & KVM_MEMSLOT_INVALID)
994 continue;
950 995
951 return 1; 996 if (gfn >= memslot->base_gfn
997 && gfn < memslot->base_gfn + memslot->npages)
998 return 1;
999 }
1000 return 0;
952} 1001}
953EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1002EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
954 1003
@@ -976,38 +1025,17 @@ out:
976 return size; 1025 return size;
977} 1026}
978 1027
979static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1028static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
980{ 1029 gfn_t *nr_pages)
981 return slot->flags & KVM_MEM_READONLY;
982}
983
984static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
985 gfn_t *nr_pages, bool write)
986{ 1030{
987 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1031 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
988 return KVM_HVA_ERR_BAD; 1032 return bad_hva();
989
990 if (memslot_is_readonly(slot) && write)
991 return KVM_HVA_ERR_RO_BAD;
992 1033
993 if (nr_pages) 1034 if (nr_pages)
994 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1035 *nr_pages = slot->npages - (gfn - slot->base_gfn);
995 1036
996 return __gfn_to_hva_memslot(slot, gfn); 1037 return gfn_to_hva_memslot(slot, gfn);
997}
998
999static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1000 gfn_t *nr_pages)
1001{
1002 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1003}
1004
1005unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1006 gfn_t gfn)
1007{
1008 return gfn_to_hva_many(slot, gfn, NULL);
1009} 1038}
1010EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1011 1039
1012unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1040unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1013{ 1041{
@@ -1015,23 +1043,10 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1015} 1043}
1016EXPORT_SYMBOL_GPL(gfn_to_hva); 1044EXPORT_SYMBOL_GPL(gfn_to_hva);
1017 1045
1018/* 1046static pfn_t get_fault_pfn(void)
1019 * The hva returned by this function is only allowed to be read.
1020 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1021 */
1022static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1023{ 1047{
1024 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); 1048 get_page(fault_page);
1025} 1049 return fault_pfn;
1026
1027static int kvm_read_hva(void *data, void __user *hva, int len)
1028{
1029 return __copy_from_user(data, hva, len);
1030}
1031
1032static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1033{
1034 return __copy_from_user_inatomic(data, hva, len);
1035} 1050}
1036 1051
1037int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1052int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1054,186 +1069,108 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1054 return rc == -EHWPOISON; 1069 return rc == -EHWPOISON;
1055} 1070}
1056 1071
1057/* 1072static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1058 * The atomic path to get the writable pfn which will be stored in @pfn, 1073 bool *async, bool write_fault, bool *writable)
1059 * true indicates success, otherwise false is returned.
1060 */
1061static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1062 bool write_fault, bool *writable, pfn_t *pfn)
1063{ 1074{
1064 struct page *page[1]; 1075 struct page *page[1];
1065 int npages; 1076 int npages = 0;
1077 pfn_t pfn;
1066 1078
1067 if (!(async || atomic)) 1079 /* we can do it either atomically or asynchronously, not both */
1068 return false; 1080 BUG_ON(atomic && async);
1069 1081
1070 /* 1082 BUG_ON(!write_fault && !writable);
1071 * Fast pin a writable pfn only if it is a write fault request
1072 * or the caller allows to map a writable pfn for a read fault
1073 * request.
1074 */
1075 if (!(write_fault || writable))
1076 return false;
1077 1083
1078 npages = __get_user_pages_fast(addr, 1, 1, page); 1084 if (writable)
1079 if (npages == 1) { 1085 *writable = true;
1080 *pfn = page_to_pfn(page[0]);
1081 1086
1082 if (writable) 1087 if (atomic || async)
1083 *writable = true; 1088 npages = __get_user_pages_fast(addr, 1, 1, page);
1084 return true;
1085 }
1086 1089
1087 return false; 1090 if (unlikely(npages != 1) && !atomic) {
1088} 1091 might_sleep();
1089 1092
1090/* 1093 if (writable)
1091 * The slow path to get the pfn of the specified host virtual address, 1094 *writable = write_fault;
1092 * 1 indicates success, -errno is returned if error is detected.
1093 */
1094static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1095 bool *writable, pfn_t *pfn)
1096{
1097 struct page *page[1];
1098 int npages = 0;
1099
1100 might_sleep();
1101
1102 if (writable)
1103 *writable = write_fault;
1104 1095
1105 if (async) { 1096 if (async) {
1106 down_read(&current->mm->mmap_sem); 1097 down_read(&current->mm->mmap_sem);
1107 npages = get_user_page_nowait(current, current->mm, 1098 npages = get_user_page_nowait(current, current->mm,
1108 addr, write_fault, page); 1099 addr, write_fault, page);
1109 up_read(&current->mm->mmap_sem); 1100 up_read(&current->mm->mmap_sem);
1110 } else 1101 } else
1111 npages = get_user_pages_fast(addr, 1, write_fault, 1102 npages = get_user_pages_fast(addr, 1, write_fault,
1112 page); 1103 page);
1113 if (npages != 1) 1104
1114 return npages; 1105 /* map read fault as writable if possible */
1115 1106 if (unlikely(!write_fault) && npages == 1) {
1116 /* map read fault as writable if possible */ 1107 struct page *wpage[1];
1117 if (unlikely(!write_fault) && writable) { 1108
1118 struct page *wpage[1]; 1109 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1119 1110 if (npages == 1) {
1120 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1111 *writable = true;
1121 if (npages == 1) { 1112 put_page(page[0]);
1122 *writable = true; 1113 page[0] = wpage[0];
1123 put_page(page[0]); 1114 }
1124 page[0] = wpage[0]; 1115 npages = 1;
1125 } 1116 }
1126
1127 npages = 1;
1128 } 1117 }
1129 *pfn = page_to_pfn(page[0]);
1130 return npages;
1131}
1132 1118
1133static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1119 if (unlikely(npages != 1)) {
1134{ 1120 struct vm_area_struct *vma;
1135 if (unlikely(!(vma->vm_flags & VM_READ)))
1136 return false;
1137
1138 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1139 return false;
1140
1141 return true;
1142}
1143
1144/*
1145 * Pin guest page in memory and return its pfn.
1146 * @addr: host virtual address which maps memory to the guest
1147 * @atomic: whether this function can sleep
1148 * @async: whether this function need to wait IO complete if the
1149 * host page is not in the memory
1150 * @write_fault: whether we should get a writable host page
1151 * @writable: whether it allows to map a writable host page for !@write_fault
1152 *
1153 * The function will map a writable host page for these two cases:
1154 * 1): @write_fault = true
1155 * 2): @write_fault = false && @writable, @writable will tell the caller
1156 * whether the mapping is writable.
1157 */
1158static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1159 bool write_fault, bool *writable)
1160{
1161 struct vm_area_struct *vma;
1162 pfn_t pfn = 0;
1163 int npages;
1164 1121
1165 /* we can do it either atomically or asynchronously, not both */ 1122 if (atomic)
1166 BUG_ON(atomic && async); 1123 return get_fault_pfn();
1167 1124
1168 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1125 down_read(&current->mm->mmap_sem);
1169 return pfn; 1126 if (npages == -EHWPOISON ||
1170 1127 (!async && check_user_page_hwpoison(addr))) {
1171 if (atomic) 1128 up_read(&current->mm->mmap_sem);
1172 return KVM_PFN_ERR_FAULT; 1129 get_page(hwpoison_page);
1130 return page_to_pfn(hwpoison_page);
1131 }
1173 1132
1174 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1133 vma = find_vma_intersection(current->mm, addr, addr+1);
1175 if (npages == 1) 1134
1176 return pfn; 1135 if (vma == NULL)
1136 pfn = get_fault_pfn();
1137 else if ((vma->vm_flags & VM_PFNMAP)) {
1138 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1139 vma->vm_pgoff;
1140 BUG_ON(!kvm_is_mmio_pfn(pfn));
1141 } else {
1142 if (async && (vma->vm_flags & VM_WRITE))
1143 *async = true;
1144 pfn = get_fault_pfn();
1145 }
1146 up_read(&current->mm->mmap_sem);
1147 } else
1148 pfn = page_to_pfn(page[0]);
1177 1149
1178 down_read(&current->mm->mmap_sem);
1179 if (npages == -EHWPOISON ||
1180 (!async && check_user_page_hwpoison(addr))) {
1181 pfn = KVM_PFN_ERR_HWPOISON;
1182 goto exit;
1183 }
1184
1185 vma = find_vma_intersection(current->mm, addr, addr + 1);
1186
1187 if (vma == NULL)
1188 pfn = KVM_PFN_ERR_FAULT;
1189 else if ((vma->vm_flags & VM_PFNMAP)) {
1190 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1191 vma->vm_pgoff;
1192 BUG_ON(!kvm_is_mmio_pfn(pfn));
1193 } else {
1194 if (async && vma_is_valid(vma, write_fault))
1195 *async = true;
1196 pfn = KVM_PFN_ERR_FAULT;
1197 }
1198exit:
1199 up_read(&current->mm->mmap_sem);
1200 return pfn; 1150 return pfn;
1201} 1151}
1202 1152
1203static pfn_t 1153pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1204__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1205 bool *async, bool write_fault, bool *writable)
1206{ 1154{
1207 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1155 return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
1208
1209 if (addr == KVM_HVA_ERR_RO_BAD)
1210 return KVM_PFN_ERR_RO_FAULT;
1211
1212 if (kvm_is_error_hva(addr))
1213 return KVM_PFN_NOSLOT;
1214
1215 /* Do not map writable pfn in the readonly memslot. */
1216 if (writable && memslot_is_readonly(slot)) {
1217 *writable = false;
1218 writable = NULL;
1219 }
1220
1221 return hva_to_pfn(addr, atomic, async, write_fault,
1222 writable);
1223} 1156}
1157EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1224 1158
1225static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1159static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1226 bool write_fault, bool *writable) 1160 bool write_fault, bool *writable)
1227{ 1161{
1228 struct kvm_memory_slot *slot; 1162 unsigned long addr;
1229 1163
1230 if (async) 1164 if (async)
1231 *async = false; 1165 *async = false;
1232 1166
1233 slot = gfn_to_memslot(kvm, gfn); 1167 addr = gfn_to_hva(kvm, gfn);
1168 if (kvm_is_error_hva(addr)) {
1169 get_page(bad_page);
1170 return page_to_pfn(bad_page);
1171 }
1234 1172
1235 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, 1173 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1236 writable);
1237} 1174}
1238 1175
1239pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1176pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1262,17 +1199,13 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1262} 1199}
1263EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1200EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1264 1201
1265pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1202pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1203 struct kvm_memory_slot *slot, gfn_t gfn)
1266{ 1204{
1267 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1205 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1206 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1268} 1207}
1269 1208
1270pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1271{
1272 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1273}
1274EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1275
1276int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1209int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1277 int nr_pages) 1210 int nr_pages)
1278{ 1211{
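
Both variants of hva_to_pfn() in the hunk above follow a broadly similar flow: attempt a non-sleeping pin first where the caller allows it, and fall back to a path that may sleep only when the caller is not atomic. A standalone caricature of that control flow, with made-up pin_fast()/pin_slow() stand-ins rather than the real __get_user_pages_fast()/get_user_pages_fast() interfaces:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long pfn_t;

static bool pin_fast(unsigned long addr, pfn_t *pfn)
{
	(void)addr; (void)pfn;
	return false;		/* pretend the lock-free path missed */
}

static int pin_slow(unsigned long addr, pfn_t *pfn)
{
	*pfn = addr >> 12;	/* stand-in for really pinning the page */
	return 1;
}

static pfn_t addr_to_pfn(unsigned long addr, bool atomic)
{
	pfn_t pfn;

	if (pin_fast(addr, &pfn))
		return pfn;		/* fast path succeeded */
	if (atomic)
		return (pfn_t)-1;	/* cannot sleep here: report a fault */
	if (pin_slow(addr, &pfn) == 1)
		return pfn;
	return (pfn_t)-1;
}

int main(void)
{
	printf("pfn=%lu\n", addr_to_pfn(0x12345000UL, false));
	return 0;
}
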
@@ -1290,49 +1223,37 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1290} 1223}
1291EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1224EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1292 1225
1293static struct page *kvm_pfn_to_page(pfn_t pfn)
1294{
1295 if (is_error_noslot_pfn(pfn))
1296 return KVM_ERR_PTR_BAD_PAGE;
1297
1298 if (kvm_is_mmio_pfn(pfn)) {
1299 WARN_ON(1);
1300 return KVM_ERR_PTR_BAD_PAGE;
1301 }
1302
1303 return pfn_to_page(pfn);
1304}
1305
1306struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1226struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1307{ 1227{
1308 pfn_t pfn; 1228 pfn_t pfn;
1309 1229
1310 pfn = gfn_to_pfn(kvm, gfn); 1230 pfn = gfn_to_pfn(kvm, gfn);
1231 if (!kvm_is_mmio_pfn(pfn))
1232 return pfn_to_page(pfn);
1233
1234 WARN_ON(kvm_is_mmio_pfn(pfn));
1311 1235
1312 return kvm_pfn_to_page(pfn); 1236 get_page(bad_page);
1237 return bad_page;
1313} 1238}
1314 1239
1315EXPORT_SYMBOL_GPL(gfn_to_page); 1240EXPORT_SYMBOL_GPL(gfn_to_page);
1316 1241
1317void kvm_release_page_clean(struct page *page) 1242void kvm_release_page_clean(struct page *page)
1318{ 1243{
1319 WARN_ON(is_error_page(page));
1320
1321 kvm_release_pfn_clean(page_to_pfn(page)); 1244 kvm_release_pfn_clean(page_to_pfn(page));
1322} 1245}
1323EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1246EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1324 1247
1325void kvm_release_pfn_clean(pfn_t pfn) 1248void kvm_release_pfn_clean(pfn_t pfn)
1326{ 1249{
1327 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1250 if (!kvm_is_mmio_pfn(pfn))
1328 put_page(pfn_to_page(pfn)); 1251 put_page(pfn_to_page(pfn));
1329} 1252}
1330EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1253EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1331 1254
1332void kvm_release_page_dirty(struct page *page) 1255void kvm_release_page_dirty(struct page *page)
1333{ 1256{
1334 WARN_ON(is_error_page(page));
1335
1336 kvm_release_pfn_dirty(page_to_pfn(page)); 1257 kvm_release_pfn_dirty(page_to_pfn(page));
1337} 1258}
1338EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1259EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1388,10 +1309,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1388 int r; 1309 int r;
1389 unsigned long addr; 1310 unsigned long addr;
1390 1311
1391 addr = gfn_to_hva_read(kvm, gfn); 1312 addr = gfn_to_hva(kvm, gfn);
1392 if (kvm_is_error_hva(addr)) 1313 if (kvm_is_error_hva(addr))
1393 return -EFAULT; 1314 return -EFAULT;
1394 r = kvm_read_hva(data, (void __user *)addr + offset, len); 1315 r = __copy_from_user(data, (void __user *)addr + offset, len);
1395 if (r) 1316 if (r)
1396 return -EFAULT; 1317 return -EFAULT;
1397 return 0; 1318 return 0;
@@ -1426,11 +1347,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1426 gfn_t gfn = gpa >> PAGE_SHIFT; 1347 gfn_t gfn = gpa >> PAGE_SHIFT;
1427 int offset = offset_in_page(gpa); 1348 int offset = offset_in_page(gpa);
1428 1349
1429 addr = gfn_to_hva_read(kvm, gfn); 1350 addr = gfn_to_hva(kvm, gfn);
1430 if (kvm_is_error_hva(addr)) 1351 if (kvm_is_error_hva(addr))
1431 return -EFAULT; 1352 return -EFAULT;
1432 pagefault_disable(); 1353 pagefault_disable();
1433 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); 1354 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1434 pagefault_enable(); 1355 pagefault_enable();
1435 if (r) 1356 if (r)
1436 return -EFAULT; 1357 return -EFAULT;
@@ -1484,7 +1405,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1484 1405
1485 ghc->gpa = gpa; 1406 ghc->gpa = gpa;
1486 ghc->generation = slots->generation; 1407 ghc->generation = slots->generation;
1487 ghc->memslot = gfn_to_memslot(kvm, gfn); 1408 ghc->memslot = __gfn_to_memslot(slots, gfn);
1488 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1409 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1489 if (!kvm_is_error_hva(ghc->hva)) 1410 if (!kvm_is_error_hva(ghc->hva))
1490 ghc->hva += offset; 1411 ghc->hva += offset;
@@ -1568,7 +1489,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1568 if (memslot && memslot->dirty_bitmap) { 1489 if (memslot && memslot->dirty_bitmap) {
1569 unsigned long rel_gfn = gfn - memslot->base_gfn; 1490 unsigned long rel_gfn = gfn - memslot->base_gfn;
1570 1491
1571 set_bit_le(rel_gfn, memslot->dirty_bitmap); 1492 __set_bit_le(rel_gfn, memslot->dirty_bitmap);
1572 } 1493 }
1573} 1494}
1574 1495
@@ -1605,30 +1526,6 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1605 finish_wait(&vcpu->wq, &wait); 1526 finish_wait(&vcpu->wq, &wait);
1606} 1527}
1607 1528
1608#ifndef CONFIG_S390
1609/*
1610 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
1611 */
1612void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
1613{
1614 int me;
1615 int cpu = vcpu->cpu;
1616 wait_queue_head_t *wqp;
1617
1618 wqp = kvm_arch_vcpu_wq(vcpu);
1619 if (waitqueue_active(wqp)) {
1620 wake_up_interruptible(wqp);
1621 ++vcpu->stat.halt_wakeup;
1622 }
1623
1624 me = get_cpu();
1625 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
1626 if (kvm_arch_vcpu_should_kick(vcpu))
1627 smp_send_reschedule(cpu);
1628 put_cpu();
1629}
1630#endif /* !CONFIG_S390 */
1631
1632void kvm_resched(struct kvm_vcpu *vcpu) 1529void kvm_resched(struct kvm_vcpu *vcpu)
1633{ 1530{
1634 if (!need_resched()) 1531 if (!need_resched())
@@ -1637,68 +1534,6 @@ void kvm_resched(struct kvm_vcpu *vcpu)
1637} 1534}
1638EXPORT_SYMBOL_GPL(kvm_resched); 1535EXPORT_SYMBOL_GPL(kvm_resched);
1639 1536
1640bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1641{
1642 struct pid *pid;
1643 struct task_struct *task = NULL;
1644
1645 rcu_read_lock();
1646 pid = rcu_dereference(target->pid);
1647 if (pid)
1648 task = get_pid_task(target->pid, PIDTYPE_PID);
1649 rcu_read_unlock();
1650 if (!task)
1651 return false;
1652 if (task->flags & PF_VCPU) {
1653 put_task_struct(task);
1654 return false;
1655 }
1656 if (yield_to(task, 1)) {
1657 put_task_struct(task);
1658 return true;
1659 }
1660 put_task_struct(task);
1661 return false;
1662}
1663EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1664
1665#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1666/*
1667 * Helper that checks whether a VCPU is eligible for directed yield.
1668 * Most eligible candidate to yield is decided by following heuristics:
1669 *
1670 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
1671 * (preempted lock holder), indicated by @in_spin_loop.
1672 * Set at the beiginning and cleared at the end of interception/PLE handler.
1673 *
1674 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
1675 * chance last time (mostly it has become eligible now since we have probably
1676 * yielded to lockholder in last iteration. This is done by toggling
1677 * @dy_eligible each time a VCPU checked for eligibility.)
1678 *
1679 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
1680 * to preempted lock-holder could result in wrong VCPU selection and CPU
1681 * burning. Giving priority for a potential lock-holder increases lock
1682 * progress.
1683 *
1684 * Since algorithm is based on heuristics, accessing another VCPU data without
1685 * locking does not harm. It may result in trying to yield to same VCPU, fail
1686 * and continue with next VCPU and so on.
1687 */
1688bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1689{
1690 bool eligible;
1691
1692 eligible = !vcpu->spin_loop.in_spin_loop ||
1693 (vcpu->spin_loop.in_spin_loop &&
1694 vcpu->spin_loop.dy_eligible);
1695
1696 if (vcpu->spin_loop.in_spin_loop)
1697 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1698
1699 return eligible;
1700}
1701#endif
1702void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1537void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1703{ 1538{
1704 struct kvm *kvm = me->kvm; 1539 struct kvm *kvm = me->kvm;
@@ -1708,7 +1543,6 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1708 int pass; 1543 int pass;
1709 int i; 1544 int i;
1710 1545
1711 kvm_vcpu_set_in_spin_loop(me, true);
1712 /* 1546 /*
1713 * We boost the priority of a VCPU that is runnable but not 1547 * We boost the priority of a VCPU that is runnable but not
1714 * currently running, because it got preempted by something 1548 * currently running, because it got preempted by something
@@ -1718,7 +1552,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1718 */ 1552 */
1719 for (pass = 0; pass < 2 && !yielded; pass++) { 1553 for (pass = 0; pass < 2 && !yielded; pass++) {
1720 kvm_for_each_vcpu(i, vcpu, kvm) { 1554 kvm_for_each_vcpu(i, vcpu, kvm) {
1721 if (!pass && i <= last_boosted_vcpu) { 1555 struct task_struct *task = NULL;
1556 struct pid *pid;
1557 if (!pass && i < last_boosted_vcpu) {
1722 i = last_boosted_vcpu; 1558 i = last_boosted_vcpu;
1723 continue; 1559 continue;
1724 } else if (pass && i > last_boosted_vcpu) 1560 } else if (pass && i > last_boosted_vcpu)
@@ -1727,19 +1563,26 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1727 continue; 1563 continue;
1728 if (waitqueue_active(&vcpu->wq)) 1564 if (waitqueue_active(&vcpu->wq))
1729 continue; 1565 continue;
1730 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1566 rcu_read_lock();
1567 pid = rcu_dereference(vcpu->pid);
1568 if (pid)
1569 task = get_pid_task(vcpu->pid, PIDTYPE_PID);
1570 rcu_read_unlock();
1571 if (!task)
1731 continue; 1572 continue;
1732 if (kvm_vcpu_yield_to(vcpu)) { 1573 if (task->flags & PF_VCPU) {
1574 put_task_struct(task);
1575 continue;
1576 }
1577 if (yield_to(task, 1)) {
1578 put_task_struct(task);
1733 kvm->last_boosted_vcpu = i; 1579 kvm->last_boosted_vcpu = i;
1734 yielded = 1; 1580 yielded = 1;
1735 break; 1581 break;
1736 } 1582 }
1583 put_task_struct(task);
1737 } 1584 }
1738 } 1585 }
1739 kvm_vcpu_set_in_spin_loop(me, false);
1740
1741 /* Ensure vcpu is not eligible during next spinloop */
1742 kvm_vcpu_set_dy_eligible(me, false);
1743} 1586}
1744EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1587EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1745 1588
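
The kvm_vcpu_eligible_for_directed_yield() helper, whose definition and caller are removed in the hunks above (left column), skips a PLE-spinning vcpu on alternate eligibility checks so that a probable lock holder gets priority. The heuristic itself can be modelled in isolation as standalone C with simplified field names:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_model {
	bool in_spin_loop;	/* set while the vcpu is handling a pause-loop exit */
	bool dy_eligible;	/* flipped on every eligibility check */
};

static bool eligible_for_yield(struct vcpu_model *v)
{
	bool eligible = !v->in_spin_loop || v->dy_eligible;

	if (v->in_spin_loop)
		v->dy_eligible = !v->dy_eligible;	/* give it a turn next time round */

	return eligible;
}

int main(void)
{
	struct vcpu_model v = { .in_spin_loop = true, .dy_eligible = false };

	for (int i = 0; i < 4; i++)
		printf("check %d: %s\n", i, eligible_for_yield(&v) ? "yield to it" : "skip");
	return 0;
}
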
@@ -1759,7 +1602,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1759 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1602 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1760#endif 1603#endif
1761 else 1604 else
1762 return kvm_arch_vcpu_fault(vcpu, vmf); 1605 return VM_FAULT_SIGBUS;
1763 get_page(page); 1606 get_page(page);
1764 vmf->page = page; 1607 vmf->page = page;
1765 return 0; 1608 return 0;
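
The fault handler above is what backs userspace's mmap() of a vcpu fd with the kvm_run page (and, where configured, the coalesced-MMIO ring). A hedged userspace sketch of that mapping, assuming kvmfd and vcpufd were obtained from /dev/kvm and KVM_CREATE_VCPU, with a hypothetical map_vcpu_run() helper:

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Map the shared kvm_run area of a vcpu fd; the size comes from the /dev/kvm fd. */
static struct kvm_run *map_vcpu_run(int kvmfd, int vcpufd)
{
	long size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, 0);
	void *p;

	if (size < 0)
		return NULL;

	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
	return p == MAP_FAILED ? NULL : (struct kvm_run *)p;
}
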
@@ -1820,10 +1663,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1820 goto vcpu_destroy; 1663 goto vcpu_destroy;
1821 1664
1822 mutex_lock(&kvm->lock); 1665 mutex_lock(&kvm->lock);
1823 if (!kvm_vcpu_compatible(vcpu)) {
1824 r = -EINVAL;
1825 goto unlock_vcpu_destroy;
1826 }
1827 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1666 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1828 r = -EINVAL; 1667 r = -EINVAL;
1829 goto unlock_vcpu_destroy; 1668 goto unlock_vcpu_destroy;
@@ -1849,8 +1688,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1849 smp_wmb(); 1688 smp_wmb();
1850 atomic_inc(&kvm->online_vcpus); 1689 atomic_inc(&kvm->online_vcpus);
1851 1690
1691#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1692 if (kvm->bsp_vcpu_id == id)
1693 kvm->bsp_vcpu = vcpu;
1694#endif
1852 mutex_unlock(&kvm->lock); 1695 mutex_unlock(&kvm->lock);
1853 kvm_arch_vcpu_postcreate(vcpu);
1854 return r; 1696 return r;
1855 1697
1856unlock_vcpu_destroy: 1698unlock_vcpu_destroy:
@@ -1893,9 +1735,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
1893#endif 1735#endif
1894 1736
1895 1737
1896 r = vcpu_load(vcpu); 1738 vcpu_load(vcpu);
1897 if (r)
1898 return r;
1899 switch (ioctl) { 1739 switch (ioctl) {
1900 case KVM_RUN: 1740 case KVM_RUN:
1901 r = -EINVAL; 1741 r = -EINVAL;
@@ -1926,12 +1766,17 @@ out_free1:
1926 struct kvm_regs *kvm_regs; 1766 struct kvm_regs *kvm_regs;
1927 1767
1928 r = -ENOMEM; 1768 r = -ENOMEM;
1929 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 1769 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1930 if (IS_ERR(kvm_regs)) { 1770 if (!kvm_regs)
1931 r = PTR_ERR(kvm_regs);
1932 goto out; 1771 goto out;
1933 } 1772 r = -EFAULT;
1773 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1774 goto out_free2;
1934 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1775 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1776 if (r)
1777 goto out_free2;
1778 r = 0;
1779out_free2:
1935 kfree(kvm_regs); 1780 kfree(kvm_regs);
1936 break; 1781 break;
1937 } 1782 }
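
The left-hand side of the hunk above uses memdup_user(), which folds the kzalloc()-plus-copy_from_user() sequence on the right into a single call that reports failure through ERR_PTR(). A kernel-side sketch of that pattern only, not a standalone program; handle_set_regs() is a hypothetical wrapper, not a function in this file:

/* Kernel-side sketch of the memdup_user() pattern. */
static int handle_set_regs(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm_regs *kvm_regs;
	int r;

	/* memdup_user() = allocate + copy_from_user(), ERR_PTR on failure. */
	kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
	if (IS_ERR(kvm_regs))
		return PTR_ERR(kvm_regs);

	r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
	kfree(kvm_regs);
	return r;
}
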
@@ -1950,13 +1795,17 @@ out_free1:
1950 break; 1795 break;
1951 } 1796 }
1952 case KVM_SET_SREGS: { 1797 case KVM_SET_SREGS: {
1953 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 1798 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1954 if (IS_ERR(kvm_sregs)) { 1799 r = -ENOMEM;
1955 r = PTR_ERR(kvm_sregs); 1800 if (!kvm_sregs)
1956 kvm_sregs = NULL; 1801 goto out;
1802 r = -EFAULT;
1803 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1957 goto out; 1804 goto out;
1958 }
1959 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1805 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1806 if (r)
1807 goto out;
1808 r = 0;
1960 break; 1809 break;
1961 } 1810 }
1962 case KVM_GET_MP_STATE: { 1811 case KVM_GET_MP_STATE: {
@@ -1978,6 +1827,9 @@ out_free1:
1978 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1827 if (copy_from_user(&mp_state, argp, sizeof mp_state))
1979 goto out; 1828 goto out;
1980 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1829 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1830 if (r)
1831 goto out;
1832 r = 0;
1981 break; 1833 break;
1982 } 1834 }
1983 case KVM_TRANSLATE: { 1835 case KVM_TRANSLATE: {
@@ -2002,6 +1854,9 @@ out_free1:
2002 if (copy_from_user(&dbg, argp, sizeof dbg)) 1854 if (copy_from_user(&dbg, argp, sizeof dbg))
2003 goto out; 1855 goto out;
2004 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1856 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1857 if (r)
1858 goto out;
1859 r = 0;
2005 break; 1860 break;
2006 } 1861 }
2007 case KVM_SET_SIGNAL_MASK: { 1862 case KVM_SET_SIGNAL_MASK: {
@@ -2042,13 +1897,17 @@ out_free1:
2042 break; 1897 break;
2043 } 1898 }
2044 case KVM_SET_FPU: { 1899 case KVM_SET_FPU: {
2045 fpu = memdup_user(argp, sizeof(*fpu)); 1900 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
2046 if (IS_ERR(fpu)) { 1901 r = -ENOMEM;
2047 r = PTR_ERR(fpu); 1902 if (!fpu)
2048 fpu = NULL; 1903 goto out;
1904 r = -EFAULT;
1905 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
2049 goto out; 1906 goto out;
2050 }
2051 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1907 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1908 if (r)
1909 goto out;
1910 r = 0;
2052 break; 1911 break;
2053 } 1912 }
2054 default: 1913 default:
@@ -2091,10 +1950,9 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
2091 if (copy_from_user(&csigset, sigmask_arg->sigset, 1950 if (copy_from_user(&csigset, sigmask_arg->sigset,
2092 sizeof csigset)) 1951 sizeof csigset))
2093 goto out; 1952 goto out;
2094 sigset_from_compat(&sigset, &csigset); 1953 }
2095 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 1954 sigset_from_compat(&sigset, &csigset);
2096 } else 1955 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2097 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
2098 break; 1956 break;
2099 } 1957 }
2100 default: 1958 default:
@@ -2118,6 +1976,8 @@ static long kvm_vm_ioctl(struct file *filp,
2118 switch (ioctl) { 1976 switch (ioctl) {
2119 case KVM_CREATE_VCPU: 1977 case KVM_CREATE_VCPU:
2120 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1978 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1979 if (r < 0)
1980 goto out;
2121 break; 1981 break;
2122 case KVM_SET_USER_MEMORY_REGION: { 1982 case KVM_SET_USER_MEMORY_REGION: {
2123 struct kvm_userspace_memory_region kvm_userspace_mem; 1983 struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -2128,6 +1988,8 @@ static long kvm_vm_ioctl(struct file *filp,
2128 goto out; 1988 goto out;
2129 1989
2130 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1990 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1991 if (r)
1992 goto out;
2131 break; 1993 break;
2132 } 1994 }
2133 case KVM_GET_DIRTY_LOG: { 1995 case KVM_GET_DIRTY_LOG: {
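
For reference, the KVM_SET_USER_MEMORY_REGION case above is driven from userspace roughly as in the following hedged sketch; add_memslot() and the slot 0 layout are illustrative assumptions, not part of this patch:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Map anonymous memory and register it as guest slot 0 (illustrative values). */
static int add_memslot(int vmfd, __u64 guest_phys, size_t size)
{
	void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region;

	if (host == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = 0;
	region.guest_phys_addr = guest_phys;
	region.memory_size = size;
	region.userspace_addr = (__u64)(unsigned long)host;

	/* The vm ioctl handler above copies this struct in and installs the slot. */
	return ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
}
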
@@ -2137,6 +1999,8 @@ static long kvm_vm_ioctl(struct file *filp,
2137 if (copy_from_user(&log, argp, sizeof log)) 1999 if (copy_from_user(&log, argp, sizeof log))
2138 goto out; 2000 goto out;
2139 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2001 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2002 if (r)
2003 goto out;
2140 break; 2004 break;
2141 } 2005 }
2142#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2006#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
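
KVM_GET_DIRTY_LOG, handled just above, expects userspace to supply a per-slot bitmap with one bit per 4 KiB guest page. A hedged sketch with a hypothetical fetch_dirty_log() helper and slot_bytes parameter:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch (and thereby reset) the dirty bitmap for memslot 0. */
static int fetch_dirty_log(int vmfd, size_t slot_bytes, uint64_t **bitmap)
{
	size_t pages = slot_bytes / 4096;
	struct kvm_dirty_log log;

	*bitmap = calloc((pages + 63) / 64, sizeof(uint64_t));
	if (!*bitmap)
		return -1;

	memset(&log, 0, sizeof(log));
	log.slot = 0;
	log.dirty_bitmap = *bitmap;

	return ioctl(vmfd, KVM_GET_DIRTY_LOG, &log);
}
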
@@ -2146,6 +2010,9 @@ static long kvm_vm_ioctl(struct file *filp,
2146 if (copy_from_user(&zone, argp, sizeof zone)) 2010 if (copy_from_user(&zone, argp, sizeof zone))
2147 goto out; 2011 goto out;
2148 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2012 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2013 if (r)
2014 goto out;
2015 r = 0;
2149 break; 2016 break;
2150 } 2017 }
2151 case KVM_UNREGISTER_COALESCED_MMIO: { 2018 case KVM_UNREGISTER_COALESCED_MMIO: {
@@ -2154,6 +2021,9 @@ static long kvm_vm_ioctl(struct file *filp,
2154 if (copy_from_user(&zone, argp, sizeof zone)) 2021 if (copy_from_user(&zone, argp, sizeof zone))
2155 goto out; 2022 goto out;
2156 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2023 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2024 if (r)
2025 goto out;
2026 r = 0;
2157 break; 2027 break;
2158 } 2028 }
2159#endif 2029#endif
@@ -2163,7 +2033,7 @@ static long kvm_vm_ioctl(struct file *filp,
2163 r = -EFAULT; 2033 r = -EFAULT;
2164 if (copy_from_user(&data, argp, sizeof data)) 2034 if (copy_from_user(&data, argp, sizeof data))
2165 goto out; 2035 goto out;
2166 r = kvm_irqfd(kvm, &data); 2036 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
2167 break; 2037 break;
2168 } 2038 }
2169 case KVM_IOEVENTFD: { 2039 case KVM_IOEVENTFD: {
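
Both sides of the KVM_IRQFD hunk consume the same userspace structure; only the in-kernel signature of kvm_irqfd() differs. From userspace the binding looks roughly like this sketch, where attach_irqfd() is a hypothetical helper:

#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Bind an eventfd to guest GSI `gsi`: signalling the eventfd injects the IRQ. */
static int attach_irqfd(int vmfd, unsigned int gsi)
{
	int efd = eventfd(0, 0);
	struct kvm_irqfd irqfd;

	if (efd < 0)
		return -1;

	memset(&irqfd, 0, sizeof(irqfd));
	irqfd.fd = efd;
	irqfd.gsi = gsi;
	/* irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN would detach it again. */

	return ioctl(vmfd, KVM_IRQFD, &irqfd);
}
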
@@ -2186,40 +2056,6 @@ static long kvm_vm_ioctl(struct file *filp,
2186 mutex_unlock(&kvm->lock); 2056 mutex_unlock(&kvm->lock);
2187 break; 2057 break;
2188#endif 2058#endif
2189#ifdef CONFIG_HAVE_KVM_MSI
2190 case KVM_SIGNAL_MSI: {
2191 struct kvm_msi msi;
2192
2193 r = -EFAULT;
2194 if (copy_from_user(&msi, argp, sizeof msi))
2195 goto out;
2196 r = kvm_send_userspace_msi(kvm, &msi);
2197 break;
2198 }
2199#endif
2200#ifdef __KVM_HAVE_IRQ_LINE
2201 case KVM_IRQ_LINE_STATUS:
2202 case KVM_IRQ_LINE: {
2203 struct kvm_irq_level irq_event;
2204
2205 r = -EFAULT;
2206 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2207 goto out;
2208
2209 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2210 if (r)
2211 goto out;
2212
2213 r = -EFAULT;
2214 if (ioctl == KVM_IRQ_LINE_STATUS) {
2215 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2216 goto out;
2217 }
2218
2219 r = 0;
2220 break;
2221 }
2222#endif
2223 default: 2059 default:
2224 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2060 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2225 if (r == -ENOTTY) 2061 if (r == -ENOTTY)
@@ -2262,6 +2098,8 @@ static long kvm_vm_compat_ioctl(struct file *filp,
2262 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2098 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2263 2099
2264 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2100 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2101 if (r)
2102 goto out;
2265 break; 2103 break;
2266 } 2104 }
2267 default: 2105 default:
@@ -2314,12 +2152,12 @@ static struct file_operations kvm_vm_fops = {
2314 .llseek = noop_llseek, 2152 .llseek = noop_llseek,
2315}; 2153};
2316 2154
2317static int kvm_dev_ioctl_create_vm(unsigned long type) 2155static int kvm_dev_ioctl_create_vm(void)
2318{ 2156{
2319 int r; 2157 int r;
2320 struct kvm *kvm; 2158 struct kvm *kvm;
2321 2159
2322 kvm = kvm_create_vm(type); 2160 kvm = kvm_create_vm();
2323 if (IS_ERR(kvm)) 2161 if (IS_ERR(kvm))
2324 return PTR_ERR(kvm); 2162 return PTR_ERR(kvm);
2325#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2163#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2346,11 +2184,8 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
2346 case KVM_CAP_SET_BOOT_CPU_ID: 2184 case KVM_CAP_SET_BOOT_CPU_ID:
2347#endif 2185#endif
2348 case KVM_CAP_INTERNAL_ERROR_DATA: 2186 case KVM_CAP_INTERNAL_ERROR_DATA:
2349#ifdef CONFIG_HAVE_KVM_MSI
2350 case KVM_CAP_SIGNAL_MSI:
2351#endif
2352 return 1; 2187 return 1;
2353#ifdef KVM_CAP_IRQ_ROUTING 2188#ifdef CONFIG_HAVE_KVM_IRQCHIP
2354 case KVM_CAP_IRQ_ROUTING: 2189 case KVM_CAP_IRQ_ROUTING:
2355 return KVM_MAX_IRQ_ROUTES; 2190 return KVM_MAX_IRQ_ROUTES;
2356#endif 2191#endif
@@ -2373,7 +2208,10 @@ static long kvm_dev_ioctl(struct file *filp,
2373 r = KVM_API_VERSION; 2208 r = KVM_API_VERSION;
2374 break; 2209 break;
2375 case KVM_CREATE_VM: 2210 case KVM_CREATE_VM:
2376 r = kvm_dev_ioctl_create_vm(arg); 2211 r = -EINVAL;
2212 if (arg)
2213 goto out;
2214 r = kvm_dev_ioctl_create_vm();
2377 break; 2215 break;
2378 case KVM_CHECK_EXTENSION: 2216 case KVM_CHECK_EXTENSION:
2379 r = kvm_dev_ioctl_check_extension_generic(arg); 2217 r = kvm_dev_ioctl_check_extension_generic(arg);
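
The right-hand side of the KVM_CREATE_VM hunk reinstates the check that the ioctl takes no argument (the removed variant forwarded arg as a VM type). A minimal userspace sketch of the call sequence, with error handling reduced to early returns:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvmfd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vmfd, version;

	if (kvmfd < 0)
		return 1;

	version = ioctl(kvmfd, KVM_GET_API_VERSION, 0);
	if (version != KVM_API_VERSION)
		return 1;

	/* With this change the argument must be 0; a nonzero VM "type" is rejected. */
	vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
	if (vmfd < 0)
		return 1;

	printf("created VM fd %d\n", vmfd);
	return 0;
}
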
@@ -2553,89 +2391,24 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2553 int i; 2391 int i;
2554 2392
2555 for (i = 0; i < bus->dev_count; i++) { 2393 for (i = 0; i < bus->dev_count; i++) {
2556 struct kvm_io_device *pos = bus->range[i].dev; 2394 struct kvm_io_device *pos = bus->devs[i];
2557 2395
2558 kvm_iodevice_destructor(pos); 2396 kvm_iodevice_destructor(pos);
2559 } 2397 }
2560 kfree(bus); 2398 kfree(bus);
2561} 2399}
2562 2400
2563int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2564{
2565 const struct kvm_io_range *r1 = p1;
2566 const struct kvm_io_range *r2 = p2;
2567
2568 if (r1->addr < r2->addr)
2569 return -1;
2570 if (r1->addr + r1->len > r2->addr + r2->len)
2571 return 1;
2572 return 0;
2573}
2574
2575int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2576 gpa_t addr, int len)
2577{
2578 bus->range[bus->dev_count++] = (struct kvm_io_range) {
2579 .addr = addr,
2580 .len = len,
2581 .dev = dev,
2582 };
2583
2584 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
2585 kvm_io_bus_sort_cmp, NULL);
2586
2587 return 0;
2588}
2589
2590int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2591 gpa_t addr, int len)
2592{
2593 struct kvm_io_range *range, key;
2594 int off;
2595
2596 key = (struct kvm_io_range) {
2597 .addr = addr,
2598 .len = len,
2599 };
2600
2601 range = bsearch(&key, bus->range, bus->dev_count,
2602 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
2603 if (range == NULL)
2604 return -ENOENT;
2605
2606 off = range - bus->range;
2607
2608 while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
2609 off--;
2610
2611 return off;
2612}
2613
2614/* kvm_io_bus_write - called under kvm->slots_lock */ 2401/* kvm_io_bus_write - called under kvm->slots_lock */
2615int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2402int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2616 int len, const void *val) 2403 int len, const void *val)
2617{ 2404{
2618 int idx; 2405 int i;
2619 struct kvm_io_bus *bus; 2406 struct kvm_io_bus *bus;
2620 struct kvm_io_range range;
2621
2622 range = (struct kvm_io_range) {
2623 .addr = addr,
2624 .len = len,
2625 };
2626 2407
2627 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2408 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2628 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2409 for (i = 0; i < bus->dev_count; i++)
2629 if (idx < 0) 2410 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2630 return -EOPNOTSUPP;
2631
2632 while (idx < bus->dev_count &&
2633 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2634 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2635 return 0; 2411 return 0;
2636 idx++;
2637 }
2638
2639 return -EOPNOTSUPP; 2412 return -EOPNOTSUPP;
2640} 2413}
2641 2414
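
The functions removed above keep the bus ranges sorted on insert and locate a device with bsearch(); the replacement falls back to a linear scan over devs[]. A small standalone sketch of that sorted-range lookup, using simplified, hypothetical io_range types:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the removed kvm_io_range machinery. */
struct io_range { unsigned long addr; int len; void *dev; };

/* Same ordering as kvm_io_bus_sort_cmp(): a key contained in a range compares equal. */
static int range_cmp(const void *p1, const void *p2)
{
	const struct io_range *r1 = p1, *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

int main(void)
{
	struct io_range bus[3] = {
		{ 0x3f8, 8, NULL }, { 0x60, 1, NULL }, { 0xcf8, 4, NULL }
	};
	struct io_range key = { 0x60, 1, NULL };
	struct io_range *hit;

	/* Sort on insert, then bsearch() on lookup, mirroring
	 * kvm_io_bus_insert_dev() / kvm_io_bus_get_first_dev(). */
	qsort(bus, 3, sizeof(bus[0]), range_cmp);
	hit = bsearch(&key, bus, 3, sizeof(bus[0]), range_cmp);
	printf("found range at 0x%lx\n", hit ? hit->addr : 0UL);
	return 0;
}
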
@@ -2643,47 +2416,31 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2643int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2416int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2644 int len, void *val) 2417 int len, void *val)
2645{ 2418{
2646 int idx; 2419 int i;
2647 struct kvm_io_bus *bus; 2420 struct kvm_io_bus *bus;
2648 struct kvm_io_range range;
2649
2650 range = (struct kvm_io_range) {
2651 .addr = addr,
2652 .len = len,
2653 };
2654 2421
2655 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2422 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2656 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2423 for (i = 0; i < bus->dev_count; i++)
2657 if (idx < 0) 2424 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2658 return -EOPNOTSUPP;
2659
2660 while (idx < bus->dev_count &&
2661 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2662 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2663 return 0; 2425 return 0;
2664 idx++;
2665 }
2666
2667 return -EOPNOTSUPP; 2426 return -EOPNOTSUPP;
2668} 2427}
2669 2428
2670/* Caller must hold slots_lock. */ 2429/* Caller must hold slots_lock. */
2671int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2430int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2672 int len, struct kvm_io_device *dev) 2431 struct kvm_io_device *dev)
2673{ 2432{
2674 struct kvm_io_bus *new_bus, *bus; 2433 struct kvm_io_bus *new_bus, *bus;
2675 2434
2676 bus = kvm->buses[bus_idx]; 2435 bus = kvm->buses[bus_idx];
2677 if (bus->dev_count > NR_IOBUS_DEVS - 1) 2436 if (bus->dev_count > NR_IOBUS_DEVS-1)
2678 return -ENOSPC; 2437 return -ENOSPC;
2679 2438
2680 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 2439 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2681 sizeof(struct kvm_io_range)), GFP_KERNEL);
2682 if (!new_bus) 2440 if (!new_bus)
2683 return -ENOMEM; 2441 return -ENOMEM;
2684 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 2442 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2685 sizeof(struct kvm_io_range))); 2443 new_bus->devs[new_bus->dev_count++] = dev;
2686 kvm_io_bus_insert_dev(new_bus, dev, addr, len);
2687 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2444 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2688 synchronize_srcu_expedited(&kvm->srcu); 2445 synchronize_srcu_expedited(&kvm->srcu);
2689 kfree(bus); 2446 kfree(bus);
@@ -2698,26 +2455,25 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2698 int i, r; 2455 int i, r;
2699 struct kvm_io_bus *new_bus, *bus; 2456 struct kvm_io_bus *new_bus, *bus;
2700 2457
2458 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2459 if (!new_bus)
2460 return -ENOMEM;
2461
2701 bus = kvm->buses[bus_idx]; 2462 bus = kvm->buses[bus_idx];
2463 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2464
2702 r = -ENOENT; 2465 r = -ENOENT;
2703 for (i = 0; i < bus->dev_count; i++) 2466 for (i = 0; i < new_bus->dev_count; i++)
2704 if (bus->range[i].dev == dev) { 2467 if (new_bus->devs[i] == dev) {
2705 r = 0; 2468 r = 0;
2469 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
2706 break; 2470 break;
2707 } 2471 }
2708 2472
2709 if (r) 2473 if (r) {
2474 kfree(new_bus);
2710 return r; 2475 return r;
2711 2476 }
2712 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
2713 sizeof(struct kvm_io_range)), GFP_KERNEL);
2714 if (!new_bus)
2715 return -ENOMEM;
2716
2717 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
2718 new_bus->dev_count--;
2719 memcpy(new_bus->range + i, bus->range + i + 1,
2720 (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
2721 2477
2722 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2478 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2723 synchronize_srcu_expedited(&kvm->srcu); 2479 synchronize_srcu_expedited(&kvm->srcu);
@@ -2768,29 +2524,15 @@ static const struct file_operations *stat_fops[] = {
2768 [KVM_STAT_VM] = &vm_stat_fops, 2524 [KVM_STAT_VM] = &vm_stat_fops,
2769}; 2525};
2770 2526
2771static int kvm_init_debug(void) 2527static void kvm_init_debug(void)
2772{ 2528{
2773 int r = -EFAULT;
2774 struct kvm_stats_debugfs_item *p; 2529 struct kvm_stats_debugfs_item *p;
2775 2530
2776 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2531 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2777 if (kvm_debugfs_dir == NULL) 2532 for (p = debugfs_entries; p->name; ++p)
2778 goto out;
2779
2780 for (p = debugfs_entries; p->name; ++p) {
2781 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2533 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2782 (void *)(long)p->offset, 2534 (void *)(long)p->offset,
2783 stat_fops[p->kind]); 2535 stat_fops[p->kind]);
2784 if (p->dentry == NULL)
2785 goto out_dir;
2786 }
2787
2788 return 0;
2789
2790out_dir:
2791 debugfs_remove_recursive(kvm_debugfs_dir);
2792out:
2793 return r;
2794} 2536}
2795 2537
2796static void kvm_exit_debug(void) 2538static void kvm_exit_debug(void)
@@ -2822,6 +2564,9 @@ static struct syscore_ops kvm_syscore_ops = {
2822 .resume = kvm_resume, 2564 .resume = kvm_resume,
2823}; 2565};
2824 2566
2567struct page *bad_page;
2568pfn_t bad_pfn;
2569
2825static inline 2570static inline
2826struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2571struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2827{ 2572{
@@ -2853,6 +2598,33 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2853 if (r) 2598 if (r)
2854 goto out_fail; 2599 goto out_fail;
2855 2600
2601 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2602
2603 if (bad_page == NULL) {
2604 r = -ENOMEM;
2605 goto out;
2606 }
2607
2608 bad_pfn = page_to_pfn(bad_page);
2609
2610 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2611
2612 if (hwpoison_page == NULL) {
2613 r = -ENOMEM;
2614 goto out_free_0;
2615 }
2616
2617 hwpoison_pfn = page_to_pfn(hwpoison_page);
2618
2619 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2620
2621 if (fault_page == NULL) {
2622 r = -ENOMEM;
2623 goto out_free_0;
2624 }
2625
2626 fault_pfn = page_to_pfn(fault_page);
2627
2856 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2628 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2857 r = -ENOMEM; 2629 r = -ENOMEM;
2858 goto out_free_0; 2630 goto out_free_0;
@@ -2904,16 +2676,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2904 kvm_preempt_ops.sched_in = kvm_sched_in; 2676 kvm_preempt_ops.sched_in = kvm_sched_in;
2905 kvm_preempt_ops.sched_out = kvm_sched_out; 2677 kvm_preempt_ops.sched_out = kvm_sched_out;
2906 2678
2907 r = kvm_init_debug(); 2679 kvm_init_debug();
2908 if (r) {
2909 printk(KERN_ERR "kvm: create debugfs files failed\n");
2910 goto out_undebugfs;
2911 }
2912 2680
2913 return 0; 2681 return 0;
2914 2682
2915out_undebugfs:
2916 unregister_syscore_ops(&kvm_syscore_ops);
2917out_unreg: 2683out_unreg:
2918 kvm_async_pf_deinit(); 2684 kvm_async_pf_deinit();
2919out_free: 2685out_free:
@@ -2927,6 +2693,12 @@ out_free_1:
2927out_free_0a: 2693out_free_0a:
2928 free_cpumask_var(cpus_hardware_enabled); 2694 free_cpumask_var(cpus_hardware_enabled);
2929out_free_0: 2695out_free_0:
2696 if (fault_page)
2697 __free_page(fault_page);
2698 if (hwpoison_page)
2699 __free_page(hwpoison_page);
2700 __free_page(bad_page);
2701out:
2930 kvm_arch_exit(); 2702 kvm_arch_exit();
2931out_fail: 2703out_fail:
2932 return r; 2704 return r;
@@ -2946,5 +2718,7 @@ void kvm_exit(void)
2946 kvm_arch_hardware_unsetup(); 2718 kvm_arch_hardware_unsetup();
2947 kvm_arch_exit(); 2719 kvm_arch_exit();
2948 free_cpumask_var(cpus_hardware_enabled); 2720 free_cpumask_var(cpus_hardware_enabled);
2721 __free_page(hwpoison_page);
2722 __free_page(bad_page);
2949} 2723}
2950EXPORT_SYMBOL_GPL(kvm_exit); 2724EXPORT_SYMBOL_GPL(kvm_exit);