Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig           |  14
-rw-r--r--  virt/kvm/coalesced_mmio.c  |  74
-rw-r--r--  virt/kvm/coalesced_mmio.h  |   1
-rw-r--r--  virt/kvm/eventfd.c         | 578
-rw-r--r--  virt/kvm/ioapic.c          |  78
-rw-r--r--  virt/kvm/iodev.h           |  55
-rw-r--r--  virt/kvm/irq_comm.c        |  51
-rw-r--r--  virt/kvm/kvm_main.c        | 298
-rw-r--r--  virt/kvm/kvm_trace.c       | 285
9 files changed, 963 insertions, 471 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
new file mode 100644
index 000000000000..daece36c0a57
--- /dev/null
+++ b/virt/kvm/Kconfig
@@ -0,0 +1,14 @@
1# KVM common configuration items and defaults
2
3config HAVE_KVM
4 bool
5
6config HAVE_KVM_IRQCHIP
7 bool
8
9config HAVE_KVM_EVENTFD
10 bool
11 select EVENTFD
12
13config KVM_APIC_ARCHITECTURE
14 bool
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 5ae620d32fac..04d69cd7049b 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -14,32 +14,28 @@
14 14
15#include "coalesced_mmio.h" 15#include "coalesced_mmio.h"
16 16
17static int coalesced_mmio_in_range(struct kvm_io_device *this, 17static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
18 gpa_t addr, int len, int is_write) 18{
19 return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
20}
21
22static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
23 gpa_t addr, int len)
19{ 24{
20 struct kvm_coalesced_mmio_dev *dev =
21 (struct kvm_coalesced_mmio_dev*)this->private;
22 struct kvm_coalesced_mmio_zone *zone; 25 struct kvm_coalesced_mmio_zone *zone;
23 int next; 26 struct kvm_coalesced_mmio_ring *ring;
27 unsigned avail;
24 int i; 28 int i;
25 29
26 if (!is_write)
27 return 0;
28
29 /* kvm->lock is taken by the caller and must be not released before
30 * dev.read/write
31 */
32
33 /* Are we able to batch it ? */ 30 /* Are we able to batch it ? */
34 31
35 /* last is the first free entry 32 /* last is the first free entry
36 * check if we don't meet the first used entry 33 * check if we don't meet the first used entry
37 * there is always one unused entry in the buffer 34 * there is always one unused entry in the buffer
38 */ 35 */
39 36 ring = dev->kvm->coalesced_mmio_ring;
40 next = (dev->kvm->coalesced_mmio_ring->last + 1) % 37 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
41 KVM_COALESCED_MMIO_MAX; 38 if (avail < KVM_MAX_VCPUS) {
42 if (next == dev->kvm->coalesced_mmio_ring->first) {
43 /* full */ 39 /* full */
44 return 0; 40 return 0;
45 } 41 }
@@ -60,14 +56,15 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this,
60 return 0; 56 return 0;
61} 57}
62 58
63static void coalesced_mmio_write(struct kvm_io_device *this, 59static int coalesced_mmio_write(struct kvm_io_device *this,
64 gpa_t addr, int len, const void *val) 60 gpa_t addr, int len, const void *val)
65{ 61{
66 struct kvm_coalesced_mmio_dev *dev = 62 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
67 (struct kvm_coalesced_mmio_dev*)this->private;
68 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; 63 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
64 if (!coalesced_mmio_in_range(dev, addr, len))
65 return -EOPNOTSUPP;
69 66
70 /* kvm->lock must be taken by caller before call to in_range()*/ 67 spin_lock(&dev->lock);
71 68
72 /* copy data in first free entry of the ring */ 69 /* copy data in first free entry of the ring */
73 70
@@ -76,29 +73,40 @@ static void coalesced_mmio_write(struct kvm_io_device *this,
76 memcpy(ring->coalesced_mmio[ring->last].data, val, len); 73 memcpy(ring->coalesced_mmio[ring->last].data, val, len);
77 smp_wmb(); 74 smp_wmb();
78 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; 75 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
76 spin_unlock(&dev->lock);
77 return 0;
79} 78}
80 79
81static void coalesced_mmio_destructor(struct kvm_io_device *this) 80static void coalesced_mmio_destructor(struct kvm_io_device *this)
82{ 81{
83 kfree(this); 82 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
83
84 kfree(dev);
84} 85}
85 86
87static const struct kvm_io_device_ops coalesced_mmio_ops = {
88 .write = coalesced_mmio_write,
89 .destructor = coalesced_mmio_destructor,
90};
91
86int kvm_coalesced_mmio_init(struct kvm *kvm) 92int kvm_coalesced_mmio_init(struct kvm *kvm)
87{ 93{
88 struct kvm_coalesced_mmio_dev *dev; 94 struct kvm_coalesced_mmio_dev *dev;
95 int ret;
89 96
90 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); 97 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
91 if (!dev) 98 if (!dev)
92 return -ENOMEM; 99 return -ENOMEM;
93 dev->dev.write = coalesced_mmio_write; 100 spin_lock_init(&dev->lock);
94 dev->dev.in_range = coalesced_mmio_in_range; 101 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
95 dev->dev.destructor = coalesced_mmio_destructor;
96 dev->dev.private = dev;
97 dev->kvm = kvm; 102 dev->kvm = kvm;
98 kvm->coalesced_mmio_dev = dev; 103 kvm->coalesced_mmio_dev = dev;
99 kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev);
100 104
101 return 0; 105 ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
106 if (ret < 0)
107 kfree(dev);
108
109 return ret;
102} 110}
103 111
104int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, 112int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
@@ -109,16 +117,16 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
109 if (dev == NULL) 117 if (dev == NULL)
110 return -EINVAL; 118 return -EINVAL;
111 119
112 mutex_lock(&kvm->lock); 120 down_write(&kvm->slots_lock);
113 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { 121 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
114 mutex_unlock(&kvm->lock); 122 up_write(&kvm->slots_lock);
115 return -ENOBUFS; 123 return -ENOBUFS;
116 } 124 }
117 125
118 dev->zone[dev->nb_zones] = *zone; 126 dev->zone[dev->nb_zones] = *zone;
119 dev->nb_zones++; 127 dev->nb_zones++;
120 128
121 mutex_unlock(&kvm->lock); 129 up_write(&kvm->slots_lock);
122 return 0; 130 return 0;
123} 131}
124 132
@@ -132,7 +140,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
132 if (dev == NULL) 140 if (dev == NULL)
133 return -EINVAL; 141 return -EINVAL;
134 142
135 mutex_lock(&kvm->lock); 143 down_write(&kvm->slots_lock);
136 144
137 i = dev->nb_zones; 145 i = dev->nb_zones;
138 while(i) { 146 while(i) {
@@ -150,7 +158,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
150 i--; 158 i--;
151 } 159 }
152 160
153 mutex_unlock(&kvm->lock); 161 up_write(&kvm->slots_lock);
154 162
155 return 0; 163 return 0;
156} 164}
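
One detail of the new check is worth spelling out: the range/space test now runs before dev->lock is taken, so every vcpu may pass it concurrently; requiring KVM_MAX_VCPUS free entries, rather than just one, guarantees that even if they all proceed none of them overflows the ring. Below is a standalone sketch of the ring arithmetic only, with one slot always kept unused so that first == last unambiguously means empty; this is not the kernel code, just the invariant it relies on.

#include <assert.h>

/* Free slots in a ring of `size` entries: `first` is the next entry to be
 * consumed, `last` the next to be written, and one slot stays reserved. */
static unsigned int ring_free_slots(unsigned int first, unsigned int last,
                                    unsigned int size)
{
        return (first + size - last - 1) % size;
}

int main(void)
{
        assert(ring_free_slots(0, 0, 8) == 7);  /* empty: size - 1 usable  */
        assert(ring_free_slots(3, 2, 8) == 0);  /* last + 1 == first: full */
        assert(ring_free_slots(5, 1, 8) == 3);  /* wrap-around case        */
        return 0;
}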
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 5ac0ec628461..4b49f27fa31e 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,6 +12,7 @@
12struct kvm_coalesced_mmio_dev { 12struct kvm_coalesced_mmio_dev {
13 struct kvm_io_device dev; 13 struct kvm_io_device dev;
14 struct kvm *kvm; 14 struct kvm *kvm;
15 spinlock_t lock;
15 int nb_zones; 16 int nb_zones;
16 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; 17 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
17}; 18};
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 000000000000..bb4ebd89b9ff
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,578 @@
1/*
2 * kvm eventfd support - use eventfd objects to signal various KVM events
3 *
4 * Copyright 2009 Novell. All Rights Reserved.
5 *
6 * Author:
7 * Gregory Haskins <ghaskins@novell.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#include <linux/kvm_host.h>
24#include <linux/kvm.h>
25#include <linux/workqueue.h>
26#include <linux/syscalls.h>
27#include <linux/wait.h>
28#include <linux/poll.h>
29#include <linux/file.h>
30#include <linux/list.h>
31#include <linux/eventfd.h>
32#include <linux/kernel.h>
33
34#include "iodev.h"
35
36/*
37 * --------------------------------------------------------------------
38 * irqfd: Allows an fd to be used to inject an interrupt to the guest
39 *
40 * Credit goes to Avi Kivity for the original idea.
41 * --------------------------------------------------------------------
42 */
43
44struct _irqfd {
45 struct kvm *kvm;
46 struct eventfd_ctx *eventfd;
47 int gsi;
48 struct list_head list;
49 poll_table pt;
50 wait_queue_head_t *wqh;
51 wait_queue_t wait;
52 struct work_struct inject;
53 struct work_struct shutdown;
54};
55
56static struct workqueue_struct *irqfd_cleanup_wq;
57
58static void
59irqfd_inject(struct work_struct *work)
60{
61 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
62 struct kvm *kvm = irqfd->kvm;
63
64 mutex_lock(&kvm->irq_lock);
65 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
66 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
67 mutex_unlock(&kvm->irq_lock);
68}
69
70/*
71 * Race-free decouple logic (ordering is critical)
72 */
73static void
74irqfd_shutdown(struct work_struct *work)
75{
76 struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
77
78 /*
79 * Synchronize with the wait-queue and unhook ourselves to prevent
80 * further events.
81 */
82 remove_wait_queue(irqfd->wqh, &irqfd->wait);
83
84 /*
85 * We know no new events will be scheduled at this point, so block
86 * until all previously outstanding events have completed
87 */
88 flush_work(&irqfd->inject);
89
90 /*
91 * It is now safe to release the object's resources
92 */
93 eventfd_ctx_put(irqfd->eventfd);
94 kfree(irqfd);
95}
96
97
98/* assumes kvm->irqfds.lock is held */
99static bool
100irqfd_is_active(struct _irqfd *irqfd)
101{
102 return list_empty(&irqfd->list) ? false : true;
103}
104
105/*
106 * Mark the irqfd as inactive and schedule it for removal
107 *
108 * assumes kvm->irqfds.lock is held
109 */
110static void
111irqfd_deactivate(struct _irqfd *irqfd)
112{
113 BUG_ON(!irqfd_is_active(irqfd));
114
115 list_del_init(&irqfd->list);
116
117 queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
118}
119
120/*
121 * Called with wqh->lock held and interrupts disabled
122 */
123static int
124irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
125{
126 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
127 unsigned long flags = (unsigned long)key;
128
129 if (flags & POLLIN)
130 /* An event has been signaled, inject an interrupt */
131 schedule_work(&irqfd->inject);
132
133 if (flags & POLLHUP) {
134 /* The eventfd is closing, detach from KVM */
135 struct kvm *kvm = irqfd->kvm;
136 unsigned long flags;
137
138 spin_lock_irqsave(&kvm->irqfds.lock, flags);
139
140 /*
141 * We must check if someone deactivated the irqfd before
142 * we could acquire the irqfds.lock since the item is
143 * deactivated from the KVM side before it is unhooked from
144 * the wait-queue. If it is already deactivated, we can
145 * simply return knowing the other side will cleanup for us.
146 * We cannot race against the irqfd going away since the
147 * other side is required to acquire wqh->lock, which we hold
148 */
149 if (irqfd_is_active(irqfd))
150 irqfd_deactivate(irqfd);
151
152 spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
153 }
154
155 return 0;
156}
157
158static void
159irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
160 poll_table *pt)
161{
162 struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
163
164 irqfd->wqh = wqh;
165 add_wait_queue(wqh, &irqfd->wait);
166}
167
168static int
169kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
170{
171 struct _irqfd *irqfd;
172 struct file *file = NULL;
173 struct eventfd_ctx *eventfd = NULL;
174 int ret;
175 unsigned int events;
176
177 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
178 if (!irqfd)
179 return -ENOMEM;
180
181 irqfd->kvm = kvm;
182 irqfd->gsi = gsi;
183 INIT_LIST_HEAD(&irqfd->list);
184 INIT_WORK(&irqfd->inject, irqfd_inject);
185 INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
186
187 file = eventfd_fget(fd);
188 if (IS_ERR(file)) {
189 ret = PTR_ERR(file);
190 goto fail;
191 }
192
193 eventfd = eventfd_ctx_fileget(file);
194 if (IS_ERR(eventfd)) {
195 ret = PTR_ERR(eventfd);
196 goto fail;
197 }
198
199 irqfd->eventfd = eventfd;
200
201 /*
202 * Install our own custom wake-up handling so we are notified via
203 * a callback whenever someone signals the underlying eventfd
204 */
205 init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
206 init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
207
208 events = file->f_op->poll(file, &irqfd->pt);
209
210 spin_lock_irq(&kvm->irqfds.lock);
211 list_add_tail(&irqfd->list, &kvm->irqfds.items);
212 spin_unlock_irq(&kvm->irqfds.lock);
213
214 /*
215 * Check if there was an event already pending on the eventfd
216 * before we registered, and trigger it as if we didn't miss it.
217 */
218 if (events & POLLIN)
219 schedule_work(&irqfd->inject);
220
221 /*
222 * do not drop the file until the irqfd is fully initialized, otherwise
223 * we might race against the POLLHUP
224 */
225 fput(file);
226
227 return 0;
228
229fail:
230 if (eventfd && !IS_ERR(eventfd))
231 eventfd_ctx_put(eventfd);
232
233 if (!IS_ERR(file))
234 fput(file);
235
236 kfree(irqfd);
237 return ret;
238}
239
240void
241kvm_eventfd_init(struct kvm *kvm)
242{
243 spin_lock_init(&kvm->irqfds.lock);
244 INIT_LIST_HEAD(&kvm->irqfds.items);
245 INIT_LIST_HEAD(&kvm->ioeventfds);
246}
247
248/*
249 * shutdown any irqfd's that match fd+gsi
250 */
251static int
252kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
253{
254 struct _irqfd *irqfd, *tmp;
255 struct eventfd_ctx *eventfd;
256
257 eventfd = eventfd_ctx_fdget(fd);
258 if (IS_ERR(eventfd))
259 return PTR_ERR(eventfd);
260
261 spin_lock_irq(&kvm->irqfds.lock);
262
263 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
264 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
265 irqfd_deactivate(irqfd);
266 }
267
268 spin_unlock_irq(&kvm->irqfds.lock);
269 eventfd_ctx_put(eventfd);
270
271 /*
272 * Block until we know all outstanding shutdown jobs have completed
273 * so that we guarantee there will not be any more interrupts on this
274 * gsi once this deassign function returns.
275 */
276 flush_workqueue(irqfd_cleanup_wq);
277
278 return 0;
279}
280
281int
282kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
283{
284 if (flags & KVM_IRQFD_FLAG_DEASSIGN)
285 return kvm_irqfd_deassign(kvm, fd, gsi);
286
287 return kvm_irqfd_assign(kvm, fd, gsi);
288}
289
290/*
291 * This function is called as the kvm VM fd is being released. Shutdown all
292 * irqfds that still remain open
293 */
294void
295kvm_irqfd_release(struct kvm *kvm)
296{
297 struct _irqfd *irqfd, *tmp;
298
299 spin_lock_irq(&kvm->irqfds.lock);
300
301 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
302 irqfd_deactivate(irqfd);
303
304 spin_unlock_irq(&kvm->irqfds.lock);
305
306 /*
307 * Block until we know all outstanding shutdown jobs have completed
308 * since we do not take a kvm* reference.
309 */
310 flush_workqueue(irqfd_cleanup_wq);
311
312}
313
314/*
315 * create a host-wide workqueue for issuing deferred shutdown requests
316 * aggregated from all vm* instances. We need our own isolated single-thread
317 * queue to prevent deadlock against flushing the normal work-queue.
318 */
319static int __init irqfd_module_init(void)
320{
321 irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
322 if (!irqfd_cleanup_wq)
323 return -ENOMEM;
324
325 return 0;
326}
327
328static void __exit irqfd_module_exit(void)
329{
330 destroy_workqueue(irqfd_cleanup_wq);
331}
332
333module_init(irqfd_module_init);
334module_exit(irqfd_module_exit);
335
336/*
337 * --------------------------------------------------------------------
338 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
339 *
340 * userspace can register a PIO/MMIO address with an eventfd for receiving
341 * notification when the memory has been touched.
342 * --------------------------------------------------------------------
343 */
344
345struct _ioeventfd {
346 struct list_head list;
347 u64 addr;
348 int length;
349 struct eventfd_ctx *eventfd;
350 u64 datamatch;
351 struct kvm_io_device dev;
352 bool wildcard;
353};
354
355static inline struct _ioeventfd *
356to_ioeventfd(struct kvm_io_device *dev)
357{
358 return container_of(dev, struct _ioeventfd, dev);
359}
360
361static void
362ioeventfd_release(struct _ioeventfd *p)
363{
364 eventfd_ctx_put(p->eventfd);
365 list_del(&p->list);
366 kfree(p);
367}
368
369static bool
370ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
371{
372 u64 _val;
373
374 if (!(addr == p->addr && len == p->length))
375 /* address-range must be precise for a hit */
376 return false;
377
378 if (p->wildcard)
379 /* all else equal, wildcard is always a hit */
380 return true;
381
382 /* otherwise, we have to actually compare the data */
383
384 BUG_ON(!IS_ALIGNED((unsigned long)val, len));
385
386 switch (len) {
387 case 1:
388 _val = *(u8 *)val;
389 break;
390 case 2:
391 _val = *(u16 *)val;
392 break;
393 case 4:
394 _val = *(u32 *)val;
395 break;
396 case 8:
397 _val = *(u64 *)val;
398 break;
399 default:
400 return false;
401 }
402
403 return _val == p->datamatch ? true : false;
404}
405
406/* MMIO/PIO writes trigger an event if the addr/val match */
407static int
408ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
409 const void *val)
410{
411 struct _ioeventfd *p = to_ioeventfd(this);
412
413 if (!ioeventfd_in_range(p, addr, len, val))
414 return -EOPNOTSUPP;
415
416 eventfd_signal(p->eventfd, 1);
417 return 0;
418}
419
420/*
421 * This function is called as KVM is completely shutting down. We do not
422 * need to worry about locking just nuke anything we have as quickly as possible
423 */
424static void
425ioeventfd_destructor(struct kvm_io_device *this)
426{
427 struct _ioeventfd *p = to_ioeventfd(this);
428
429 ioeventfd_release(p);
430}
431
432static const struct kvm_io_device_ops ioeventfd_ops = {
433 .write = ioeventfd_write,
434 .destructor = ioeventfd_destructor,
435};
436
437/* assumes kvm->slots_lock held */
438static bool
439ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
440{
441 struct _ioeventfd *_p;
442
443 list_for_each_entry(_p, &kvm->ioeventfds, list)
444 if (_p->addr == p->addr && _p->length == p->length &&
445 (_p->wildcard || p->wildcard ||
446 _p->datamatch == p->datamatch))
447 return true;
448
449 return false;
450}
451
452static int
453kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
454{
455 int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
456 struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
457 struct _ioeventfd *p;
458 struct eventfd_ctx *eventfd;
459 int ret;
460
461 /* must be natural-word sized */
462 switch (args->len) {
463 case 1:
464 case 2:
465 case 4:
466 case 8:
467 break;
468 default:
469 return -EINVAL;
470 }
471
472 /* check for range overflow */
473 if (args->addr + args->len < args->addr)
474 return -EINVAL;
475
476 /* check for extra flags that we don't understand */
477 if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
478 return -EINVAL;
479
480 eventfd = eventfd_ctx_fdget(args->fd);
481 if (IS_ERR(eventfd))
482 return PTR_ERR(eventfd);
483
484 p = kzalloc(sizeof(*p), GFP_KERNEL);
485 if (!p) {
486 ret = -ENOMEM;
487 goto fail;
488 }
489
490 INIT_LIST_HEAD(&p->list);
491 p->addr = args->addr;
492 p->length = args->len;
493 p->eventfd = eventfd;
494
495 /* The datamatch feature is optional, otherwise this is a wildcard */
496 if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
497 p->datamatch = args->datamatch;
498 else
499 p->wildcard = true;
500
501 down_write(&kvm->slots_lock);
502
503 /* Verify that there isnt a match already */
504 if (ioeventfd_check_collision(kvm, p)) {
505 ret = -EEXIST;
506 goto unlock_fail;
507 }
508
509 kvm_iodevice_init(&p->dev, &ioeventfd_ops);
510
511 ret = __kvm_io_bus_register_dev(bus, &p->dev);
512 if (ret < 0)
513 goto unlock_fail;
514
515 list_add_tail(&p->list, &kvm->ioeventfds);
516
517 up_write(&kvm->slots_lock);
518
519 return 0;
520
521unlock_fail:
522 up_write(&kvm->slots_lock);
523
524fail:
525 kfree(p);
526 eventfd_ctx_put(eventfd);
527
528 return ret;
529}
530
531static int
532kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
533{
534 int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
535 struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
536 struct _ioeventfd *p, *tmp;
537 struct eventfd_ctx *eventfd;
538 int ret = -ENOENT;
539
540 eventfd = eventfd_ctx_fdget(args->fd);
541 if (IS_ERR(eventfd))
542 return PTR_ERR(eventfd);
543
544 down_write(&kvm->slots_lock);
545
546 list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
547 bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
548
549 if (p->eventfd != eventfd ||
550 p->addr != args->addr ||
551 p->length != args->len ||
552 p->wildcard != wildcard)
553 continue;
554
555 if (!p->wildcard && p->datamatch != args->datamatch)
556 continue;
557
558 __kvm_io_bus_unregister_dev(bus, &p->dev);
559 ioeventfd_release(p);
560 ret = 0;
561 break;
562 }
563
564 up_write(&kvm->slots_lock);
565
566 eventfd_ctx_put(eventfd);
567
568 return ret;
569}
570
571int
572kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
573{
574 if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
575 return kvm_deassign_ioeventfd(kvm, args);
576
577 return kvm_assign_ioeventfd(kvm, args);
578}
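
For context, the intended userspace side of these two mechanisms is a pair of VM ioctls driven with ordinary eventfds. The sketch below is illustrative only: it assumes the uapi introduced alongside this code (struct kvm_irqfd with fd/gsi/flags, struct kvm_ioeventfd with addr/len/fd/flags and an optional DATAMATCH flag) and an already-created VM file descriptor vm_fd.

#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int wire_irqfd(int vm_fd, int gsi)
{
        struct kvm_irqfd irqfd;
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;

        memset(&irqfd, 0, sizeof(irqfd));
        irqfd.fd  = efd;
        irqfd.gsi = gsi;
        if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0) {
                close(efd);
                return -1;
        }
        return efd;     /* writing an 8-byte count here now injects the GSI */
}

static int wire_ioeventfd(int vm_fd, __u64 addr, __u32 len)
{
        struct kvm_ioeventfd ioevent;
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;

        memset(&ioevent, 0, sizeof(ioevent));
        ioevent.addr = addr;    /* guest-physical address to watch        */
        ioevent.len  = len;     /* 1, 2, 4 or 8, per kvm_assign_ioeventfd */
        ioevent.fd   = efd;     /* no DATAMATCH flag set: wildcard match  */
        if (ioctl(vm_fd, KVM_IOEVENTFD, &ioevent) < 0) {
                close(efd);
                return -1;
        }
        return efd;     /* guest writes to addr now signal this eventfd */
}

Writing to the first eventfd raises and lowers the bound GSI via irqfd_inject(); a matching guest write to the registered address signals the second eventfd from ioeventfd_write().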
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1150c6d5c7b8..9fe140bb38ec 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -36,6 +36,7 @@
36#include <asm/processor.h> 36#include <asm/processor.h>
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/current.h> 38#include <asm/current.h>
39#include <trace/events/kvm.h>
39 40
40#include "ioapic.h" 41#include "ioapic.h"
41#include "lapic.h" 42#include "lapic.h"
@@ -103,6 +104,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
103{ 104{
104 unsigned index; 105 unsigned index;
105 bool mask_before, mask_after; 106 bool mask_before, mask_after;
107 union kvm_ioapic_redirect_entry *e;
106 108
107 switch (ioapic->ioregsel) { 109 switch (ioapic->ioregsel) {
108 case IOAPIC_REG_VERSION: 110 case IOAPIC_REG_VERSION:
@@ -122,19 +124,20 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
122 ioapic_debug("change redir index %x val %x\n", index, val); 124 ioapic_debug("change redir index %x val %x\n", index, val);
123 if (index >= IOAPIC_NUM_PINS) 125 if (index >= IOAPIC_NUM_PINS)
124 return; 126 return;
125 mask_before = ioapic->redirtbl[index].fields.mask; 127 e = &ioapic->redirtbl[index];
128 mask_before = e->fields.mask;
126 if (ioapic->ioregsel & 1) { 129 if (ioapic->ioregsel & 1) {
127 ioapic->redirtbl[index].bits &= 0xffffffff; 130 e->bits &= 0xffffffff;
128 ioapic->redirtbl[index].bits |= (u64) val << 32; 131 e->bits |= (u64) val << 32;
129 } else { 132 } else {
130 ioapic->redirtbl[index].bits &= ~0xffffffffULL; 133 e->bits &= ~0xffffffffULL;
131 ioapic->redirtbl[index].bits |= (u32) val; 134 e->bits |= (u32) val;
132 ioapic->redirtbl[index].fields.remote_irr = 0; 135 e->fields.remote_irr = 0;
133 } 136 }
134 mask_after = ioapic->redirtbl[index].fields.mask; 137 mask_after = e->fields.mask;
135 if (mask_before != mask_after) 138 if (mask_before != mask_after)
136 kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); 139 kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
137 if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG 140 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
138 && ioapic->irr & (1 << index)) 141 && ioapic->irr & (1 << index))
139 ioapic_service(ioapic, index); 142 ioapic_service(ioapic, index);
140 break; 143 break;
@@ -164,7 +167,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
164 /* Always delivery PIT interrupt to vcpu 0 */ 167 /* Always delivery PIT interrupt to vcpu 0 */
165 if (irq == 0) { 168 if (irq == 0) {
166 irqe.dest_mode = 0; /* Physical mode. */ 169 irqe.dest_mode = 0; /* Physical mode. */
167 irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; 170 /* need to read apic_id from apic regiest since
171 * it can be rewritten */
172 irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
168 } 173 }
169#endif 174#endif
170 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); 175 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
@@ -188,7 +193,10 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
188 if ((edge && old_irr != ioapic->irr) || 193 if ((edge && old_irr != ioapic->irr) ||
189 (!edge && !entry.fields.remote_irr)) 194 (!edge && !entry.fields.remote_irr))
190 ret = ioapic_service(ioapic, irq); 195 ret = ioapic_service(ioapic, irq);
196 else
197 ret = 0; /* report coalesced interrupt */
191 } 198 }
199 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
192 } 200 }
193 return ret; 201 return ret;
194} 202}
@@ -220,24 +228,29 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
220 __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); 228 __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
221} 229}
222 230
223static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, 231static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
224 int len, int is_write)
225{ 232{
226 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 233 return container_of(dev, struct kvm_ioapic, dev);
234}
227 235
236static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
237{
228 return ((addr >= ioapic->base_address && 238 return ((addr >= ioapic->base_address &&
229 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); 239 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
230} 240}
231 241
232static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, 242static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
233 void *val) 243 void *val)
234{ 244{
235 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 245 struct kvm_ioapic *ioapic = to_ioapic(this);
236 u32 result; 246 u32 result;
247 if (!ioapic_in_range(ioapic, addr))
248 return -EOPNOTSUPP;
237 249
238 ioapic_debug("addr %lx\n", (unsigned long)addr); 250 ioapic_debug("addr %lx\n", (unsigned long)addr);
239 ASSERT(!(addr & 0xf)); /* check alignment */ 251 ASSERT(!(addr & 0xf)); /* check alignment */
240 252
253 mutex_lock(&ioapic->kvm->irq_lock);
241 addr &= 0xff; 254 addr &= 0xff;
242 switch (addr) { 255 switch (addr) {
243 case IOAPIC_REG_SELECT: 256 case IOAPIC_REG_SELECT:
@@ -264,22 +277,28 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
264 default: 277 default:
265 printk(KERN_WARNING "ioapic: wrong length %d\n", len); 278 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
266 } 279 }
280 mutex_unlock(&ioapic->kvm->irq_lock);
281 return 0;
267} 282}
268 283
269static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, 284static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
270 const void *val) 285 const void *val)
271{ 286{
272 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 287 struct kvm_ioapic *ioapic = to_ioapic(this);
273 u32 data; 288 u32 data;
289 if (!ioapic_in_range(ioapic, addr))
290 return -EOPNOTSUPP;
274 291
275 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", 292 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
276 (void*)addr, len, val); 293 (void*)addr, len, val);
277 ASSERT(!(addr & 0xf)); /* check alignment */ 294 ASSERT(!(addr & 0xf)); /* check alignment */
295
296 mutex_lock(&ioapic->kvm->irq_lock);
278 if (len == 4 || len == 8) 297 if (len == 4 || len == 8)
279 data = *(u32 *) val; 298 data = *(u32 *) val;
280 else { 299 else {
281 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); 300 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
282 return; 301 goto unlock;
283 } 302 }
284 303
285 addr &= 0xff; 304 addr &= 0xff;
@@ -300,6 +319,9 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
300 default: 319 default:
301 break; 320 break;
302 } 321 }
322unlock:
323 mutex_unlock(&ioapic->kvm->irq_lock);
324 return 0;
303} 325}
304 326
305void kvm_ioapic_reset(struct kvm_ioapic *ioapic) 327void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
@@ -314,21 +336,27 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
314 ioapic->id = 0; 336 ioapic->id = 0;
315} 337}
316 338
339static const struct kvm_io_device_ops ioapic_mmio_ops = {
340 .read = ioapic_mmio_read,
341 .write = ioapic_mmio_write,
342};
343
317int kvm_ioapic_init(struct kvm *kvm) 344int kvm_ioapic_init(struct kvm *kvm)
318{ 345{
319 struct kvm_ioapic *ioapic; 346 struct kvm_ioapic *ioapic;
347 int ret;
320 348
321 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 349 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
322 if (!ioapic) 350 if (!ioapic)
323 return -ENOMEM; 351 return -ENOMEM;
324 kvm->arch.vioapic = ioapic; 352 kvm->arch.vioapic = ioapic;
325 kvm_ioapic_reset(ioapic); 353 kvm_ioapic_reset(ioapic);
326 ioapic->dev.read = ioapic_mmio_read; 354 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
327 ioapic->dev.write = ioapic_mmio_write;
328 ioapic->dev.in_range = ioapic_in_range;
329 ioapic->dev.private = ioapic;
330 ioapic->kvm = kvm; 355 ioapic->kvm = kvm;
331 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); 356 ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
332 return 0; 357 if (ret < 0)
358 kfree(ioapic);
359
360 return ret;
333} 361}
334 362
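
As a side note, ioapic_write_indirect() updates one half of a 64-bit redirection entry per 32-bit IOREGSEL-indexed access, selected by bit 0 of ioregsel. A tiny standalone sketch of that split (plain C, not the kernel code):

#include <stdint.h>

/* Replace the low or high 32 bits of a 64-bit redirection entry, mirroring
 * the e->bits manipulation in ioapic_write_indirect(). */
static uint64_t update_redir_entry(uint64_t bits, uint32_t val, int high_word)
{
        if (high_word) {
                bits &= 0xffffffffULL;          /* keep low half   */
                bits |= (uint64_t)val << 32;    /* replace high    */
        } else {
                bits &= ~0xffffffffULL;         /* keep high half  */
                bits |= val;                    /* replace low     */
        }
        return bits;
}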
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 55e8846ac3a6..12fd3caffd2b 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,49 +17,54 @@
17#define __KVM_IODEV_H__ 17#define __KVM_IODEV_H__
18 18
19#include <linux/kvm_types.h> 19#include <linux/kvm_types.h>
20#include <asm/errno.h>
20 21
21struct kvm_io_device { 22struct kvm_io_device;
22 void (*read)(struct kvm_io_device *this, 23
24/**
25 * kvm_io_device_ops are called under kvm slots_lock.
26 * read and write handlers return 0 if the transaction has been handled,
27 * or non-zero to have it passed to the next device.
28 **/
29struct kvm_io_device_ops {
30 int (*read)(struct kvm_io_device *this,
31 gpa_t addr,
32 int len,
33 void *val);
34 int (*write)(struct kvm_io_device *this,
23 gpa_t addr, 35 gpa_t addr,
24 int len, 36 int len,
25 void *val); 37 const void *val);
26 void (*write)(struct kvm_io_device *this,
27 gpa_t addr,
28 int len,
29 const void *val);
30 int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len,
31 int is_write);
32 void (*destructor)(struct kvm_io_device *this); 38 void (*destructor)(struct kvm_io_device *this);
39};
33 40
34 void *private; 41
42struct kvm_io_device {
43 const struct kvm_io_device_ops *ops;
35}; 44};
36 45
37static inline void kvm_iodevice_read(struct kvm_io_device *dev, 46static inline void kvm_iodevice_init(struct kvm_io_device *dev,
38 gpa_t addr, 47 const struct kvm_io_device_ops *ops)
39 int len,
40 void *val)
41{ 48{
42 dev->read(dev, addr, len, val); 49 dev->ops = ops;
43} 50}
44 51
45static inline void kvm_iodevice_write(struct kvm_io_device *dev, 52static inline int kvm_iodevice_read(struct kvm_io_device *dev,
46 gpa_t addr, 53 gpa_t addr, int l, void *v)
47 int len,
48 const void *val)
49{ 54{
50 dev->write(dev, addr, len, val); 55 return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
51} 56}
52 57
53static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, 58static inline int kvm_iodevice_write(struct kvm_io_device *dev,
54 gpa_t addr, int len, int is_write) 59 gpa_t addr, int l, const void *v)
55{ 60{
56 return dev->in_range(dev, addr, len, is_write); 61 return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
57} 62}
58 63
59static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) 64static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
60{ 65{
61 if (dev->destructor) 66 if (dev->ops->destructor)
62 dev->destructor(dev); 67 dev->ops->destructor(dev);
63} 68}
64 69
65#endif /* __KVM_IODEV_H__ */ 70#endif /* __KVM_IODEV_H__ */
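
Taken together, the new contract is: a device embeds struct kvm_io_device, initializes it with kvm_iodevice_init() and a const ops table, returns 0 from read/write when it claims a transaction and -EOPNOTSUPP to let the bus offer it to the next device, and recovers its container with container_of(). A minimal sketch of a device written against this interface follows (hypothetical demo_dev; only the names taken from the header above are real).

#include <linux/kvm_host.h>
#include "iodev.h"

/* Hypothetical demo device: claims a single 4-byte register at `base`
 * and declines everything else so the bus can try the next device. */
struct demo_dev {
        struct kvm_io_device dev;
        gpa_t base;
        u32 reg;
};

static inline struct demo_dev *to_demo(struct kvm_io_device *dev)
{
        return container_of(dev, struct demo_dev, dev);
}

static int demo_write(struct kvm_io_device *this, gpa_t addr, int len,
                      const void *val)
{
        struct demo_dev *d = to_demo(this);

        if (addr != d->base || len != 4)
                return -EOPNOTSUPP;             /* not ours: pass it on */

        d->reg = *(const u32 *)val;
        return 0;                               /* handled */
}

static const struct kvm_io_device_ops demo_ops = {
        .write = demo_write,    /* .read left NULL: reads are declined too */
};

/* registration, under slots_lock as the header comment requires:
 *      kvm_iodevice_init(&d->dev, &demo_ops);
 *      kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &d->dev);
 */

This is the same pattern the coalesced-MMIO and ioapic conversions above follow through their to_mmio() and to_ioapic() helpers.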
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index ddc17f0e2f35..001663ff401a 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <trace/events/kvm.h>
23 24
24#include <asm/msidef.h> 25#include <asm/msidef.h>
25#ifdef CONFIG_IA64 26#ifdef CONFIG_IA64
@@ -62,14 +63,14 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
62 int i, r = -1; 63 int i, r = -1;
63 struct kvm_vcpu *vcpu, *lowest = NULL; 64 struct kvm_vcpu *vcpu, *lowest = NULL;
64 65
66 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
67
65 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 68 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
66 kvm_is_dm_lowest_prio(irq)) 69 kvm_is_dm_lowest_prio(irq))
67 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 70 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
68 71
69 for (i = 0; i < KVM_MAX_VCPUS; i++) { 72 kvm_for_each_vcpu(i, vcpu, kvm) {
70 vcpu = kvm->vcpus[i]; 73 if (!kvm_apic_present(vcpu))
71
72 if (!vcpu || !kvm_apic_present(vcpu))
73 continue; 74 continue;
74 75
75 if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, 76 if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
@@ -99,6 +100,8 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
99{ 100{
100 struct kvm_lapic_irq irq; 101 struct kvm_lapic_irq irq;
101 102
103 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
104
102 irq.dest_id = (e->msi.address_lo & 105 irq.dest_id = (e->msi.address_lo &
103 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; 106 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
104 irq.vector = (e->msi.data & 107 irq.vector = (e->msi.data &
@@ -113,7 +116,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
113 return kvm_irq_delivery_to_apic(kvm, NULL, &irq); 116 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
114} 117}
115 118
116/* This should be called with the kvm->lock mutex held 119/* This should be called with the kvm->irq_lock mutex held
117 * Return value: 120 * Return value:
118 * < 0 Interrupt was ignored (masked or not delivered for other reasons) 121 * < 0 Interrupt was ignored (masked or not delivered for other reasons)
119 * = 0 Interrupt was coalesced (previous irq is still pending) 122 * = 0 Interrupt was coalesced (previous irq is still pending)
@@ -125,6 +128,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
125 unsigned long *irq_state, sig_level; 128 unsigned long *irq_state, sig_level;
126 int ret = -1; 129 int ret = -1;
127 130
131 trace_kvm_set_irq(irq, level, irq_source_id);
132
133 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
134
128 if (irq < KVM_IOAPIC_NUM_PINS) { 135 if (irq < KVM_IOAPIC_NUM_PINS) {
129 irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; 136 irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
130 137
@@ -134,7 +141,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
134 else 141 else
135 clear_bit(irq_source_id, irq_state); 142 clear_bit(irq_source_id, irq_state);
136 sig_level = !!(*irq_state); 143 sig_level = !!(*irq_state);
137 } else /* Deal with MSI/MSI-X */ 144 } else if (!level)
145 return ret;
146 else /* Deal with MSI/MSI-X */
138 sig_level = 1; 147 sig_level = 1;
139 148
140 /* Not possible to detect if the guest uses the PIC or the 149 /* Not possible to detect if the guest uses the PIC or the
@@ -159,6 +168,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
159 struct hlist_node *n; 168 struct hlist_node *n;
160 unsigned gsi = pin; 169 unsigned gsi = pin;
161 170
171 trace_kvm_ack_irq(irqchip, pin);
172
162 list_for_each_entry(e, &kvm->irq_routing, link) 173 list_for_each_entry(e, &kvm->irq_routing, link)
163 if (e->type == KVM_IRQ_ROUTING_IRQCHIP && 174 if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
164 e->irqchip.irqchip == irqchip && 175 e->irqchip.irqchip == irqchip &&
@@ -175,19 +186,26 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
175void kvm_register_irq_ack_notifier(struct kvm *kvm, 186void kvm_register_irq_ack_notifier(struct kvm *kvm,
176 struct kvm_irq_ack_notifier *kian) 187 struct kvm_irq_ack_notifier *kian)
177{ 188{
189 mutex_lock(&kvm->irq_lock);
178 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); 190 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
191 mutex_unlock(&kvm->irq_lock);
179} 192}
180 193
181void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian) 194void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
195 struct kvm_irq_ack_notifier *kian)
182{ 196{
197 mutex_lock(&kvm->irq_lock);
183 hlist_del_init(&kian->link); 198 hlist_del_init(&kian->link);
199 mutex_unlock(&kvm->irq_lock);
184} 200}
185 201
186/* The caller must hold kvm->lock mutex */
187int kvm_request_irq_source_id(struct kvm *kvm) 202int kvm_request_irq_source_id(struct kvm *kvm)
188{ 203{
189 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; 204 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
190 int irq_source_id = find_first_zero_bit(bitmap, 205 int irq_source_id;
206
207 mutex_lock(&kvm->irq_lock);
208 irq_source_id = find_first_zero_bit(bitmap,
191 sizeof(kvm->arch.irq_sources_bitmap)); 209 sizeof(kvm->arch.irq_sources_bitmap));
192 210
193 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 211 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
@@ -197,6 +215,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
197 215
198 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 216 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
199 set_bit(irq_source_id, bitmap); 217 set_bit(irq_source_id, bitmap);
218 mutex_unlock(&kvm->irq_lock);
200 219
201 return irq_source_id; 220 return irq_source_id;
202} 221}
@@ -207,6 +226,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
207 226
208 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 227 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
209 228
229 mutex_lock(&kvm->irq_lock);
210 if (irq_source_id < 0 || 230 if (irq_source_id < 0 ||
211 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 231 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
212 printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); 232 printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
@@ -215,19 +235,24 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
215 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) 235 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
216 clear_bit(irq_source_id, &kvm->arch.irq_states[i]); 236 clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
217 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); 237 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
238 mutex_unlock(&kvm->irq_lock);
218} 239}
219 240
220void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 241void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
221 struct kvm_irq_mask_notifier *kimn) 242 struct kvm_irq_mask_notifier *kimn)
222{ 243{
244 mutex_lock(&kvm->irq_lock);
223 kimn->irq = irq; 245 kimn->irq = irq;
224 hlist_add_head(&kimn->link, &kvm->mask_notifier_list); 246 hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
247 mutex_unlock(&kvm->irq_lock);
225} 248}
226 249
227void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 250void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
228 struct kvm_irq_mask_notifier *kimn) 251 struct kvm_irq_mask_notifier *kimn)
229{ 252{
253 mutex_lock(&kvm->irq_lock);
230 hlist_del(&kimn->link); 254 hlist_del(&kimn->link);
255 mutex_unlock(&kvm->irq_lock);
231} 256}
232 257
233void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) 258void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -235,6 +260,8 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
235 struct kvm_irq_mask_notifier *kimn; 260 struct kvm_irq_mask_notifier *kimn;
236 struct hlist_node *n; 261 struct hlist_node *n;
237 262
263 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
264
238 hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) 265 hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
239 if (kimn->irq == irq) 266 if (kimn->irq == irq)
240 kimn->func(kimn, mask); 267 kimn->func(kimn, mask);
@@ -250,7 +277,9 @@ static void __kvm_free_irq_routing(struct list_head *irq_routing)
250 277
251void kvm_free_irq_routing(struct kvm *kvm) 278void kvm_free_irq_routing(struct kvm *kvm)
252{ 279{
280 mutex_lock(&kvm->irq_lock);
253 __kvm_free_irq_routing(&kvm->irq_routing); 281 __kvm_free_irq_routing(&kvm->irq_routing);
282 mutex_unlock(&kvm->irq_lock);
254} 283}
255 284
256static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, 285static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
@@ -325,13 +354,13 @@ int kvm_set_irq_routing(struct kvm *kvm,
325 e = NULL; 354 e = NULL;
326 } 355 }
327 356
328 mutex_lock(&kvm->lock); 357 mutex_lock(&kvm->irq_lock);
329 list_splice(&kvm->irq_routing, &tmp); 358 list_splice(&kvm->irq_routing, &tmp);
330 INIT_LIST_HEAD(&kvm->irq_routing); 359 INIT_LIST_HEAD(&kvm->irq_routing);
331 list_splice(&irq_list, &kvm->irq_routing); 360 list_splice(&irq_list, &kvm->irq_routing);
332 INIT_LIST_HEAD(&irq_list); 361 INIT_LIST_HEAD(&irq_list);
333 list_splice(&tmp, &irq_list); 362 list_splice(&tmp, &irq_list);
334 mutex_unlock(&kvm->lock); 363 mutex_unlock(&kvm->irq_lock);
335 364
336 r = 0; 365 r = 0;
337 366
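
The conversions from open-coded loops over kvm->vcpus[] with NULL checks to kvm_for_each_vcpu() rely on vcpus being published densely and online_vcpus being bumped only after the slot is filled (see kvm_vm_ioctl_create_vcpu() later in this diff). The real macro lives in include/linux/kvm_host.h and is not part of this patch; the sketch below only shows the iterator shape being assumed.

/* Sketch only; the real kvm_for_each_vcpu() may differ in detail. */
#define demo_for_each_vcpu(idx, vcpup, kvm)                              \
        for ((idx) = 0;                                                  \
             (idx) < atomic_read(&(kvm)->online_vcpus) &&                \
             ((vcpup) = (kvm)->vcpus[(idx)]) != NULL;                    \
             (idx)++)

Because online_vcpus is incremented only after kvm->vcpus[] is written (with an smp_wmb() in between), iterating up to online_vcpus never observes an empty slot; a paired read barrier is elided here for brevity.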
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2884baf1d5f9..897bff3b7df9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -59,9 +59,18 @@
59#include "irq.h" 59#include "irq.h"
60#endif 60#endif
61 61
62#define CREATE_TRACE_POINTS
63#include <trace/events/kvm.h>
64
62MODULE_AUTHOR("Qumranet"); 65MODULE_AUTHOR("Qumranet");
63MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
64 67
68/*
69 * Ordering of locks:
70 *
71 * kvm->slots_lock --> kvm->lock --> kvm->irq_lock
72 */
73
65DEFINE_SPINLOCK(kvm_lock); 74DEFINE_SPINLOCK(kvm_lock);
66LIST_HEAD(vm_list); 75LIST_HEAD(vm_list);
67 76
@@ -79,6 +88,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
79 88
80static bool kvm_rebooting; 89static bool kvm_rebooting;
81 90
91static bool largepages_enabled = true;
92
82#ifdef KVM_CAP_DEVICE_ASSIGNMENT 93#ifdef KVM_CAP_DEVICE_ASSIGNMENT
83static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 94static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
84 int assigned_dev_id) 95 int assigned_dev_id)
@@ -120,17 +131,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
120{ 131{
121 struct kvm_assigned_dev_kernel *assigned_dev; 132 struct kvm_assigned_dev_kernel *assigned_dev;
122 struct kvm *kvm; 133 struct kvm *kvm;
123 int irq, i; 134 int i;
124 135
125 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 136 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
126 interrupt_work); 137 interrupt_work);
127 kvm = assigned_dev->kvm; 138 kvm = assigned_dev->kvm;
128 139
129 /* This is taken to safely inject irq inside the guest. When 140 mutex_lock(&kvm->irq_lock);
130 * the interrupt injection (or the ioapic code) uses a
131 * finer-grained lock, update this
132 */
133 mutex_lock(&kvm->lock);
134 spin_lock_irq(&assigned_dev->assigned_dev_lock); 141 spin_lock_irq(&assigned_dev->assigned_dev_lock);
135 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 142 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
136 struct kvm_guest_msix_entry *guest_entries = 143 struct kvm_guest_msix_entry *guest_entries =
@@ -143,23 +150,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
143 kvm_set_irq(assigned_dev->kvm, 150 kvm_set_irq(assigned_dev->kvm,
144 assigned_dev->irq_source_id, 151 assigned_dev->irq_source_id,
145 guest_entries[i].vector, 1); 152 guest_entries[i].vector, 1);
146 irq = assigned_dev->host_msix_entries[i].vector;
147 if (irq != 0)
148 enable_irq(irq);
149 assigned_dev->host_irq_disabled = false;
150 } 153 }
151 } else { 154 } else
152 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 155 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
153 assigned_dev->guest_irq, 1); 156 assigned_dev->guest_irq, 1);
154 if (assigned_dev->irq_requested_type &
155 KVM_DEV_IRQ_GUEST_MSI) {
156 enable_irq(assigned_dev->host_irq);
157 assigned_dev->host_irq_disabled = false;
158 }
159 }
160 157
161 spin_unlock_irq(&assigned_dev->assigned_dev_lock); 158 spin_unlock_irq(&assigned_dev->assigned_dev_lock);
162 mutex_unlock(&assigned_dev->kvm->lock); 159 mutex_unlock(&assigned_dev->kvm->irq_lock);
163} 160}
164 161
165static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) 162static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
@@ -179,8 +176,10 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
179 176
180 schedule_work(&assigned_dev->interrupt_work); 177 schedule_work(&assigned_dev->interrupt_work);
181 178
182 disable_irq_nosync(irq); 179 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
183 assigned_dev->host_irq_disabled = true; 180 disable_irq_nosync(irq);
181 assigned_dev->host_irq_disabled = true;
182 }
184 183
185out: 184out:
186 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); 185 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
@@ -215,7 +214,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
215static void deassign_guest_irq(struct kvm *kvm, 214static void deassign_guest_irq(struct kvm *kvm,
216 struct kvm_assigned_dev_kernel *assigned_dev) 215 struct kvm_assigned_dev_kernel *assigned_dev)
217{ 216{
218 kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); 217 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
219 assigned_dev->ack_notifier.gsi = -1; 218 assigned_dev->ack_notifier.gsi = -1;
220 219
221 if (assigned_dev->irq_source_id != -1) 220 if (assigned_dev->irq_source_id != -1)
@@ -417,6 +416,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
417{ 416{
418 dev->guest_irq = irq->guest_irq; 417 dev->guest_irq = irq->guest_irq;
419 dev->ack_notifier.gsi = -1; 418 dev->ack_notifier.gsi = -1;
419 dev->host_irq_disabled = false;
420 return 0; 420 return 0;
421} 421}
422#endif 422#endif
@@ -427,6 +427,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
427{ 427{
428 dev->guest_irq = irq->guest_irq; 428 dev->guest_irq = irq->guest_irq;
429 dev->ack_notifier.gsi = -1; 429 dev->ack_notifier.gsi = -1;
430 dev->host_irq_disabled = false;
430 return 0; 431 return 0;
431} 432}
432#endif 433#endif
@@ -693,11 +694,6 @@ out:
693} 694}
694#endif 695#endif
695 696
696static inline int valid_vcpu(int n)
697{
698 return likely(n >= 0 && n < KVM_MAX_VCPUS);
699}
700
701inline int kvm_is_mmio_pfn(pfn_t pfn) 697inline int kvm_is_mmio_pfn(pfn_t pfn)
702{ 698{
703 if (pfn_valid(pfn)) { 699 if (pfn_valid(pfn)) {
@@ -745,12 +741,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
745 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 741 if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
746 cpumask_clear(cpus); 742 cpumask_clear(cpus);
747 743
748 me = get_cpu();
749 spin_lock(&kvm->requests_lock); 744 spin_lock(&kvm->requests_lock);
750 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 745 me = smp_processor_id();
751 vcpu = kvm->vcpus[i]; 746 kvm_for_each_vcpu(i, vcpu, kvm) {
752 if (!vcpu)
753 continue;
754 if (test_and_set_bit(req, &vcpu->requests)) 747 if (test_and_set_bit(req, &vcpu->requests))
755 continue; 748 continue;
756 cpu = vcpu->cpu; 749 cpu = vcpu->cpu;
@@ -764,7 +757,6 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
764 else 757 else
765 called = false; 758 called = false;
766 spin_unlock(&kvm->requests_lock); 759 spin_unlock(&kvm->requests_lock);
767 put_cpu();
768 free_cpumask_var(cpus); 760 free_cpumask_var(cpus);
769 return called; 761 return called;
770} 762}
@@ -986,7 +978,9 @@ static struct kvm *kvm_create_vm(void)
986 spin_lock_init(&kvm->mmu_lock); 978 spin_lock_init(&kvm->mmu_lock);
987 spin_lock_init(&kvm->requests_lock); 979 spin_lock_init(&kvm->requests_lock);
988 kvm_io_bus_init(&kvm->pio_bus); 980 kvm_io_bus_init(&kvm->pio_bus);
981 kvm_eventfd_init(kvm);
989 mutex_init(&kvm->lock); 982 mutex_init(&kvm->lock);
983 mutex_init(&kvm->irq_lock);
990 kvm_io_bus_init(&kvm->mmio_bus); 984 kvm_io_bus_init(&kvm->mmio_bus);
991 init_rwsem(&kvm->slots_lock); 985 init_rwsem(&kvm->slots_lock);
992 atomic_set(&kvm->users_count, 1); 986 atomic_set(&kvm->users_count, 1);
@@ -1006,19 +1000,25 @@ out:
1006static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 1000static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
1007 struct kvm_memory_slot *dont) 1001 struct kvm_memory_slot *dont)
1008{ 1002{
1003 int i;
1004
1009 if (!dont || free->rmap != dont->rmap) 1005 if (!dont || free->rmap != dont->rmap)
1010 vfree(free->rmap); 1006 vfree(free->rmap);
1011 1007
1012 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 1008 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
1013 vfree(free->dirty_bitmap); 1009 vfree(free->dirty_bitmap);
1014 1010
1015 if (!dont || free->lpage_info != dont->lpage_info) 1011
1016 vfree(free->lpage_info); 1012 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
1013 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
1014 vfree(free->lpage_info[i]);
1015 free->lpage_info[i] = NULL;
1016 }
1017 }
1017 1018
1018 free->npages = 0; 1019 free->npages = 0;
1019 free->dirty_bitmap = NULL; 1020 free->dirty_bitmap = NULL;
1020 free->rmap = NULL; 1021 free->rmap = NULL;
1021 free->lpage_info = NULL;
1022} 1022}
1023 1023
1024void kvm_free_physmem(struct kvm *kvm) 1024void kvm_free_physmem(struct kvm *kvm)
@@ -1071,6 +1071,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
1071{ 1071{
1072 struct kvm *kvm = filp->private_data; 1072 struct kvm *kvm = filp->private_data;
1073 1073
1074 kvm_irqfd_release(kvm);
1075
1074 kvm_put_kvm(kvm); 1076 kvm_put_kvm(kvm);
1075 return 0; 1077 return 0;
1076} 1078}
@@ -1089,8 +1091,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
1089{ 1091{
1090 int r; 1092 int r;
1091 gfn_t base_gfn; 1093 gfn_t base_gfn;
1092 unsigned long npages, ugfn; 1094 unsigned long npages;
1093 unsigned long largepages, i; 1095 unsigned long i;
1094 struct kvm_memory_slot *memslot; 1096 struct kvm_memory_slot *memslot;
1095 struct kvm_memory_slot old, new; 1097 struct kvm_memory_slot old, new;
1096 1098
@@ -1164,31 +1166,51 @@ int __kvm_set_memory_region(struct kvm *kvm,
1164 else 1166 else
1165 new.userspace_addr = 0; 1167 new.userspace_addr = 0;
1166 } 1168 }
1167 if (npages && !new.lpage_info) { 1169 if (!npages)
1168 largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; 1170 goto skip_lpage;
1169 largepages -= base_gfn / KVM_PAGES_PER_HPAGE;
1170 1171
1171 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1172 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
1173 unsigned long ugfn;
1174 unsigned long j;
1175 int lpages;
1176 int level = i + 2;
1172 1177
1173 if (!new.lpage_info) 1178 /* Avoid unused variable warning if no large pages */
1179 (void)level;
1180
1181 if (new.lpage_info[i])
1182 continue;
1183
1184 lpages = 1 + (base_gfn + npages - 1) /
1185 KVM_PAGES_PER_HPAGE(level);
1186 lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
1187
1188 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
1189
1190 if (!new.lpage_info[i])
1174 goto out_free; 1191 goto out_free;
1175 1192
1176 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1193 memset(new.lpage_info[i], 0,
1194 lpages * sizeof(*new.lpage_info[i]));
1177 1195
1178 if (base_gfn % KVM_PAGES_PER_HPAGE) 1196 if (base_gfn % KVM_PAGES_PER_HPAGE(level))
1179 new.lpage_info[0].write_count = 1; 1197 new.lpage_info[i][0].write_count = 1;
1180 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1198 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
1181 new.lpage_info[largepages-1].write_count = 1; 1199 new.lpage_info[i][lpages - 1].write_count = 1;
1182 ugfn = new.userspace_addr >> PAGE_SHIFT; 1200 ugfn = new.userspace_addr >> PAGE_SHIFT;
1183 /* 1201 /*
1184 * If the gfn and userspace address are not aligned wrt each 1202 * If the gfn and userspace address are not aligned wrt each
1185 * other, disable large page support for this slot 1203 * other, or if explicitly asked to, disable large page
1204 * support for this slot
1186 */ 1205 */
1187 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) 1206 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
1188 for (i = 0; i < largepages; ++i) 1207 !largepages_enabled)
1189 new.lpage_info[i].write_count = 1; 1208 for (j = 0; j < lpages; ++j)
1209 new.lpage_info[i][j].write_count = 1;
1190 } 1210 }
1191 1211
1212skip_lpage:
1213
1192 /* Allocate page dirty bitmap if needed */ 1214 /* Allocate page dirty bitmap if needed */
1193 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1215 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1194 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1216 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
@@ -1200,6 +1222,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
1200 if (old.npages) 1222 if (old.npages)
1201 kvm_arch_flush_shadow(kvm); 1223 kvm_arch_flush_shadow(kvm);
1202 } 1224 }
1225#else /* not defined CONFIG_S390 */
1226 new.user_alloc = user_alloc;
1227 if (user_alloc)
1228 new.userspace_addr = mem->userspace_addr;
1203#endif /* not defined CONFIG_S390 */ 1229#endif /* not defined CONFIG_S390 */
1204 1230
1205 if (!npages) 1231 if (!npages)
@@ -1299,6 +1325,12 @@ out:
1299 return r; 1325 return r;
1300} 1326}
1301 1327
1328void kvm_disable_largepages(void)
1329{
1330 largepages_enabled = false;
1331}
1332EXPORT_SYMBOL_GPL(kvm_disable_largepages);
1333
1302int is_error_page(struct page *page) 1334int is_error_page(struct page *page)
1303{ 1335{
1304 return page == bad_page; 1336 return page == bad_page;
@@ -1635,9 +1667,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1635 for (;;) { 1667 for (;;) {
1636 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1668 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1637 1669
1638 if ((kvm_arch_interrupt_allowed(vcpu) && 1670 if (kvm_arch_vcpu_runnable(vcpu)) {
1639 kvm_cpu_has_interrupt(vcpu)) ||
1640 kvm_arch_vcpu_runnable(vcpu)) {
1641 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1671 set_bit(KVM_REQ_UNHALT, &vcpu->requests);
1642 break; 1672 break;
1643 } 1673 }
@@ -1714,24 +1744,18 @@ static struct file_operations kvm_vcpu_fops = {
1714 */ 1744 */
1715static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1745static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1716{ 1746{
1717 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1747 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
1718 if (fd < 0)
1719 kvm_put_kvm(vcpu->kvm);
1720 return fd;
1721} 1748}
1722 1749
1723/* 1750/*
1724 * Creates some virtual cpus. Good luck creating more than one. 1751 * Creates some virtual cpus. Good luck creating more than one.
1725 */ 1752 */
1726static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1753static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1727{ 1754{
1728 int r; 1755 int r;
1729 struct kvm_vcpu *vcpu; 1756 struct kvm_vcpu *vcpu, *v;
1730
1731 if (!valid_vcpu(n))
1732 return -EINVAL;
1733 1757
1734 vcpu = kvm_arch_vcpu_create(kvm, n); 1758 vcpu = kvm_arch_vcpu_create(kvm, id);
1735 if (IS_ERR(vcpu)) 1759 if (IS_ERR(vcpu))
1736 return PTR_ERR(vcpu); 1760 return PTR_ERR(vcpu);
1737 1761
@@ -1742,23 +1766,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1742 return r; 1766 return r;
1743 1767
1744 mutex_lock(&kvm->lock); 1768 mutex_lock(&kvm->lock);
1745 if (kvm->vcpus[n]) { 1769 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1746 r = -EEXIST; 1770 r = -EINVAL;
1747 goto vcpu_destroy; 1771 goto vcpu_destroy;
1748 } 1772 }
1749 kvm->vcpus[n] = vcpu; 1773
1750 mutex_unlock(&kvm->lock); 1774 kvm_for_each_vcpu(r, v, kvm)
1775 if (v->vcpu_id == id) {
1776 r = -EEXIST;
1777 goto vcpu_destroy;
1778 }
1779
1780 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1751 1781
1752 /* Now it's all set up, let userspace reach it */ 1782 /* Now it's all set up, let userspace reach it */
1753 kvm_get_kvm(kvm); 1783 kvm_get_kvm(kvm);
1754 r = create_vcpu_fd(vcpu); 1784 r = create_vcpu_fd(vcpu);
1755 if (r < 0) 1785 if (r < 0) {
1756 goto unlink; 1786 kvm_put_kvm(kvm);
1787 goto vcpu_destroy;
1788 }
1789
1790 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1791 smp_wmb();
1792 atomic_inc(&kvm->online_vcpus);
1793
1794#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1795 if (kvm->bsp_vcpu_id == id)
1796 kvm->bsp_vcpu = vcpu;
1797#endif
1798 mutex_unlock(&kvm->lock);
1757 return r; 1799 return r;
1758 1800
1759unlink:
1760 mutex_lock(&kvm->lock);
1761 kvm->vcpus[n] = NULL;
1762vcpu_destroy: 1801vcpu_destroy:
1763 mutex_unlock(&kvm->lock); 1802 mutex_unlock(&kvm->lock);
1764 kvm_arch_vcpu_destroy(vcpu); 1803 kvm_arch_vcpu_destroy(vcpu);
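The vcpu table is now published to lock-free readers: the slot is filled first, then online_vcpus is bumped, with smp_wmb() ordering the two stores. A self-contained user-space analogue of that publish pattern, using C11 atomics in place of the kernel primitives (names are local to the sketch):

#include <stdatomic.h>
#include <stddef.h>

#define MAX_ITEMS 16

static void *slots[MAX_ITEMS];
static atomic_int online;		/* plays the role of online_vcpus */

/* Writer side, assumed serialized by a lock (as kvm->lock is here). */
void publish(void *item)
{
	int n = atomic_load_explicit(&online, memory_order_relaxed);

	slots[n] = item;				/* fill the slot...   */
	atomic_store_explicit(&online, n + 1,
			      memory_order_release);	/* ...then publish it */
}

/* Reader side, lock-free: never dereferences a slot above the count. */
void *lookup(int i)
{
	int n = atomic_load_explicit(&online, memory_order_acquire);

	return i < n ? slots[i] : NULL;
}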
@@ -2199,6 +2238,7 @@ static long kvm_vm_ioctl(struct file *filp,
2199 vfree(entries); 2238 vfree(entries);
2200 break; 2239 break;
2201 } 2240 }
2241#endif /* KVM_CAP_IRQ_ROUTING */
2202#ifdef __KVM_HAVE_MSIX 2242#ifdef __KVM_HAVE_MSIX
2203 case KVM_ASSIGN_SET_MSIX_NR: { 2243 case KVM_ASSIGN_SET_MSIX_NR: {
2204 struct kvm_assigned_msix_nr entry_nr; 2244 struct kvm_assigned_msix_nr entry_nr;
@@ -2221,7 +2261,35 @@ static long kvm_vm_ioctl(struct file *filp,
2221 break; 2261 break;
2222 } 2262 }
2223#endif 2263#endif
2224#endif /* KVM_CAP_IRQ_ROUTING */ 2264 case KVM_IRQFD: {
2265 struct kvm_irqfd data;
2266
2267 r = -EFAULT;
2268 if (copy_from_user(&data, argp, sizeof data))
2269 goto out;
2270 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
2271 break;
2272 }
2273 case KVM_IOEVENTFD: {
2274 struct kvm_ioeventfd data;
2275
2276 r = -EFAULT;
2277 if (copy_from_user(&data, argp, sizeof data))
2278 goto out;
2279 r = kvm_ioeventfd(kvm, &data);
2280 break;
2281 }
2282#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2283 case KVM_SET_BOOT_CPU_ID:
2284 r = 0;
2285 mutex_lock(&kvm->lock);
2286 if (atomic_read(&kvm->online_vcpus) != 0)
2287 r = -EBUSY;
2288 else
2289 kvm->bsp_vcpu_id = arg;
2290 mutex_unlock(&kvm->lock);
2291 break;
2292#endif
2225 default: 2293 default:
2226 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2294 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2227 } 2295 }
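Both new ioctls take small fixed-size structures from <linux/kvm.h>. A minimal user-space sketch of wiring them up (the GSI and MMIO address are made up for the example; 'vm' is an existing VM fd):

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <linux/kvm.h>

int wire_up_eventfds(int vm)
{
	int irq_efd = eventfd(0, 0);
	int io_efd  = eventfd(0, 0);

	/* Signalling irq_efd from any thread now injects this GSI. */
	struct kvm_irqfd irqfd = {
		.fd  = irq_efd,
		.gsi = 5,			/* example guest GSI */
	};
	if (ioctl(vm, KVM_IRQFD, &irqfd) < 0)
		return -1;

	/* A 4-byte guest write to this address signals io_efd instead of
	 * causing a heavyweight exit to user space. */
	struct kvm_ioeventfd ioeventfd = {
		.addr  = 0xfe000000,		/* example MMIO doorbell */
		.len   = 4,
		.fd    = io_efd,
		.flags = 0,			/* no datamatch: any value */
	};
	return ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
}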
@@ -2288,6 +2356,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
2288 case KVM_CAP_USER_MEMORY: 2356 case KVM_CAP_USER_MEMORY:
2289 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2357 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2290 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2358 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2359#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2360 case KVM_CAP_SET_BOOT_CPU_ID:
2361#endif
2291 return 1; 2362 return 1;
2292#ifdef CONFIG_HAVE_KVM_IRQCHIP 2363#ifdef CONFIG_HAVE_KVM_IRQCHIP
2293 case KVM_CAP_IRQ_ROUTING: 2364 case KVM_CAP_IRQ_ROUTING:
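User space is expected to probe the capability and pick the BSP before creating any vcpu, which is exactly the ordering the -EBUSY check above enforces. A short sketch:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Mark vcpu id 1 as the boot processor.  Must run before the first
 * KVM_CREATE_VCPU, otherwise the ioctl fails with EBUSY. */
int set_bsp(int kvm_fd, int vm)
{
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID) <= 0)
		return -1;		/* capability not available */
	return ioctl(vm, KVM_SET_BOOT_CPU_ID, 1);
}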
@@ -2335,7 +2406,7 @@ static long kvm_dev_ioctl(struct file *filp,
2335 case KVM_TRACE_ENABLE: 2406 case KVM_TRACE_ENABLE:
2336 case KVM_TRACE_PAUSE: 2407 case KVM_TRACE_PAUSE:
2337 case KVM_TRACE_DISABLE: 2408 case KVM_TRACE_DISABLE:
2338 r = kvm_trace_ioctl(ioctl, arg); 2409 r = -EOPNOTSUPP;
2339 break; 2410 break;
2340 default: 2411 default:
2341 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2412 return kvm_arch_dev_ioctl(filp, ioctl, arg);
@@ -2449,26 +2520,71 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2449 } 2520 }
2450} 2521}
2451 2522
2452struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, 2523/* kvm_io_bus_write - called under kvm->slots_lock */
2453 gpa_t addr, int len, int is_write) 2524int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
2525 int len, const void *val)
2454{ 2526{
2455 int i; 2527 int i;
2528 for (i = 0; i < bus->dev_count; i++)
2529 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2530 return 0;
2531 return -EOPNOTSUPP;
2532}
2456 2533
2457 for (i = 0; i < bus->dev_count; i++) { 2534/* kvm_io_bus_read - called under kvm->slots_lock */
2458 struct kvm_io_device *pos = bus->devs[i]; 2535int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
2536{
2537 int i;
2538 for (i = 0; i < bus->dev_count; i++)
2539 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2540 return 0;
2541 return -EOPNOTSUPP;
2542}
2459 2543
2460 if (pos->in_range(pos, addr, len, is_write)) 2544int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
2461 return pos; 2545 struct kvm_io_device *dev)
2462 } 2546{
2547 int ret;
2463 2548
2464 return NULL; 2549 down_write(&kvm->slots_lock);
2550 ret = __kvm_io_bus_register_dev(bus, dev);
2551 up_write(&kvm->slots_lock);
2552
2553 return ret;
2465} 2554}
2466 2555
2467void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) 2556/* An unlocked version. Caller must have write lock on slots_lock. */
2557int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
2558 struct kvm_io_device *dev)
2468{ 2559{
2469 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); 2560 if (bus->dev_count > NR_IOBUS_DEVS-1)
2561 return -ENOSPC;
2470 2562
2471 bus->devs[bus->dev_count++] = dev; 2563 bus->devs[bus->dev_count++] = dev;
2564
2565 return 0;
2566}
2567
2568void kvm_io_bus_unregister_dev(struct kvm *kvm,
2569 struct kvm_io_bus *bus,
2570 struct kvm_io_device *dev)
2571{
2572 down_write(&kvm->slots_lock);
2573 __kvm_io_bus_unregister_dev(bus, dev);
2574 up_write(&kvm->slots_lock);
2575}
2576
2577/* An unlocked version. Caller must have write lock on slots_lock. */
2578void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
2579 struct kvm_io_device *dev)
2580{
2581 int i;
2582
2583 for (i = 0; i < bus->dev_count; i++)
2584 if (bus->devs[i] == dev) {
2585 bus->devs[i] = bus->devs[--bus->dev_count];
2586 break;
2587 }
2472} 2588}
2473 2589
2474static struct notifier_block kvm_cpu_notifier = { 2590static struct notifier_block kvm_cpu_notifier = {
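Under the new contract a device's read/write handlers return 0 when the access was theirs and -EOPNOTSUPP otherwise, and registration can fail with -ENOSPC instead of hitting a BUG_ON. A hedged in-kernel sketch of such a device (the device and its ops are hypothetical; kvm_iodevice_init()/kvm_io_device_ops are assumed to match the iodev.h changes in this series):

#include <linux/kvm_host.h>
#include "iodev.h"

struct my_mmio_dev {
	struct kvm_io_device dev;
	gpa_t base;
	u32 reg;
};

static int my_mmio_write(struct kvm_io_device *this,
			 gpa_t addr, int len, const void *val)
{
	struct my_mmio_dev *d = container_of(this, struct my_mmio_dev, dev);

	if (addr != d->base || len != 4)
		return -EOPNOTSUPP;	/* not ours: the bus keeps looking */
	d->reg = *(const u32 *)val;
	return 0;			/* handled */
}

static const struct kvm_io_device_ops my_mmio_ops = {
	.write = my_mmio_write,
};

static int my_mmio_attach(struct kvm *kvm, struct my_mmio_dev *d)
{
	kvm_iodevice_init(&d->dev, &my_mmio_ops);
	/* May now fail with -ENOSPC when the bus is full. */
	return kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &d->dev);
}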
@@ -2501,11 +2617,9 @@ static int vcpu_stat_get(void *_offset, u64 *val)
2501 *val = 0; 2617 *val = 0;
2502 spin_lock(&kvm_lock); 2618 spin_lock(&kvm_lock);
2503 list_for_each_entry(kvm, &vm_list, vm_list) 2619 list_for_each_entry(kvm, &vm_list, vm_list)
2504 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2620 kvm_for_each_vcpu(i, vcpu, kvm)
2505 vcpu = kvm->vcpus[i]; 2621 *val += *(u32 *)((void *)vcpu + offset);
2506 if (vcpu) 2622
2507 *val += *(u32 *)((void *)vcpu + offset);
2508 }
2509 spin_unlock(&kvm_lock); 2623 spin_unlock(&kvm_lock);
2510 return 0; 2624 return 0;
2511} 2625}
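kvm_for_each_vcpu() is the reader side of the online_vcpus publication done in kvm_vm_ioctl_create_vcpu(): it only walks slots below the published count. Roughly the shape of the helpers involved (an approximation of the definitions in include/linux/kvm_host.h, not a verbatim copy):

static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
{
	smp_rmb();		/* pairs with the creator's smp_wmb() */
	return kvm->vcpus[i];
}

#define kvm_for_each_vcpu(idx, vcpup, kvm) \
	for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \
	     idx < atomic_read(&(kvm)->online_vcpus) && vcpup; \
	     vcpup = kvm_get_vcpu(kvm, ++idx))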
@@ -2679,15 +2793,15 @@ out_free_0:
2679 __free_page(bad_page); 2793 __free_page(bad_page);
2680out: 2794out:
2681 kvm_arch_exit(); 2795 kvm_arch_exit();
2682 kvm_exit_debug();
2683out_fail: 2796out_fail:
2797 kvm_exit_debug();
2684 return r; 2798 return r;
2685} 2799}
2686EXPORT_SYMBOL_GPL(kvm_init); 2800EXPORT_SYMBOL_GPL(kvm_init);
2687 2801
2688void kvm_exit(void) 2802void kvm_exit(void)
2689{ 2803{
2690 kvm_trace_cleanup(); 2804 tracepoint_synchronize_unregister();
2691 misc_deregister(&kvm_dev); 2805 misc_deregister(&kvm_dev);
2692 kmem_cache_destroy(kvm_vcpu_cache); 2806 kmem_cache_destroy(kvm_vcpu_cache);
2693 sysdev_unregister(&kvm_sysdev); 2807 sysdev_unregister(&kvm_sysdev);
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
deleted file mode 100644
index f59874446440..000000000000
--- a/virt/kvm/kvm_trace.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * kvm trace
3 *
4 * It is designed to allow debugging traces of kvm to be generated
5 * on UP / SMP machines. Each trace entry can be timestamped so that
6 * it's possible to reconstruct a chronological record of trace events.
7 * The implementation refers to blktrace kernel support.
8 *
9 * Copyright (c) 2008 Intel Corporation
10 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
11 *
12 * Authors: Feng(Eric) Liu, eric.e.liu@intel.com
13 *
14 * Date: Feb 2008
15 */
16
17#include <linux/module.h>
18#include <linux/relay.h>
19#include <linux/debugfs.h>
20#include <linux/ktime.h>
21
22#include <linux/kvm_host.h>
23
24#define KVM_TRACE_STATE_RUNNING (1 << 0)
25#define KVM_TRACE_STATE_PAUSE (1 << 1)
26#define KVM_TRACE_STATE_CLEARUP (1 << 2)
27
28struct kvm_trace {
29 int trace_state;
30 struct rchan *rchan;
31 struct dentry *lost_file;
32 atomic_t lost_records;
33};
34static struct kvm_trace *kvm_trace;
35
36struct kvm_trace_probe {
37 const char *name;
38 const char *format;
39 u32 timestamp_in;
40 marker_probe_func *probe_func;
41};
42
43static inline int calc_rec_size(int timestamp, int extra)
44{
45 int rec_size = KVM_TRC_HEAD_SIZE;
46
47 rec_size += extra;
48 return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
49}
50
51static void kvm_add_trace(void *probe_private, void *call_data,
52 const char *format, va_list *args)
53{
54 struct kvm_trace_probe *p = probe_private;
55 struct kvm_trace *kt = kvm_trace;
56 struct kvm_trace_rec rec;
57 struct kvm_vcpu *vcpu;
58 int i, size;
59 u32 extra;
60
61 if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
62 return;
63
64 rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32));
65 vcpu = va_arg(*args, struct kvm_vcpu *);
66 rec.pid = current->tgid;
67 rec.vcpu_id = vcpu->vcpu_id;
68
69 extra = va_arg(*args, u32);
70 WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
71 extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX);
72
73 rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
74 | TRACE_REC_NUM_DATA_ARGS(extra);
75
76 if (p->timestamp_in) {
77 rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
78
79 for (i = 0; i < extra; i++)
80 rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
81 } else {
82 for (i = 0; i < extra; i++)
83 rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
84 }
85
86 size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
87 relay_write(kt->rchan, &rec, size);
88}
89
90static struct kvm_trace_probe kvm_trace_probes[] = {
91 { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace },
92 { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace },
93};
94
95static int lost_records_get(void *data, u64 *val)
96{
97 struct kvm_trace *kt = data;
98
99 *val = atomic_read(&kt->lost_records);
100 return 0;
101}
102
103DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n");
104
105/*
106 * The relay channel is used in "no-overwrite" mode, it keeps trace of how
107 * many times we encountered a full subbuffer, to tell user space app the
108 * lost records there were.
109 */
110static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
111 void *prev_subbuf, size_t prev_padding)
112{
113 struct kvm_trace *kt;
114
115 if (!relay_buf_full(buf)) {
116 if (!prev_subbuf) {
117 /*
118 * executed only once when the channel is opened
119 * save metadata as first record
120 */
121 subbuf_start_reserve(buf, sizeof(u32));
122 *(u32 *)subbuf = 0x12345678;
123 }
124
125 return 1;
126 }
127
128 kt = buf->chan->private_data;
129 atomic_inc(&kt->lost_records);
130
131 return 0;
132}
133
134static struct dentry *kvm_create_buf_file_callack(const char *filename,
135 struct dentry *parent,
136 int mode,
137 struct rchan_buf *buf,
138 int *is_global)
139{
140 return debugfs_create_file(filename, mode, parent, buf,
141 &relay_file_operations);
142}
143
144static int kvm_remove_buf_file_callback(struct dentry *dentry)
145{
146 debugfs_remove(dentry);
147 return 0;
148}
149
150static struct rchan_callbacks kvm_relay_callbacks = {
151 .subbuf_start = kvm_subbuf_start_callback,
152 .create_buf_file = kvm_create_buf_file_callack,
153 .remove_buf_file = kvm_remove_buf_file_callback,
154};
155
156static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts)
157{
158 struct kvm_trace *kt;
159 int i, r = -ENOMEM;
160
161 if (!kuts->buf_size || !kuts->buf_nr)
162 return -EINVAL;
163
164 kt = kzalloc(sizeof(*kt), GFP_KERNEL);
165 if (!kt)
166 goto err;
167
168 r = -EIO;
169 atomic_set(&kt->lost_records, 0);
170 kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir,
171 kt, &kvm_trace_lost_ops);
172 if (!kt->lost_file)
173 goto err;
174
175 kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size,
176 kuts->buf_nr, &kvm_relay_callbacks, kt);
177 if (!kt->rchan)
178 goto err;
179
180 kvm_trace = kt;
181
182 for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
183 struct kvm_trace_probe *p = &kvm_trace_probes[i];
184
185 r = marker_probe_register(p->name, p->format, p->probe_func, p);
186 if (r)
187 printk(KERN_INFO "Unable to register probe %s\n",
188 p->name);
189 }
190
191 kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING;
192
193 return 0;
194err:
195 if (kt) {
196 if (kt->lost_file)
197 debugfs_remove(kt->lost_file);
198 if (kt->rchan)
199 relay_close(kt->rchan);
200 kfree(kt);
201 }
202 return r;
203}
204
205static int kvm_trace_enable(char __user *arg)
206{
207 struct kvm_user_trace_setup kuts;
208 int ret;
209
210 ret = copy_from_user(&kuts, arg, sizeof(kuts));
211 if (ret)
212 return -EFAULT;
213
214 ret = do_kvm_trace_enable(&kuts);
215 if (ret)
216 return ret;
217
218 return 0;
219}
220
221static int kvm_trace_pause(void)
222{
223 struct kvm_trace *kt = kvm_trace;
224 int r = -EINVAL;
225
226 if (kt == NULL)
227 return r;
228
229 if (kt->trace_state == KVM_TRACE_STATE_RUNNING) {
230 kt->trace_state = KVM_TRACE_STATE_PAUSE;
231 relay_flush(kt->rchan);
232 r = 0;
233 }
234
235 return r;
236}
237
238void kvm_trace_cleanup(void)
239{
240 struct kvm_trace *kt = kvm_trace;
241 int i;
242
243 if (kt == NULL)
244 return;
245
246 if (kt->trace_state == KVM_TRACE_STATE_RUNNING ||
247 kt->trace_state == KVM_TRACE_STATE_PAUSE) {
248
249 kt->trace_state = KVM_TRACE_STATE_CLEARUP;
250
251 for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
252 struct kvm_trace_probe *p = &kvm_trace_probes[i];
253 marker_probe_unregister(p->name, p->probe_func, p);
254 }
255 marker_synchronize_unregister();
256
257 relay_close(kt->rchan);
258 debugfs_remove(kt->lost_file);
259 kfree(kt);
260 }
261}
262
263int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
264{
265 void __user *argp = (void __user *)arg;
266 long r = -EINVAL;
267
268 if (!capable(CAP_SYS_ADMIN))
269 return -EPERM;
270
271 switch (ioctl) {
272 case KVM_TRACE_ENABLE:
273 r = kvm_trace_enable(argp);
274 break;
275 case KVM_TRACE_PAUSE:
276 r = kvm_trace_pause();
277 break;
278 case KVM_TRACE_DISABLE:
279 r = 0;
280 kvm_trace_cleanup();
281 break;
282 }
283
284 return r;
285}
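With the relay/marker machinery removed, KVM event tracing is expected to go through the generic tracepoint infrastructure instead (hence the tracepoint_synchronize_unregister() now called from kvm_exit()). A schematic TRACE_EVENT in that style, showing the general shape only; the event below is made up, not one of the real kvm tracepoints:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm

#if !defined(_TRACE_KVM_EXAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_EXAMPLE_H

#include <linux/tracepoint.h>

TRACE_EVENT(kvm_example_exit,		/* hypothetical event name */
	TP_PROTO(unsigned int vcpu_id, unsigned int exit_reason),
	TP_ARGS(vcpu_id, exit_reason),

	TP_STRUCT__entry(
		__field(unsigned int, vcpu_id)
		__field(unsigned int, exit_reason)
	),

	TP_fast_assign(
		__entry->vcpu_id     = vcpu_id;
		__entry->exit_reason = exit_reason;
	),

	TP_printk("vcpu %u reason %u",
		  __entry->vcpu_id, __entry->exit_reason)
);

#endif /* _TRACE_KVM_EXAMPLE_H */

/* Must stay outside the include guard. */
#include <trace/define_trace.h>

Once compiled in, such events are enabled at runtime through the kernel's generic tracing interface (the events/kvm directory under the tracing debugfs mount) rather than through the KVM_TRACE_* ioctls, which now return -EOPNOTSUPP.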