Diffstat (limited to 'virt/kvm/eventfd.c')
 -rw-r--r--  virt/kvm/eventfd.c  |  578
 1 file changed, 578 insertions, 0 deletions
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 000000000000..bb4ebd89b9ff
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,578 @@
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell. All Rights Reserved.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>

#include "iodev.h"

/*
 * --------------------------------------------------------------------
 * irqfd: Allows an fd to be used to inject an interrupt to the guest
 *
 * Credit goes to Avi Kivity for the original idea.
 * --------------------------------------------------------------------
 */
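
/*
 * Illustrative userspace usage (a minimal sketch, not part of this
 * file; "vmfd" is an assumed open VM file descriptor, the rest is the
 * KVM_IRQFD ABI from <linux/kvm.h>):
 *
 *	int efd = eventfd(0, 0);
 *	struct kvm_irqfd req = { .fd = efd, .gsi = 5 };
 *
 *	ioctl(vmfd, KVM_IRQFD, &req);	// bind efd to guest GSI 5
 *
 *	uint64_t one = 1;
 *	write(efd, &one, sizeof(one));	// irqfd_inject() pulses GSI 5
 */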

struct _irqfd {
	struct kvm *kvm;
	struct eventfd_ctx *eventfd;
	int gsi;
	struct list_head list;
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct inject;
	struct work_struct shutdown;
};

static struct workqueue_struct *irqfd_cleanup_wq;

static void
irqfd_inject(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	mutex_lock(&kvm->irq_lock);
	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
	mutex_unlock(&kvm->irq_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	remove_wait_queue(irqfd->wqh, &irqfd->wait);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed
	 */
	flush_work(&irqfd->inject);

	/*
	 * It is now safe to release the object's resources
	 */
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}

/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
{
	return !list_empty(&irqfd->list);
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct _irqfd *irqfd)
{
	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
	unsigned long flags = (unsigned long)key;

	if (flags & POLLIN)
		/* An event has been signaled, inject an interrupt */
		schedule_work(&irqfd->inject);

	if (flags & POLLHUP) {
		/* The eventfd is closing, detach from KVM */
		struct kvm *kvm = irqfd->kvm;
		unsigned long flags;

		spin_lock_irqsave(&kvm->irqfds.lock, flags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue. If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
	}

	return 0;
}
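
/*
 * Note the lifecycle consequence of the POLLHUP path above (sketch):
 * userspace never strictly needs an explicit deassign, since dropping
 * the last reference to the eventfd tears the irqfd down by itself:
 *
 *	close(efd);	// final release -> POLLHUP -> irqfd_deactivate()
 */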

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);

	irqfd->wqh = wqh;
	add_wait_queue(wqh, &irqfd->wait);
}

static int
kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
{
	struct _irqfd *irqfd;
	struct file *file = NULL;
	struct eventfd_ctx *eventfd = NULL;
	int ret;
	unsigned int events;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);

	file = eventfd_fget(fd);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto fail;
	}

	eventfd = eventfd_ctx_fileget(file);
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->eventfd = eventfd;

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

	events = file->f_op->poll(file, &irqfd->pt);

	spin_lock_irq(&kvm->irqfds.lock);
	list_add_tail(&irqfd->list, &kvm->irqfds.items);
	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	if (events & POLLIN)
		schedule_work(&irqfd->inject);

	/*
	 * do not drop the file until the irqfd is fully initialized, otherwise
	 * we might race against the POLLHUP
	 */
	fput(file);

	return 0;

fail:
	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	if (!IS_ERR(file))
		fput(file);

	kfree(irqfd);
	return ret;
}

void
kvm_eventfd_init(struct kvm *kvm)
{
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->ioeventfds);
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
{
	struct _irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
			irqfd_deactivate(irqfd);
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int
kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
{
	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, fd, gsi);

	return kvm_irqfd_assign(kvm, fd, gsi);
}
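
/*
 * Deassign from userspace (sketch, reusing "req" from the assign
 * example above): the same ioctl with the deassign flag set unbinds
 * the eventfd from the GSI:
 *
 *	req.flags = KVM_IRQFD_FLAG_DEASSIGN;
 *	ioctl(vmfd, KVM_IRQFD, &req);
 */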

/*
 * This function is called as the kvm VM fd is being released. Shut down
 * all irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct _irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated single-thread
 * queue to prevent deadlock against flushing the normal work-queue.
 */
static int __init irqfd_module_init(void)
{
	irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void __exit irqfd_module_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

module_init(irqfd_module_init);
module_exit(irqfd_module_exit);

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */
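
/*
 * Illustrative userspace usage (a minimal sketch, not part of this
 * file; "vmfd" is an assumed open VM file descriptor): signal an
 * eventfd whenever the guest writes the 2-byte value 0x1 to MMIO
 * address 0x1000:
 *
 *	int efd = eventfd(0, 0);
 *	struct kvm_ioeventfd req = {
 *		.addr      = 0x1000,
 *		.len       = 2,
 *		.fd        = efd,
 *		.flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
 *		.datamatch = 0x1,
 *	};
 *
 *	ioctl(vmfd, KVM_IOEVENTFD, &req);
 *
 * Omitting KVM_IOEVENTFD_FLAG_DATAMATCH registers a wildcard that
 * fires on any 2-byte write to 0x1000; adding KVM_IOEVENTFD_FLAG_PIO
 * targets the PIO bus instead of the MMIO bus.
 */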

struct _ioeventfd {
	struct list_head list;
	u64 addr;
	int length;
	struct eventfd_ctx *eventfd;
	u64 datamatch;
	struct kvm_io_device dev;
	bool wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (!(addr == p->addr && len == p->length))
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}
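
/*
 * Worked example of the matching rules above: an ioeventfd registered
 * with addr=0x1000, len=2, datamatch=0x1234 fires only for a 2-byte
 * guest write of 0x1234 to 0x1000. A 4-byte write of 0x1234 to the
 * same address misses (length mismatch), as does a 2-byte write of
 * 0x1235 (data mismatch). A wildcard registration at addr=0x1000,
 * len=2 fires on any 2-byte write there, regardless of the value.
 */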

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
		const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd, 1);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down. We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write      = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->addr == p->addr && _p->length == p->length &&
		    (_p->wildcard || p->wildcard ||
		     _p->datamatch == p->datamatch))
			return true;

	return false;
}
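
/*
 * Example of the collision rules above: with a wildcard ioeventfd at
 * addr=0x1000/len=2 already registered, any second registration at
 * the same addr/len collides, even one with a datamatch, because the
 * wildcard already claims every value. Two datamatch registrations at
 * the same addr/len collide only if their datamatch values are equal.
 */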

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
	struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
	struct _ioeventfd *p;
	struct eventfd_ctx *eventfd;
	int ret;

	/* must be natural-word sized */
	switch (args->len) {
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr = args->addr;
	p->length = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	down_write(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = __kvm_io_bus_register_dev(bus, &p->dev);
	if (ret < 0)
		goto unlock_fail;

	list_add_tail(&p->list, &kvm->ioeventfds);

	up_write(&kvm->slots_lock);

	return 0;

unlock_fail:
	up_write(&kvm->slots_lock);

fail:
	kfree(p);
	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
	struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
	struct _ioeventfd *p, *tmp;
	struct eventfd_ctx *eventfd;
	int ret = -ENOENT;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	down_write(&kvm->slots_lock);

	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

		if (p->eventfd != eventfd ||
		    p->addr != args->addr ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		__kvm_io_bus_unregister_dev(bus, &p->dev);
		ioeventfd_release(p);
		ret = 0;
		break;
	}

	up_write(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}
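
/*
 * Host-side consumption (sketch, continuing the registration example
 * above): after a matching guest write, eventfd_signal() bumps the
 * eventfd counter and a userspace thread polling efd can collect it:
 *
 *	uint64_t cnt;
 *	read(efd, &cnt, sizeof(cnt));	// cnt = signals since last read
 */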