author     Alex Williamson <alex.williamson@redhat.com>   2012-07-31 10:16:22 -0400
committer  Alex Williamson <alex.williamson@redhat.com>   2012-07-31 10:16:22 -0400
commit     cba3345cc494ad286ca8823f44b2c16cae496679 (patch)
tree       a4cd502d0abb4dbf0c6e59f998152a22cb4b1606
parent     2e3ee613480563a6d5c01b57d342e65cc58c06df (diff)
vfio: VFIO core
VFIO is a secure user level driver for use with both virtual machines
and user level drivers.  VFIO makes use of IOMMU groups to ensure the
isolation of devices in use, allowing unprivileged user access.  It's
intended that VFIO will replace KVM device assignment and UIO drivers
(in cases where the target platform includes a sufficiently capable
IOMMU).

New in this version of VFIO is support for IOMMU groups managed
through the IOMMU core as well as a rework of the API, removing the
group merge interface.  We now go back to a model more similar to
original VFIO with UIOMMU support where the file descriptor obtained
from /dev/vfio/vfio allows access to the IOMMU, but only after a group
is added, avoiding the previous privilege issues with this type of
model.  IOMMU support is also now fully modular as IOMMUs have vastly
different interface requirements on different platforms.  VFIO users
are able to query and initialize the IOMMU model of their choice.

Please see the follow-on Documentation commit for further description
and usage example.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
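To make the new model concrete, the intended userspace flow looks
roughly like this (an illustrative sketch, not part of this patch;
the group number "26", the device name, and the IOMMU_TYPE value are
placeholders, since no IOMMU extensions are defined yet):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

#define IOMMU_TYPE 1    /* placeholder: no IOMMU extensions defined yet */

int main(void)
{
        int container, group, device;
        struct vfio_group_status status = { .argsz = sizeof(status) };

        /* A new container is completely unprivileged until a group is added */
        container = open("/dev/vfio/vfio", O_RDWR);
        if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
                return 1;       /* unknown API version */

        /* Access to the group file is what conveys the privilege */
        group = open("/dev/vfio/26", O_RDWR);   /* "26" is a placeholder */
        ioctl(group, VFIO_GROUP_GET_STATUS, &status);
        if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
                return 1;       /* a device in the group lacks a vfio driver */

        /* Attach the group; only then may an IOMMU model be enabled */
        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, IOMMU_TYPE);

        /* Device fds are requested by sysfs device name */
        device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

        return device < 0;
}

The key point of the rework is visible here: the container fd alone
can do nothing until a group, obtained through its own permission
check, is attached to it.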
-rw-r--r--  Documentation/ioctl/ioctl-number.txt     1
-rw-r--r--  MAINTAINERS                              8
-rw-r--r--  drivers/Kconfig                          2
-rw-r--r--  drivers/Makefile                         1
-rw-r--r--  drivers/vfio/Kconfig                     8
-rw-r--r--  drivers/vfio/Makefile                    1
-rw-r--r--  drivers/vfio/vfio.c                   1413
-rw-r--r--  include/linux/vfio.h                   367
8 files changed, 1801 insertions, 0 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 915f28c470e9..849b771c5e03 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -88,6 +88,7 @@ Code Seq#(hex) Include File Comments
 		and kernel/power/user.c
 '8'	all	SNP8023 advanced NIC card
 		<mailto:mcr@solidum.com>
+';'	64-7F	linux/vfio.h
 '@'	00-0F	linux/radeonfb.h	conflict!
 '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
 'A'	00-1F	linux/apm_bios.h	conflict!
diff --git a/MAINTAINERS b/MAINTAINERS
index fb036a062a5d..eceaab93811e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7411,6 +7411,14 @@ S: Maintained
 F:	Documentation/filesystems/vfat.txt
 F:	fs/fat/
 
+VFIO DRIVER
+M:	Alex Williamson <alex.williamson@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	Documentation/vfio.txt
+F:	drivers/vfio/
+F:	include/linux/vfio.h
+
 VIDEOBUF2 FRAMEWORK
 M:	Pawel Osciak <pawel@osciak.com>
 M:	Marek Szyprowski <m.szyprowski@samsung.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 805c432c9439..ece958d3762e 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
 
 source "drivers/uio/Kconfig"
 
+source "drivers/vfio/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index bd36f09f2246..5b421840c48d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_ATM) += atm/
 obj-$(CONFIG_FUSION)		+= message/
 obj-y				+= firewire/
 obj-$(CONFIG_UIO)		+= uio/
+obj-$(CONFIG_VFIO)		+= vfio/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 000000000000..9acb1e729bd6
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,8 @@
menuconfig VFIO
	tristate "VFIO Non-Privileged userspace driver framework"
	depends on IOMMU_API
	help
	  VFIO provides a framework for secure userspace device drivers.
	  See Documentation/vfio.txt for more details.

	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 000000000000..7500a67a42a0
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_VFIO) += vfio.o
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
new file mode 100644
index 000000000000..052e310aed72
--- /dev/null
+++ b/drivers/vfio/vfio.c
@@ -0,0 +1,1413 @@
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION "0.3"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO - User Level meta-driver"

static struct vfio {
        struct class *class;
        struct list_head iommu_drivers_list;
        struct mutex iommu_drivers_lock;
        struct list_head group_list;
        struct idr group_idr;
        struct mutex group_lock;
        struct cdev group_cdev;
        struct device *dev;
        dev_t devt;
        struct cdev cdev;
        wait_queue_head_t release_q;
} vfio;

struct vfio_iommu_driver {
        const struct vfio_iommu_driver_ops *ops;
        struct list_head vfio_next;
};

struct vfio_container {
        struct kref kref;
        struct list_head group_list;
        struct mutex group_lock;
        struct vfio_iommu_driver *iommu_driver;
        void *iommu_data;
};

struct vfio_group {
        struct kref kref;
        int minor;
        atomic_t container_users;
        struct iommu_group *iommu_group;
        struct vfio_container *container;
        struct list_head device_list;
        struct mutex device_lock;
        struct device *dev;
        struct notifier_block nb;
        struct list_head vfio_next;
        struct list_head container_next;
};

struct vfio_device {
        struct kref kref;
        struct device *dev;
        const struct vfio_device_ops *ops;
        struct vfio_group *group;
        struct list_head group_next;
        void *device_data;
};

/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver, *tmp;

        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
        if (!driver)
                return -ENOMEM;

        driver->ops = ops;

        mutex_lock(&vfio.iommu_drivers_lock);

        /* Check for duplicates */
        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
                if (tmp->ops == ops) {
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return -EINVAL;
                }
        }

        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

        mutex_unlock(&vfio.iommu_drivers_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver;

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                if (driver->ops == ops) {
                        list_del(&driver->vfio_next);
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return;
                }
        }
        mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
        int ret, minor;

again:
        if (unlikely(idr_pre_get(&vfio.group_idr, GFP_KERNEL) == 0))
                return -ENOMEM;

        /* index 0 is used by /dev/vfio/vfio */
        ret = idr_get_new_above(&vfio.group_idr, group, 1, &minor);
        if (ret == -EAGAIN)
                goto again;
        if (ret || minor > MINORMASK) {
                if (minor > MINORMASK)
                        idr_remove(&vfio.group_idr, minor);
                return -ENOSPC;
        }

        return minor;
}

static void vfio_free_group_minor(int minor)
{
        idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref. Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
        kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
        struct vfio_container *container;
        container = container_of(kref, struct vfio_container, kref);

        kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
        kref_put(&container->kref, vfio_container_release);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
        struct vfio_group *group, *tmp;
        struct device *dev;
        int ret, minor;

        group = kzalloc(sizeof(*group), GFP_KERNEL);
        if (!group)
                return ERR_PTR(-ENOMEM);

        kref_init(&group->kref);
        INIT_LIST_HEAD(&group->device_list);
        mutex_init(&group->device_lock);
        atomic_set(&group->container_users, 0);
        group->iommu_group = iommu_group;

        group->nb.notifier_call = vfio_iommu_group_notifier;

        /*
         * blocking notifiers acquire a rwsem around registering and hold
         * it around callback. Therefore, need to register outside of
         * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
         * do anything unless it can find the group in vfio.group_list, so
         * no harm in registering early.
         */
        ret = iommu_group_register_notifier(iommu_group, &group->nb);
        if (ret) {
                kfree(group);
                return ERR_PTR(ret);
        }

        mutex_lock(&vfio.group_lock);

        minor = vfio_alloc_group_minor(group);
        if (minor < 0) {
                mutex_unlock(&vfio.group_lock);
                kfree(group);
                return ERR_PTR(minor);
        }

        /* Did we race creating this group? */
        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
                if (tmp->iommu_group == iommu_group) {
                        vfio_group_get(tmp);
                        vfio_free_group_minor(minor);
                        mutex_unlock(&vfio.group_lock);
                        kfree(group);
                        return tmp;
                }
        }

        dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor),
                            group, "%d", iommu_group_id(iommu_group));
        if (IS_ERR(dev)) {
                vfio_free_group_minor(minor);
                mutex_unlock(&vfio.group_lock);
                kfree(group);
                return (struct vfio_group *)dev; /* ERR_PTR */
        }

        group->minor = minor;
        group->dev = dev;

        list_add(&group->vfio_next, &vfio.group_list);

        mutex_unlock(&vfio.group_lock);

        return group;
}

static void vfio_group_release(struct kref *kref)
{
        struct vfio_group *group = container_of(kref, struct vfio_group, kref);

        WARN_ON(!list_empty(&group->device_list));

        device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor));
        list_del(&group->vfio_next);
        vfio_free_group_minor(group->minor);

        mutex_unlock(&vfio.group_lock);

        /*
         * Unregister outside of lock. A spurious callback is harmless now
         * that the group is no longer in vfio.group_list.
         */
        iommu_group_unregister_notifier(group->iommu_group, &group->nb);

        kfree(group);
}

static void vfio_group_put(struct vfio_group *group)
{
        mutex_lock(&vfio.group_lock);
        /*
         * Release needs to unlock to unregister the notifier, so only
         * unlock if not released.
         */
        if (!kref_put(&group->kref, vfio_group_release))
                mutex_unlock(&vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
        kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
        struct vfio_group *target = group;

        mutex_lock(&vfio.group_lock);
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group == target) {
                        vfio_group_get(group);
                        mutex_unlock(&vfio.group_lock);
                        return group;
                }
        }
        mutex_unlock(&vfio.group_lock);

        return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
        struct vfio_group *group;

        mutex_lock(&vfio.group_lock);
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group->iommu_group == iommu_group) {
                        vfio_group_get(group);
                        mutex_unlock(&vfio.group_lock);
                        return group;
                }
        }
        mutex_unlock(&vfio.group_lock);

        return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
        struct vfio_group *group;

        mutex_lock(&vfio.group_lock);
        group = idr_find(&vfio.group_idr, minor);
        if (!group) {
                mutex_unlock(&vfio.group_lock);
                return NULL;
        }
        vfio_group_get(group);
        mutex_unlock(&vfio.group_lock);

        return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
                                             struct device *dev,
                                             const struct vfio_device_ops *ops,
                                             void *device_data)
{
        struct vfio_device *device;
        int ret;

        device = kzalloc(sizeof(*device), GFP_KERNEL);
        if (!device)
                return ERR_PTR(-ENOMEM);

        kref_init(&device->kref);
        device->dev = dev;
        device->group = group;
        device->ops = ops;
        device->device_data = device_data;

        ret = dev_set_drvdata(dev, device);
        if (ret) {
                kfree(device);
                return ERR_PTR(ret);
        }

        /* No need to get group_lock, caller has group reference */
        vfio_group_get(group);

        mutex_lock(&group->device_lock);
        list_add(&device->group_next, &group->device_list);
        mutex_unlock(&group->device_lock);

        return device;
}

static void vfio_device_release(struct kref *kref)
{
        struct vfio_device *device = container_of(kref,
                                                  struct vfio_device, kref);
        struct vfio_group *group = device->group;

        mutex_lock(&group->device_lock);
        list_del(&device->group_next);
        mutex_unlock(&group->device_lock);

        dev_set_drvdata(device->dev, NULL);

        kfree(device);

        /* vfio_del_group_dev may be waiting for this device */
        wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
static void vfio_device_put(struct vfio_device *device)
{
        kref_put(&device->kref, vfio_device_release);
        vfio_group_put(device->group);
}

static void vfio_device_get(struct vfio_device *device)
{
        vfio_group_get(device->group);
        kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
                                                 struct device *dev)
{
        struct vfio_device *device;

        mutex_lock(&group->device_lock);
        list_for_each_entry(device, &group->device_list, group_next) {
                if (device->dev == dev) {
                        vfio_device_get(device);
                        mutex_unlock(&group->device_lock);
                        return device;
                }
        }
        mutex_unlock(&group->device_lock);
        return NULL;
}

/*
 * Whitelist some drivers that we know are safe (no dma) or just sit on
 * a device. It's not always practical to leave a device within a group
 * driverless as it could get re-bound to something unsafe.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_whitelisted_driver(struct device_driver *drv)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
                if (!strcmp(drv->name, vfio_driver_whitelist[i]))
                        return true;
        }

        return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are either
 * driver-less or bound to a vfio or whitelisted driver. We test the
 * latter by the existence of a struct vfio_device matching the dev.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
        struct vfio_group *group = data;
        struct vfio_device *device;

        if (!dev->driver || vfio_whitelisted_driver(dev->driver))
                return 0;

        device = vfio_group_get_device(group, dev);
        if (device) {
                vfio_device_put(device);
                return 0;
        }

        return -EINVAL;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
        struct vfio_device *device;

        /* Do we already know about it? We shouldn't */
        device = vfio_group_get_device(group, dev);
        if (WARN_ON_ONCE(device)) {
                vfio_device_put(device);
                return 0;
        }

        /* Nothing to do for idle groups */
        if (!atomic_read(&group->container_users))
                return 0;

        /* TODO Prevent device auto probing */
        WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
             iommu_group_id(group->iommu_group));

        return 0;
}

static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev)
{
        struct vfio_device *device;

        /*
         * Expect to fall out here. If a device was in use, it would
         * have been bound to a vfio sub-driver, which would have blocked
         * in .remove at vfio_del_group_dev. Sanity check that we no
         * longer track the device, so it's safe to remove.
         */
        device = vfio_group_get_device(group, dev);
        if (likely(!device))
                return 0;

        WARN(1, "Device %s removed from live group %d!\n", dev_name(dev),
             iommu_group_id(group->iommu_group));
528
529 vfio_device_put(device);
530 return 0;
531}
532
533static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
534{
535 /* We don't care what happens when the group isn't in use */
536 if (!atomic_read(&group->container_users))
537 return 0;
538
539 return vfio_dev_viable(dev, group);
540}
541
542static int vfio_iommu_group_notifier(struct notifier_block *nb,
543 unsigned long action, void *data)
544{
545 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
546 struct device *dev = data;
547
548 /*
549 * Need to go through a group_lock lookup to get a reference or
550 * we risk racing a group being removed. Leave a WARN_ON for
         * debugging, but if the group no longer exists, a spurious notify
         * is harmless.
         */
        group = vfio_group_try_get(group);
        if (WARN_ON(!group))
                return NOTIFY_OK;

        switch (action) {
        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
                vfio_group_nb_add_dev(group, dev);
                break;
        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
                vfio_group_nb_del_dev(group, dev);
                break;
        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
                pr_debug("%s: Device %s, group %d binding to driver\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group));
                break;
        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
                pr_debug("%s: Device %s, group %d bound to driver %s\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group), dev->driver->name);
                BUG_ON(vfio_group_nb_verify(group, dev));
                break;
        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
                pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group), dev->driver->name);
                break;
        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
                pr_debug("%s: Device %s, group %d unbound from driver\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group));
                /*
                 * XXX An unbound device in a live group is ok, but we'd
                 * really like to avoid the above BUG_ON by preventing other
                 * drivers from binding to it. Once that occurs, we have to
                 * stop the system to maintain isolation. At a minimum, we'd
                 * want a toggle to disable driver auto probe for this device.
                 */
                break;
        }

        vfio_group_put(group);
        return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
                       const struct vfio_device_ops *ops, void *device_data)
{
        struct iommu_group *iommu_group;
        struct vfio_group *group;
        struct vfio_device *device;

        iommu_group = iommu_group_get(dev);
        if (!iommu_group)
                return -EINVAL;

        group = vfio_group_get_from_iommu(iommu_group);
        if (!group) {
                group = vfio_create_group(iommu_group);
                if (IS_ERR(group)) {
                        iommu_group_put(iommu_group);
                        return PTR_ERR(group);
                }
        }

        device = vfio_group_get_device(group, dev);
        if (device) {
                WARN(1, "Device %s already exists on group %d\n",
                     dev_name(dev), iommu_group_id(iommu_group));
                vfio_device_put(device);
                vfio_group_put(group);
                iommu_group_put(iommu_group);
                return -EBUSY;
        }

        device = vfio_group_create_device(group, dev, ops, device_data);
        if (IS_ERR(device)) {
                vfio_group_put(group);
                iommu_group_put(iommu_group);
                return PTR_ERR(device);
        }

        /*
         * Added device holds reference to iommu_group and vfio_device
         * (which in turn holds reference to vfio_group). Drop extra
         * group reference used while acquiring device.
         */
        vfio_group_put(group);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);

/* Test whether a struct device is present in our tracking */
static bool vfio_dev_present(struct device *dev)
{
        struct iommu_group *iommu_group;
        struct vfio_group *group;
        struct vfio_device *device;

        iommu_group = iommu_group_get(dev);
        if (!iommu_group)
                return false;

        group = vfio_group_get_from_iommu(iommu_group);
        if (!group) {
                iommu_group_put(iommu_group);
                return false;
        }

        device = vfio_group_get_device(group, dev);
        if (!device) {
                vfio_group_put(group);
                iommu_group_put(iommu_group);
                return false;
        }

        vfio_device_put(device);
        vfio_group_put(group);
        iommu_group_put(iommu_group);
        return true;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
        struct vfio_device *device = dev_get_drvdata(dev);
        struct vfio_group *group = device->group;
        struct iommu_group *iommu_group = group->iommu_group;
        void *device_data = device->device_data;

        vfio_device_put(device);

        /* TODO send a signal to encourage this to be released */
        wait_event(vfio.release_q, !vfio_dev_present(dev));

        iommu_group_put(iommu_group);

        return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);

/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
                                       unsigned long arg)
{
        struct vfio_iommu_driver *driver = container->iommu_driver;
        long ret = 0;

        switch (arg) {
                /* No base extensions yet */
        default:
                /*
                 * If no driver is set, poll all registered drivers for
                 * extensions and return the first positive result. If
                 * a driver is already set, further queries will be passed
                 * only to that driver.
                 */
                if (!driver) {
                        mutex_lock(&vfio.iommu_drivers_lock);
                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
                                            vfio_next) {
                                if (!try_module_get(driver->ops->owner))
                                        continue;

                                ret = driver->ops->ioctl(NULL,
                                                         VFIO_CHECK_EXTENSION,
                                                         arg);
                                module_put(driver->ops->owner);
                                if (ret > 0)
                                        break;
                        }
                        mutex_unlock(&vfio.iommu_drivers_lock);
                } else
                        ret = driver->ops->ioctl(container->iommu_data,
                                                 VFIO_CHECK_EXTENSION, arg);
        }

        return ret;
}

/* hold container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
                                          struct vfio_iommu_driver *driver,
                                          void *data)
{
        struct vfio_group *group;
        int ret = -ENODEV;

        list_for_each_entry(group, &container->group_list, container_next) {
                ret = driver->ops->attach_group(data, group->iommu_group);
                if (ret)
                        goto unwind;
        }

        return ret;

unwind:
        list_for_each_entry_continue_reverse(group, &container->group_list,
                                             container_next) {
                driver->ops->detach_group(data, group->iommu_group);
        }

        return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
                                 unsigned long arg)
{
        struct vfio_iommu_driver *driver;
        long ret = -ENODEV;

        mutex_lock(&container->group_lock);

        /*
         * The container is designed to be an unprivileged interface while
         * the group can be assigned to specific users. Therefore, only by
         * adding a group to a container does the user get the privilege of
         * enabling the iommu, which may allocate finite resources. There
         * is no unset_iommu, but by removing all the groups from a container,
         * the container is deprivileged and returns to an unset state.
         */
        if (list_empty(&container->group_list) || container->iommu_driver) {
                mutex_unlock(&container->group_lock);
                return -EINVAL;
        }

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                void *data;

                if (!try_module_get(driver->ops->owner))
                        continue;

                /*
                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
                 * so test which iommu driver reported support for this
                 * extension and call open on them. We also pass them the
                 * magic, allowing a single driver to support multiple
                 * interfaces if they'd like.
                 */
                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
                        module_put(driver->ops->owner);
                        continue;
                }

                /* module reference holds the driver we're working on */
                mutex_unlock(&vfio.iommu_drivers_lock);

                data = driver->ops->open(arg);
                if (IS_ERR(data)) {
                        ret = PTR_ERR(data);
                        module_put(driver->ops->owner);
                        goto skip_drivers_unlock;
                }

                ret = __vfio_container_attach_groups(container, driver, data);
                if (!ret) {
                        container->iommu_driver = driver;
                        container->iommu_data = data;
                } else {
                        driver->ops->release(data);
                        module_put(driver->ops->owner);
                }

                goto skip_drivers_unlock;
        }

        mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
        mutex_unlock(&container->group_lock);

        return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
                                unsigned int cmd, unsigned long arg)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        void *data;
        long ret = -EINVAL;

        if (!container)
                return ret;

        driver = container->iommu_driver;
        data = container->iommu_data;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                ret = VFIO_API_VERSION;
                break;
        case VFIO_CHECK_EXTENSION:
                ret = vfio_ioctl_check_extension(container, arg);
                break;
        case VFIO_SET_IOMMU:
                ret = vfio_ioctl_set_iommu(container, arg);
                break;
        default:
                if (driver) /* passthrough all unrecognized ioctls */
                        ret = driver->ops->ioctl(data, cmd, arg);
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
                                   unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_container *container;

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return -ENOMEM;

        INIT_LIST_HEAD(&container->group_list);
        mutex_init(&container->group_lock);
        kref_init(&container->kref);

        filep->private_data = container;

        return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_container *container = filep->private_data;

        filep->private_data = NULL;

        vfio_container_put(container);

        return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
                              size_t count, loff_t *ppos)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver = container->iommu_driver;

        if (unlikely(!driver || !driver->ops->read))
                return -EINVAL;

        return driver->ops->read(container->iommu_data, buf, count, ppos);
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
                               size_t count, loff_t *ppos)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver = container->iommu_driver;

        if (unlikely(!driver || !driver->ops->write))
                return -EINVAL;

        return driver->ops->write(container->iommu_data, buf, count, ppos);
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver = container->iommu_driver;

        if (unlikely(!driver || !driver->ops->mmap))
                return -EINVAL;

        return driver->ops->mmap(container->iommu_data, vma);
}

static const struct file_operations vfio_fops = {
        .owner = THIS_MODULE,
        .open = vfio_fops_open,
        .release = vfio_fops_release,
        .read = vfio_fops_read,
        .write = vfio_fops_write,
        .unlocked_ioctl = vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = vfio_fops_compat_ioctl,
#endif
        .mmap = vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
        struct vfio_container *container = group->container;
        struct vfio_iommu_driver *driver;

        mutex_lock(&container->group_lock);

        driver = container->iommu_driver;
        if (driver)
                driver->ops->detach_group(container->iommu_data,
                                          group->iommu_group);

        group->container = NULL;
        list_del(&group->container_next);

        /* Detaching the last group deprivileges a container, remove iommu */
        if (driver && list_empty(&container->group_list)) {
                driver->ops->release(container->iommu_data);
                module_put(driver->ops->owner);
                container->iommu_driver = NULL;
                container->iommu_data = NULL;
        }

        mutex_unlock(&container->group_lock);

        vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset. Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
        int users = atomic_cmpxchg(&group->container_users, 1, 0);

        if (!users)
                return -EINVAL;
        if (users != 1)
                return -EBUSY;

        __vfio_group_unset_container(group);

        return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container. That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
        if (0 == atomic_dec_if_positive(&group->container_users))
                __vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
        struct file *filep;
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret = 0;

        if (atomic_read(&group->container_users))
                return -EINVAL;

        filep = fget(container_fd);
        if (!filep)
                return -EBADF;

        /* Sanity check, is this really our fd? */
        if (filep->f_op != &vfio_fops) {
                fput(filep);
                return -EINVAL;
        }

        container = filep->private_data;
        WARN_ON(!container); /* fget ensures we don't race vfio_release */

        mutex_lock(&container->group_lock);

        driver = container->iommu_driver;
        if (driver) {
                ret = driver->ops->attach_group(container->iommu_data,
                                                group->iommu_group);
                if (ret)
                        goto unlock_out;
        }

        group->container = container;
        list_add(&group->container_next, &container->group_list);

        /* Get a reference on the container and mark a user within the group */
        vfio_container_get(container);
        atomic_inc(&group->container_users);

unlock_out:
        mutex_unlock(&container->group_lock);
        fput(filep);

        return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
        return (iommu_group_for_each_dev(group->iommu_group,
                                         group, vfio_dev_viable) == 0);
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
        struct vfio_device *device;
        struct file *filep;
        int ret = -ENODEV;

        if (0 == atomic_read(&group->container_users) ||
            !group->container->iommu_driver || !vfio_group_viable(group))
                return -EINVAL;

        mutex_lock(&group->device_lock);
        list_for_each_entry(device, &group->device_list, group_next) {
                if (strcmp(dev_name(device->dev), buf))
                        continue;

                ret = device->ops->open(device->device_data);
                if (ret)
                        break;
                /*
                 * We can't use anon_inode_getfd() because we need to modify
                 * the f_mode flags directly to allow more than just ioctls
                 */
                ret = get_unused_fd();
                if (ret < 0) {
                        device->ops->release(device->device_data);
                        break;
                }

                filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
                                           device, O_RDWR);
                if (IS_ERR(filep)) {
                        put_unused_fd(ret);
                        ret = PTR_ERR(filep);
                        device->ops->release(device->device_data);
                        break;
                }

                /*
                 * TODO: add an anon_inode interface to do this.
                 * Appears to be missing by lack of need rather than
                 * explicitly prevented. Now there's need.
                 */
                filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

                fd_install(ret, filep);

                vfio_device_get(device);
                atomic_inc(&group->container_users);
                break;
        }
        mutex_unlock(&group->device_lock);

        return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
                                      unsigned int cmd, unsigned long arg)
{
        struct vfio_group *group = filep->private_data;
        long ret = -ENOTTY;

        switch (cmd) {
        case VFIO_GROUP_GET_STATUS:
        {
                struct vfio_group_status status;
                unsigned long minsz;

                minsz = offsetofend(struct vfio_group_status, flags);

                if (copy_from_user(&status, (void __user *)arg, minsz))
                        return -EFAULT;

                if (status.argsz < minsz)
                        return -EINVAL;

                status.flags = 0;

                if (vfio_group_viable(group))
                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;

                if (group->container)
                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

                if (copy_to_user((void __user *)arg, &status, minsz))
                        return -EFAULT;

                ret = 0;
                break;
        }
        case VFIO_GROUP_SET_CONTAINER:
        {
                int fd;

                if (get_user(fd, (int __user *)arg))
                        return -EFAULT;

                if (fd < 0)
                        return -EINVAL;

                ret = vfio_group_set_container(group, fd);
                break;
        }
        case VFIO_GROUP_UNSET_CONTAINER:
                ret = vfio_group_unset_container(group);
                break;
        case VFIO_GROUP_GET_DEVICE_FD:
        {
                char *buf;

                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
                if (IS_ERR(buf))
                        return PTR_ERR(buf);

                ret = vfio_group_get_device_fd(group, buf);
                kfree(buf);
                break;
        }
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
                                         unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_group *group;

        group = vfio_group_get_from_minor(iminor(inode));
        if (!group)
                return -ENODEV;

        if (group->container) {
                vfio_group_put(group);
                return -EBUSY;
        }

        filep->private_data = group;

        return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_group *group = filep->private_data;

        filep->private_data = NULL;

        vfio_group_try_dissolve_container(group);

        vfio_group_put(group);

        return 0;
}

static const struct file_operations vfio_group_fops = {
        .owner = THIS_MODULE,
        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = vfio_group_fops_compat_ioctl,
#endif
        .open = vfio_group_fops_open,
        .release = vfio_group_fops_release,
};

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_device *device = filep->private_data;

        device->ops->release(device->device_data);

        vfio_group_try_dissolve_container(device->group);

        vfio_device_put(device);

        return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
                                       unsigned int cmd, unsigned long arg)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->ioctl))
                return -EINVAL;

        return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
                                     size_t count, loff_t *ppos)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->read))
                return -EINVAL;

        return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
                                      const char __user *buf,
                                      size_t count, loff_t *ppos)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->write))
                return -EINVAL;

        return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->mmap))
                return -EINVAL;

        return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
                                          unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
        .owner = THIS_MODULE,
        .release = vfio_device_fops_release,
        .read = vfio_device_fops_read,
        .write = vfio_device_fops_write,
        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = vfio_device_fops_compat_ioctl,
#endif
        .mmap = vfio_device_fops_mmap,
};

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static int __init vfio_init(void)
{
        int ret;

        idr_init(&vfio.group_idr);
        mutex_init(&vfio.group_lock);
        mutex_init(&vfio.iommu_drivers_lock);
        INIT_LIST_HEAD(&vfio.group_list);
        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
        init_waitqueue_head(&vfio.release_q);

        vfio.class = class_create(THIS_MODULE, "vfio");
        if (IS_ERR(vfio.class)) {
                ret = PTR_ERR(vfio.class);
                goto err_class;
        }

        vfio.class->devnode = vfio_devnode;

        ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
        if (ret)
                goto err_base_chrdev;

        cdev_init(&vfio.cdev, &vfio_fops);
        ret = cdev_add(&vfio.cdev, vfio.devt, 1);
        if (ret)
                goto err_base_cdev;

        vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio");
        if (IS_ERR(vfio.dev)) {
                ret = PTR_ERR(vfio.dev);
                goto err_base_dev;
        }

        /* /dev/vfio/$GROUP */
        cdev_init(&vfio.group_cdev, &vfio_group_fops);
        ret = cdev_add(&vfio.group_cdev,
                       MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1);
        if (ret)
                goto err_groups_cdev;

        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

        return 0;

err_groups_cdev:
        device_destroy(vfio.class, vfio.devt);
err_base_dev:
        cdev_del(&vfio.cdev);
err_base_cdev:
        unregister_chrdev_region(vfio.devt, MINORMASK);
err_base_chrdev:
        class_destroy(vfio.class);
        vfio.class = NULL;
err_class:
        return ret;
}

static void __exit vfio_cleanup(void)
{
        WARN_ON(!list_empty(&vfio.group_list));

        idr_destroy(&vfio.group_idr);
        cdev_del(&vfio.group_cdev);
        device_destroy(vfio.class, vfio.devt);
        cdev_del(&vfio.cdev);
        unregister_chrdev_region(vfio.devt, MINORMASK);
        class_destroy(vfio.class);
        vfio.class = NULL;
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
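Since IOMMU support is fully modular, a backend plugs into the core
through vfio_register_iommu_driver().  A minimal skeleton of such a
backend might look as follows (an illustrative sketch only: the "foo"
names and the FOO_IOMMU extension value are hypothetical; real
backends, e.g. for type1-style IOMMUs, are expected as follow-on
patches):

#include <linux/err.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vfio.h>

#define FOO_IOMMU 1     /* placeholder CHECK_EXTENSION/SET_IOMMU value */

struct foo_iommu {
        struct mutex lock;      /* per-container state would live here */
};

static void *foo_open(unsigned long arg)
{
        struct foo_iommu *iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);

        if (!iommu)
                return ERR_PTR(-ENOMEM);
        mutex_init(&iommu->lock);
        return iommu;
}

static void foo_release(void *iommu_data)
{
        kfree(iommu_data);
}

static long foo_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg)
{
        /* Advertise which VFIO_SET_IOMMU values this backend handles */
        if (cmd == VFIO_CHECK_EXTENSION)
                return arg == FOO_IOMMU;
        return -ENOTTY;
}

static int foo_attach_group(void *iommu_data, struct iommu_group *group)
{
        return 0;       /* a real backend would wire up the IOMMU domain */
}

static void foo_detach_group(void *iommu_data, struct iommu_group *group)
{
}

static const struct vfio_iommu_driver_ops foo_ops = {
        .name           = "foo",
        .owner          = THIS_MODULE,
        .open           = foo_open,
        .release        = foo_release,
        .ioctl          = foo_ioctl,
        .attach_group   = foo_attach_group,
        .detach_group   = foo_detach_group,
};

static int __init foo_init(void)
{
        return vfio_register_iommu_driver(&foo_ops);
}

static void __exit foo_exit(void)
{
        vfio_unregister_iommu_driver(&foo_ops);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL v2");

Because the core keeps a module reference on the selected backend
between open and release, an in-use backend cannot be unloaded out
from under a container.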
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
new file mode 100644
index 000000000000..03e56a5154b6
--- /dev/null
+++ b/include/linux/vfio.h
@@ -0,0 +1,367 @@
/*
 * VFIO API definition
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#ifndef VFIO_H
#define VFIO_H

#include <linux/types.h>
#include <linux/ioctl.h>

#define VFIO_API_VERSION 0

#ifdef __KERNEL__ /* Internal VFIO-core/bus driver API */

#include <linux/iommu.h>
#include <linux/mm.h>

/**
 * struct vfio_device_ops - VFIO bus driver device callbacks
 *
 * @open: Called when userspace creates new file descriptor for device
 * @release: Called when userspace releases file descriptor for device
 * @read: Perform read(2) on device file descriptor
 * @write: Perform write(2) on device file descriptor
 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
 *         operations documented below
 * @mmap: Perform mmap(2) on a region of the device file descriptor
 */
struct vfio_device_ops {
        char *name;
        int (*open)(void *device_data);
        void (*release)(void *device_data);
        ssize_t (*read)(void *device_data, char __user *buf,
                        size_t count, loff_t *ppos);
        ssize_t (*write)(void *device_data, const char __user *buf,
                         size_t count, loff_t *size);
        long (*ioctl)(void *device_data, unsigned int cmd,
                      unsigned long arg);
        int (*mmap)(void *device_data, struct vm_area_struct *vma);
};

extern int vfio_add_group_dev(struct device *dev,
                              const struct vfio_device_ops *ops,
                              void *device_data);

extern void *vfio_del_group_dev(struct device *dev);

/**
 * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
 */
struct vfio_iommu_driver_ops {
        char *name;
        struct module *owner;
        void *(*open)(unsigned long arg);
        void (*release)(void *iommu_data);
        ssize_t (*read)(void *iommu_data, char __user *buf,
                        size_t count, loff_t *ppos);
        ssize_t (*write)(void *iommu_data, const char __user *buf,
                         size_t count, loff_t *size);
        long (*ioctl)(void *iommu_data, unsigned int cmd,
                      unsigned long arg);
        int (*mmap)(void *iommu_data, struct vm_area_struct *vma);
        int (*attach_group)(void *iommu_data,
                            struct iommu_group *group);
        void (*detach_group)(void *iommu_data,
                             struct iommu_group *group);

};

extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);

extern void vfio_unregister_iommu_driver(
                const struct vfio_iommu_driver_ops *ops);

/**
 * offsetofend(TYPE, MEMBER)
 *
 * @TYPE: The type of the structure
 * @MEMBER: The member within the structure to get the end offset of
 *
 * Simple helper macro for dealing with variable sized structures passed
 * from user space. This allows us to easily determine if the provided
 * structure is sized to include various fields.
 */
#define offsetofend(TYPE, MEMBER) ({ \
        TYPE tmp; \
        offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); })

#endif /* __KERNEL__ */

/* Kernel & User level defines for VFIO IOCTLs. */

/* Extensions */

/* None yet */

/*
 * The IOCTL interface is designed for extensibility by embedding the
 * structure length (argsz) and flags into structures passed between
 * kernel and userspace. We therefore use the _IO() macro for these
 * defines to avoid implicitly embedding a size into the ioctl request.
 * As structure fields are added, argsz will increase to match and flag
 * bits will be defined to indicate additional fields with valid data.
 * It's *always* the caller's responsibility to indicate the size of
 * the structure passed by setting argsz appropriately.
 */

#define VFIO_TYPE (';')
#define VFIO_BASE 100

/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */

/**
 * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
 *
 * Report the version of the VFIO API. This allows us to bump the entire
 * API version should we later need to add or change features in incompatible
 * ways.
 * Return: VFIO_API_VERSION
 * Availability: Always
 */
#define VFIO_GET_API_VERSION _IO(VFIO_TYPE, VFIO_BASE + 0)

/**
 * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
 *
 * Check whether an extension is supported.
 * Return: 0 if not supported, 1 (or some other positive integer) if supported.
 * Availability: Always
 */
#define VFIO_CHECK_EXTENSION _IO(VFIO_TYPE, VFIO_BASE + 1)

/**
 * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
 *
 * Set the iommu to the given type. The type must be supported by an
 * iommu driver as verified by calling CHECK_EXTENSION using the same
 * type. A group must be set to this file descriptor before this
 * ioctl is available. The IOMMU interfaces enabled by this call are
 * specific to the value set.
 * Return: 0 on success, -errno on failure
 * Availability: When VFIO group attached
 */
#define VFIO_SET_IOMMU _IO(VFIO_TYPE, VFIO_BASE + 2)

/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */

/**
 * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
 *                              struct vfio_group_status)
 *
 * Retrieve information about the group. Fills in provided
 * struct vfio_group_status. Caller sets argsz.
 * Return: 0 on success, -errno on failure.
 * Availability: Always
 */
struct vfio_group_status {
        __u32 argsz;
        __u32 flags;
#define VFIO_GROUP_FLAGS_VIABLE (1 << 0)
#define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1)
};
#define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3)

/**
 * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
 *
 * Set the container for the VFIO group to the open VFIO file
 * descriptor provided. Groups may only belong to a single
 * container. Containers may, at their discretion, support multiple
 * groups. Only when a container is set are all of the interfaces
 * of the VFIO file descriptor and the VFIO group file descriptor
 * available to the user.
 * Return: 0 on success, -errno on failure.
 * Availability: Always
 */
#define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4)

/**
 * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
 *
 * Remove the group from the attached container. This is the
 * opposite of the SET_CONTAINER call and returns the group to
 * an initial state. All device file descriptors must be released
 * prior to calling this interface. When removing the last group
 * from a container, the IOMMU will be disabled and all state lost,
 * effectively also returning the VFIO file descriptor to an initial
 * state.
 * Return: 0 on success, -errno on failure.
 * Availability: When attached to container
 */
#define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5)

/**
 * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
 *
 * Return a new file descriptor for the device object described by
 * the provided string. The string should match a device listed in
 * the devices subdirectory of the IOMMU group sysfs entry. The
 * group containing the device must already be added to this context.
 * Return: new file descriptor on success, -errno on failure.
 * Availability: When attached to container
 */
#define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6)

/* --------------- IOCTLs for DEVICE file descriptors --------------- */

/**
 * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
 *                             struct vfio_device_info)
 *
 * Retrieve information about the device. Fills in provided
 * struct vfio_device_info. Caller sets argsz.
 * Return: 0 on success, -errno on failure.
 */
struct vfio_device_info {
        __u32 argsz;
        __u32 flags;
#define VFIO_DEVICE_FLAGS_RESET (1 << 0)        /* Device supports reset */
        __u32 num_regions;      /* Max region index + 1 */
        __u32 num_irqs;         /* Max IRQ index + 1 */
};
#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)

/**
 * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
 *                                     struct vfio_region_info)
 *
 * Retrieve information about a device region. Caller provides
 * struct vfio_region_info with index value set. Caller sets argsz.
 * Implementation of region mapping is bus driver specific. This is
 * intended to describe MMIO, I/O port, as well as bus specific
 * regions (ex. PCI config space). Zero sized regions may be used
 * to describe unimplemented regions (ex. unimplemented PCI BARs).
 * Return: 0 on success, -errno on failure.
 */
struct vfio_region_info {
        __u32 argsz;
        __u32 flags;
#define VFIO_REGION_INFO_FLAG_READ (1 << 0)     /* Region supports read */
#define VFIO_REGION_INFO_FLAG_WRITE (1 << 1)    /* Region supports write */
#define VFIO_REGION_INFO_FLAG_MMAP (1 << 2)     /* Region supports mmap */
        __u32 index;            /* Region index */
        __u32 resv;             /* Reserved for alignment */
        __u64 size;             /* Region size (bytes) */
        __u64 offset;           /* Region offset from start of device fd */
};
#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8)

/**
 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
 *                                  struct vfio_irq_info)
 *
 * Retrieve information about a device IRQ. Caller provides
 * struct vfio_irq_info with index value set. Caller sets argsz.
 * Implementation of IRQ mapping is bus driver specific. Indexes
 * using multiple IRQs are primarily intended to support MSI-like
 * interrupt blocks. Zero count irq blocks may be used to describe
 * unimplemented interrupt types.
 *
 * The EVENTFD flag indicates the interrupt index supports eventfd based
 * signaling.
 *
 * The MASKABLE flag indicates the index supports MASK and UNMASK
 * actions described below.
 *
 * AUTOMASKED indicates that after signaling, the interrupt line is
 * automatically masked by VFIO and the user needs to unmask the line
 * to receive new interrupts. This is primarily intended to distinguish
 * level triggered interrupts.
 *
 * The NORESIZE flag indicates that the interrupt lines within the index
 * are setup as a set and new subindexes cannot be enabled without first
 * disabling the entire index. This is used for interrupts like PCI MSI
 * and MSI-X where the driver may only use a subset of the available
 * indexes, but VFIO needs to enable a specific number of vectors
 * upfront. In the case of MSI-X, where the user can enable MSI-X and
 * then add and unmask vectors, it's up to userspace to make the decision
 * whether to allocate the maximum supported number of vectors or tear
 * down setup and incrementally increase the vectors as each is enabled.
 */
struct vfio_irq_info {
        __u32 argsz;
        __u32 flags;
#define VFIO_IRQ_INFO_EVENTFD (1 << 0)
#define VFIO_IRQ_INFO_MASKABLE (1 << 1)
#define VFIO_IRQ_INFO_AUTOMASKED (1 << 2)
#define VFIO_IRQ_INFO_NORESIZE (1 << 3)
        __u32 index;            /* IRQ index */
        __u32 count;            /* Number of IRQs within this index */
};
#define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9)

/**
 * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
 *
 * Set signaling, masking, and unmasking of interrupts. Caller provides
 * struct vfio_irq_set with all fields set. 'start' and 'count' indicate
 * the range of subindexes being specified.
 *
 * The DATA flags specify the type of data provided. If DATA_NONE, the
 * operation performs the specified action immediately on the specified
 * interrupt(s). For example, to unmask AUTOMASKED interrupt [0,0]:
 * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
 *
 * DATA_BOOL allows sparse support for the same on arrays of interrupts.
 * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
 * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
 * data = {1,0,1}
 *
 * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
 * A value of -1 can be used to either de-assign interrupts if already
 * assigned or skip un-assigned interrupts. For example, to set an eventfd
 * to be triggered for interrupts [0,0] and [0,2]:
 * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
 * data = {fd1, -1, fd2}
 * If index [0,1] is previously set, two count = 1 ioctl calls would be
 * required to set [0,0] and [0,2] without changing [0,1].
 *
 * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
 * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
 * from userspace (ie. simulate hardware triggering).
 *
 * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
 * enables the interrupt index for the device. Individual subindex interrupts
 * can be disabled using the -1 value for DATA_EVENTFD or the index can be
 * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
 *
 * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
 * ACTION_TRIGGER specifies kernel->user signaling.
 */
struct vfio_irq_set {
        __u32 argsz;
        __u32 flags;
#define VFIO_IRQ_SET_DATA_NONE (1 << 0)         /* Data not present */
#define VFIO_IRQ_SET_DATA_BOOL (1 << 1)         /* Data is bool (u8) */
#define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2)      /* Data is eventfd (s32) */
#define VFIO_IRQ_SET_ACTION_MASK (1 << 3)       /* Mask interrupt */
#define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4)     /* Unmask interrupt */
#define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5)    /* Trigger interrupt */
        __u32 index;
        __u32 start;
        __u32 count;
        __u8 data[];
};
#define VFIO_DEVICE_SET_IRQS _IO(VFIO_TYPE, VFIO_BASE + 10)

#define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \
                                     VFIO_IRQ_SET_DATA_BOOL | \
                                     VFIO_IRQ_SET_DATA_EVENTFD)
#define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \
                                       VFIO_IRQ_SET_ACTION_UNMASK | \
                                       VFIO_IRQ_SET_ACTION_TRIGGER)
/**
 * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
 *
 * Reset a device.
 */
#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)

#endif /* VFIO_H */
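As a concrete illustration of the argsz/flags convention above, a
userspace sketch for attaching an eventfd trigger to a single
interrupt might look like this (illustrative only; the helper name
and error handling are assumptions, not part of this patch):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Bind an eventfd to subindex 0 of the given interrupt index */
static int vfio_set_trigger(int device_fd, uint32_t index)
{
        int efd = eventfd(0, 0);
        size_t sz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
        struct vfio_irq_set *set = malloc(sz);

        if (efd < 0 || !set)
                return -1;

        set->argsz = sz;        /* caller always declares its structure size */
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = index;
        set->start = 0;
        set->count = 1;
        memcpy(set->data, &efd, sizeof(int32_t));  /* data[] holds the __s32 fd */

        if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set) < 0) {
                free(set);
                close(efd);
                return -1;
        }
        free(set);
        return efd;     /* read(2) on efd now reports interrupts */
}

Disabling the index again would use flags = (DATA_NONE|ACTION_TRIGGER)
with count = 0, per the description above.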