-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--Documentation/vfio.txt314
-rw-r--r--MAINTAINERS8
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/vfio/Kconfig16
-rw-r--r--drivers/vfio/Makefile3
-rw-r--r--drivers/vfio/pci/Kconfig8
-rw-r--r--drivers/vfio/pci/Makefile4
-rw-r--r--drivers/vfio/pci/vfio_pci.c579
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c1540
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c740
-rw-r--r--drivers/vfio/pci/vfio_pci_private.h91
-rw-r--r--drivers/vfio/pci/vfio_pci_rdwr.c269
-rw-r--r--drivers/vfio/vfio.c1420
-rw-r--r--drivers/vfio/vfio_iommu_type1.c753
-rw-r--r--include/linux/vfio.h445
17 files changed, 6194 insertions, 0 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 915f28c470e9..849b771c5e03 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -88,6 +88,7 @@ Code Seq#(hex) Include File Comments
 					and kernel/power/user.c
 '8'	all				SNP8023 advanced NIC card
 					<mailto:mcr@solidum.com>
+';'	64-7F	linux/vfio.h
 '@'	00-0F	linux/radeonfb.h	conflict!
 '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
 'A'	00-1F	linux/apm_bios.h	conflict!
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
new file mode 100644
index 000000000000..0cb6685c8029
--- /dev/null
+++ b/Documentation/vfio.txt
@@ -0,0 +1,314 @@
1VFIO - "Virtual Function I/O"[1]
2-------------------------------------------------------------------------------
3Many modern systems now provide DMA and interrupt remapping facilities
4to help ensure I/O devices behave within the boundaries they've been
5allotted. This includes x86 hardware with AMD-Vi and Intel VT-d,
6POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC
7systems with the Freescale PAMU. The VFIO driver is an IOMMU/device
8agnostic framework for exposing direct device access to userspace, in
9a secure, IOMMU protected environment. In other words, this allows
10safe[2], non-privileged, userspace drivers.
11
12Why do we want that? Virtual machines often make use of direct device
13access ("device assignment") when configured for the highest possible
14I/O performance. From a device and host perspective, this simply
15turns the VM into a userspace driver, with the benefits of
16significantly reduced latency, higher bandwidth, and direct use of
17bare-metal device drivers[3].
18
19Some applications, particularly in the high performance computing
20field, also benefit from low-overhead, direct device access from
21userspace. Examples include network adapters (often non-TCP/IP based)
22and compute accelerators. Prior to VFIO, these drivers had to either
23go through the full development cycle to become a proper upstream
24driver, be maintained out of tree, or make use of the UIO framework,
25which has no notion of IOMMU protection, limited interrupt support,
26and requires root privileges to access things like PCI configuration
27space.
28
29The VFIO driver framework intends to unify these, replacing the KVM
30PCI specific device assignment code and providing a more secure,
31more featureful userspace driver environment than UIO.
32
33Groups, Devices, and IOMMUs
34-------------------------------------------------------------------------------
35
36Devices are the main target of any I/O driver. Devices typically
37create a programming interface made up of I/O access, interrupts,
38and DMA. Without going into the details of each of these, DMA is
39by far the most critical aspect for maintaining a secure environment
40as allowing a device read-write access to system memory imposes the
41greatest risk to the overall system integrity.
42
43To help mitigate this risk, many modern IOMMUs now incorporate
44isolation properties into what was, in many cases, an interface only
45meant for translation (ie. solving the addressing problems of devices
46with limited address spaces). With this, devices can now be isolated
47from each other and from arbitrary memory access, thus allowing
48things like secure direct assignment of devices into virtual machines.
49
50This isolation is not always at the granularity of a single device
51though. Even when an IOMMU is capable of this, properties of devices,
52interconnects, and IOMMU topologies can each reduce this isolation.
53For instance, an individual device may be part of a larger multi-
54function enclosure. While the IOMMU may be able to distinguish
55between devices within the enclosure, the enclosure may not require
56transactions between devices to reach the IOMMU. Examples of this
57could be anything from a multi-function PCI device with backdoors
58between functions to a non-PCI-ACS (Access Control Services) capable
59bridge allowing redirection without reaching the IOMMU. Topology
60can also play a factor in terms of hiding devices. A PCIe-to-PCI
61bridge masks the devices behind it, making transactions appear as if
62from the bridge itself. Obviously IOMMU design plays a major factor
63as well.
64
65Therefore, while for the most part an IOMMU may have device level
66granularity, any system is susceptible to reduced granularity. The
67IOMMU API therefore supports a notion of IOMMU groups. A group is
68a set of devices which is isolatable from all other devices in the
69system. Groups are therefore the unit of ownership used by VFIO.
70
71While the group is the minimum granularity that must be used to
72ensure secure user access, it's not necessarily the preferred
73granularity. In IOMMUs which make use of page tables, it may be
74possible to share a set of page tables between different groups,
75reducing the overhead both to the platform (reduced TLB thrashing,
76reduced duplicate page tables), and to the user (programming only
77a single set of translations). For this reason, VFIO makes use of
78a container class, which may hold one or more groups. A container
79is created by simply opening the /dev/vfio/vfio character device.
80
81On its own, the container provides little functionality, with all
82but a couple of version and extension query interfaces locked away.
83The user needs to add a group into the container for the next level
84of functionality. To do this, the user first needs to identify the
85group associated with the desired device. This can be done using
86the sysfs links described in the example below. By unbinding the
87device from the host driver and binding it to a VFIO driver, a new
88VFIO group will appear for the group as /dev/vfio/$GROUP, where
89$GROUP is the IOMMU group number of which the device is a member.
90If the IOMMU group contains multiple devices, each will need to
91be bound to a VFIO driver before operations on the VFIO group
92are allowed (it's also sufficient to only unbind the device from
93host drivers if a VFIO driver is unavailable; this will make the
94group available, but not that particular device). TBD - interface
95for disabling driver probing/locking a device.
96
97Once the group is ready, it may be added to the container by opening
98the VFIO group character device (/dev/vfio/$GROUP) and using the
99VFIO_GROUP_SET_CONTAINER ioctl, passing the file descriptor of the
100previously opened container file. If desired and if the IOMMU driver
101supports sharing the IOMMU context between groups, multiple groups may
102be set to the same container. If a group fails to be added to a
103container that already holds other groups, a new, empty container
104will need to be used instead.
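
As a sketch of that fallback path (illustrative only; group2, container,
and container2 are file descriptors obtained the same way as in the
example further below):

	/* Try to share the existing container's IOMMU context */
	if (ioctl(group2, VFIO_GROUP_SET_CONTAINER, &container)) {
		/* Sharing refused; fall back to a new, empty container */
		container2 = open("/dev/vfio/vfio", O_RDWR);
		ioctl(group2, VFIO_GROUP_SET_CONTAINER, &container2);
		ioctl(container2, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
	}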
105
106With a group (or groups) attached to a container, the remaining
107ioctls become available, enabling access to the VFIO IOMMU interfaces.
108Additionally, it now becomes possible to get file descriptors for each
109device within a group using an ioctl on the VFIO group file descriptor.
110
111The VFIO device API includes ioctls for describing the device, the I/O
112regions and their read/write/mmap offsets on the device descriptor, as
113well as mechanisms for describing and registering interrupt
114notifications.
115
116VFIO Usage Example
117-------------------------------------------------------------------------------
118
119Assume the user wants to access PCI device 0000:06:0d.0
120
121$ readlink /sys/bus/pci/devices/0000:06:0d.0/iommu_group
122../../../../kernel/iommu_groups/26
123
124This device is therefore in IOMMU group 26. This device is on the
125pci bus, therefore the user will make use of vfio-pci to manage the
126group:
127
128# modprobe vfio-pci
129
130Binding this device to the vfio-pci driver creates the VFIO group
131character devices for this group:
132
133$ lspci -n -s 0000:06:0d.0
13406:0d.0 0401: 1102:0002 (rev 08)
135# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
136# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
137
138Now we need to look at what other devices are in the group to free
139it for use by VFIO:
140
141$ ls -l /sys/bus/pci/devices/0000:06:0d.0/iommu_group/devices
142total 0
143lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:00:1e.0 ->
144 ../../../../devices/pci0000:00/0000:00:1e.0
145lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.0 ->
146 ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.0
147lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.1 ->
148 ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.1
149
150This device is behind a PCIe-to-PCI bridge[4], therefore we also
151need to add device 0000:06:0d.1 to the group following the same
152procedure as above. Device 0000:00:1e.0 is a bridge that does
153not currently have a host driver, therefore it's not required to
154bind this device to the vfio-pci driver (vfio-pci does not currently
155support PCI bridges).
156
157The final step is to provide the user with access to the group if
158unprivileged operation is desired (note that /dev/vfio/vfio provides
159no capabilities on its own and is therefore expected to be set to
160mode 0666 by the system).
161
162# chown user:user /dev/vfio/26
163
164The user now has full access to all the devices and the iommu for this
165group and can access them as follows:
166
167 int container, group, device, i;
168 struct vfio_group_status group_status =
169 { .argsz = sizeof(group_status) };
170 struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
171 struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
172 struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
173
174 /* Create a new container */
175 container = open("/dev/vfio/vfio", O_RDWR);
176
177 if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
178 /* Unknown API version */
179
180 if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
181 /* Doesn't support the IOMMU driver we want. */
182
183 /* Open the group */
184 group = open("/dev/vfio/26", O_RDWR);
185
186 /* Test the group is viable and available */
187 ioctl(group, VFIO_GROUP_GET_STATUS, &group_status);
188
189 if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE))
190 /* Group is not viable (ie, not all devices bound for vfio) */
191
192 /* Add the group to the container */
193 ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
194
195 /* Enable the IOMMU model we want */
196 ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
197
198 /* Get additional IOMMU info */
199 ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
200
201 /* Allocate some space and setup a DMA mapping */
202 dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
203 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
204 dma_map.size = 1024 * 1024;
205 dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
206 dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
207
208 ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
209
210 /* Get a file descriptor for the device */
211 device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
212
213 /* Test and setup the device */
214 ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
215
216 for (i = 0; i < device_info.num_regions; i++) {
217 struct vfio_region_info reg = { .argsz = sizeof(reg) };
218
219 reg.index = i;
220
221 ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);
222
223 /* Setup mappings... read/write offsets, mmaps
224 * For PCI devices, config space is a region */
225 }
226
227 for (i = 0; i < device_info.num_irqs; i++) {
228 struct vfio_irq_info irq = { .argsz = sizeof(irq) };
229
230 irq.index = i;
231
232 ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);
233
234 /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */
235 }
236
237 /* Gratuitous device reset and go... */
238 ioctl(device, VFIO_DEVICE_RESET);
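
As a further, illustrative extension of the example (not part of the
original; error handling and the additional <stdio.h>/<unistd.h>
includes are assumed, and a little-endian host is assumed when decoding
the dword), the offset reported by VFIO_DEVICE_GET_REGION_INFO can be
used directly with pread()/pwrite() on the device file descriptor, for
instance to read the vendor/device ID from the config space region:

	struct vfio_region_info cfg = { .argsz = sizeof(cfg) };
	__u32 id;

	cfg.index = VFIO_PCI_CONFIG_REGION_INDEX;
	ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &cfg);

	/* Vendor ID and device ID form the first dword of config space */
	if (pread(device, &id, sizeof(id), cfg.offset) == sizeof(id))
		printf("%04x:%04x\n", id & 0xffff, id >> 16);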
239
240VFIO User API
241-------------------------------------------------------------------------------
242
243Please see include/linux/vfio.h for complete API documentation.
244
245VFIO bus driver API
246-------------------------------------------------------------------------------
247
248VFIO bus drivers, such as vfio-pci, make use of only a few interfaces
249into VFIO core. When devices are bound to and unbound from the driver,
250the driver should call vfio_add_group_dev() and vfio_del_group_dev()
251respectively:
252
253extern int vfio_add_group_dev(struct device *dev,
254			      const struct vfio_device_ops *ops,
255			      void *device_data);
257
258extern void *vfio_del_group_dev(struct device *dev);
259
260vfio_add_group_dev() indicates to the core to begin tracking the
261iommu_group of the specified dev and register the dev as owned by
262a VFIO bus driver. The driver provides an ops structure for callbacks
263similar to a file operations structure:
264
265struct vfio_device_ops {
266 int (*open)(void *device_data);
267 void (*release)(void *device_data);
268 ssize_t (*read)(void *device_data, char __user *buf,
269 size_t count, loff_t *ppos);
270 ssize_t (*write)(void *device_data, const char __user *buf,
271 size_t size, loff_t *ppos);
272 long (*ioctl)(void *device_data, unsigned int cmd,
273 unsigned long arg);
274 int (*mmap)(void *device_data, struct vm_area_struct *vma);
275};
276
277Each function is passed the device_data that was originally registered
278in the vfio_add_group_dev() call above. This allows the bus driver
279an easy place to store its opaque, private data. The open/release
280callbacks are issued when a new file descriptor is created for a
281device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides
282a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap
283interfaces implement the device region access defined by the device's
284own VFIO_DEVICE_GET_REGION_INFO ioctl.
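
For illustration only, a hypothetical "foo" bus driver (foo_probe,
foo_remove, foo_vfio_ops and struct foo_device are made-up names;
compare vfio_pci_probe()/vfio_pci_remove() in drivers/vfio/pci/vfio_pci.c
from this patch) might wire these interfaces up as follows:

	static int foo_probe(struct device *dev)
	{
		struct iommu_group *group;
		struct foo_device *fdev;
		int ret;

		/* The device must already belong to an IOMMU group */
		group = iommu_group_get(dev);
		if (!group)
			return -EINVAL;

		fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
		if (!fdev) {
			iommu_group_put(group);
			return -ENOMEM;
		}

		/* fdev is the device_data handed back to our vfio_device_ops */
		ret = vfio_add_group_dev(dev, &foo_vfio_ops, fdev);
		if (ret) {
			iommu_group_put(group);
			kfree(fdev);
		}

		return ret;
	}

	static void foo_remove(struct device *dev)
	{
		struct foo_device *fdev = vfio_del_group_dev(dev);

		if (fdev) {
			iommu_group_put(dev->iommu_group);
			kfree(fdev);
		}
	}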
285
286-------------------------------------------------------------------------------
287
288[1] VFIO was originally an acronym for "Virtual Function I/O" in its
289initial implementation by Tom Lyon while at Cisco. We've since
290outgrown the acronym, but it's catchy.
291
292[2] "safe" also depends upon a device being "well behaved". It's
293possible for multi-function devices to have backdoors between
294functions and even for single function devices to have alternative
295access to things like PCI config space through MMIO registers. To
296guard against the former we can include additional precautions in the
297IOMMU driver to group multi-function PCI devices together
298(iommu=group_mf). The latter we can't prevent, but the IOMMU should
299still provide isolation. For PCI, SR-IOV Virtual Functions are the
300best indicator of "well behaved", as these are designed for
301virtualization usage models.
302
303[3] As always there are trade-offs to virtual machine device
304assignment that are beyond the scope of VFIO. It's expected that
305future IOMMU technologies will reduce some, but maybe not all, of
306these trade-offs.
307
308[4] In this case the device is below a PCI bridge, so transactions
309from either function of the device are indistinguishable to the iommu:
310
311-[0000:00]-+-1e.0-[06]--+-0d.0
312 \-0d.1
313
31400:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90)
diff --git a/MAINTAINERS b/MAINTAINERS
index 36ed8a14e8e2..6720018bc674 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7414,6 +7414,14 @@ S: Maintained
 F:	Documentation/filesystems/vfat.txt
 F:	fs/fat/
 
+VFIO DRIVER
+M:	Alex Williamson <alex.williamson@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	Documentation/vfio.txt
+F:	drivers/vfio/
+F:	include/linux/vfio.h
+
 VIDEOBUF2 FRAMEWORK
 M:	Pawel Osciak <pawel@osciak.com>
 M:	Marek Szyprowski <m.szyprowski@samsung.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 805c432c9439..ece958d3762e 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
 
 source "drivers/uio/Kconfig"
 
+source "drivers/vfio/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index bd36f09f2246..5b421840c48d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_ATM) += atm/
 obj-$(CONFIG_FUSION)		+= message/
 obj-y				+= firewire/
 obj-$(CONFIG_UIO)		+= uio/
+obj-$(CONFIG_VFIO)		+= vfio/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 000000000000..7cd5dec0abd1
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,16 @@
1config VFIO_IOMMU_TYPE1
2 tristate
3 depends on VFIO
4 default n
5
6menuconfig VFIO
7 tristate "VFIO Non-Privileged userspace driver framework"
8 depends on IOMMU_API
9 select VFIO_IOMMU_TYPE1 if X86
10 help
11 VFIO provides a framework for secure userspace device drivers.
12 See Documentation/vfio.txt for more details.
13
14 If you don't know what to do here, say N.
15
16source "drivers/vfio/pci/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 000000000000..2398d4a0e38b
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_VFIO) += vfio.o
2obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
3obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 000000000000..5980758563eb
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,8 @@
1config VFIO_PCI
2 tristate "VFIO support for PCI devices"
3 depends on VFIO && PCI && EVENTFD
4 help
5 Support for the PCI VFIO bus driver. This is required to make
6 use of PCI drivers using the VFIO framework.
7
8 If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 000000000000..131079255fd9
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,4 @@
1
2vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
3
4obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
3 * Author: Alex Williamson <alex.williamson@redhat.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * Derived from original vfio:
10 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
11 * Author: Tom Lyon, pugs@cisco.com
12 */
13
14#include <linux/device.h>
15#include <linux/eventfd.h>
16#include <linux/interrupt.h>
17#include <linux/iommu.h>
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/notifier.h>
21#include <linux/pci.h>
22#include <linux/pm_runtime.h>
23#include <linux/slab.h>
24#include <linux/types.h>
25#include <linux/uaccess.h>
26#include <linux/vfio.h>
27
28#include "vfio_pci_private.h"
29
30#define DRIVER_VERSION "0.2"
31#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
32#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
33
34static bool nointxmask;
35module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
36MODULE_PARM_DESC(nointxmask,
37 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
38
39static int vfio_pci_enable(struct vfio_pci_device *vdev)
40{
41 struct pci_dev *pdev = vdev->pdev;
42 int ret;
43 u16 cmd;
44 u8 msix_pos;
45
46 vdev->reset_works = (pci_reset_function(pdev) == 0);
47 pci_save_state(pdev);
48 vdev->pci_saved_state = pci_store_saved_state(pdev);
49 if (!vdev->pci_saved_state)
50 pr_debug("%s: Couldn't store %s saved state\n",
51 __func__, dev_name(&pdev->dev));
52
53 ret = vfio_config_init(vdev);
54 if (ret)
55 goto out;
56
57 if (likely(!nointxmask))
58 vdev->pci_2_3 = pci_intx_mask_supported(pdev);
59
60 pci_read_config_word(pdev, PCI_COMMAND, &cmd);
61 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
62 cmd &= ~PCI_COMMAND_INTX_DISABLE;
63 pci_write_config_word(pdev, PCI_COMMAND, cmd);
64 }
65
66 msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
67 if (msix_pos) {
68 u16 flags;
69 u32 table;
70
71 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
72 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
73
74 vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
75 vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
76 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
77 } else
78 vdev->msix_bar = 0xFF;
79
80 ret = pci_enable_device(pdev);
81 if (ret)
82 goto out;
83
84 return ret;
85
86out:
87 kfree(vdev->pci_saved_state);
88 vdev->pci_saved_state = NULL;
89 vfio_config_free(vdev);
90 return ret;
91}
92
93static void vfio_pci_disable(struct vfio_pci_device *vdev)
94{
95 int bar;
96
97 pci_disable_device(vdev->pdev);
98
99 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
100 VFIO_IRQ_SET_ACTION_TRIGGER,
101 vdev->irq_type, 0, 0, NULL);
102
103 vdev->virq_disabled = false;
104
105 vfio_config_free(vdev);
106
107 pci_reset_function(vdev->pdev);
108
109 if (pci_load_and_free_saved_state(vdev->pdev,
110 &vdev->pci_saved_state) == 0)
111 pci_restore_state(vdev->pdev);
112 else
113 pr_info("%s: Couldn't reload %s saved state\n",
114 __func__, dev_name(&vdev->pdev->dev));
115
116 for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
117 if (!vdev->barmap[bar])
118 continue;
119 pci_iounmap(vdev->pdev, vdev->barmap[bar]);
120 pci_release_selected_regions(vdev->pdev, 1 << bar);
121 vdev->barmap[bar] = NULL;
122 }
123}
124
125static void vfio_pci_release(void *device_data)
126{
127 struct vfio_pci_device *vdev = device_data;
128
129 if (atomic_dec_and_test(&vdev->refcnt))
130 vfio_pci_disable(vdev);
131
132 module_put(THIS_MODULE);
133}
134
135static int vfio_pci_open(void *device_data)
136{
137 struct vfio_pci_device *vdev = device_data;
138
139 if (!try_module_get(THIS_MODULE))
140 return -ENODEV;
141
142 if (atomic_inc_return(&vdev->refcnt) == 1) {
143 int ret = vfio_pci_enable(vdev);
144 if (ret) {
145 module_put(THIS_MODULE);
146 return ret;
147 }
148 }
149
150 return 0;
151}
152
153static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
154{
155 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
156 u8 pin;
157 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
158 if (pin)
159 return 1;
160
161 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
162 u8 pos;
163 u16 flags;
164
165 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
166 if (pos) {
167 pci_read_config_word(vdev->pdev,
168 pos + PCI_MSI_FLAGS, &flags);
169
170 return 1 << (flags & PCI_MSI_FLAGS_QMASK);
171 }
172 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
173 u8 pos;
174 u16 flags;
175
176 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
177 if (pos) {
178 pci_read_config_word(vdev->pdev,
179 pos + PCI_MSIX_FLAGS, &flags);
180
181 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
182 }
183 }
184
185 return 0;
186}
187
188static long vfio_pci_ioctl(void *device_data,
189 unsigned int cmd, unsigned long arg)
190{
191 struct vfio_pci_device *vdev = device_data;
192 unsigned long minsz;
193
194 if (cmd == VFIO_DEVICE_GET_INFO) {
195 struct vfio_device_info info;
196
197 minsz = offsetofend(struct vfio_device_info, num_irqs);
198
199 if (copy_from_user(&info, (void __user *)arg, minsz))
200 return -EFAULT;
201
202 if (info.argsz < minsz)
203 return -EINVAL;
204
205 info.flags = VFIO_DEVICE_FLAGS_PCI;
206
207 if (vdev->reset_works)
208 info.flags |= VFIO_DEVICE_FLAGS_RESET;
209
210 info.num_regions = VFIO_PCI_NUM_REGIONS;
211 info.num_irqs = VFIO_PCI_NUM_IRQS;
212
213 return copy_to_user((void __user *)arg, &info, minsz);
214
215 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
216 struct pci_dev *pdev = vdev->pdev;
217 struct vfio_region_info info;
218
219 minsz = offsetofend(struct vfio_region_info, offset);
220
221 if (copy_from_user(&info, (void __user *)arg, minsz))
222 return -EFAULT;
223
224 if (info.argsz < minsz)
225 return -EINVAL;
226
227 switch (info.index) {
228 case VFIO_PCI_CONFIG_REGION_INDEX:
229 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
230 info.size = pdev->cfg_size;
231 info.flags = VFIO_REGION_INFO_FLAG_READ |
232 VFIO_REGION_INFO_FLAG_WRITE;
233 break;
234 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
235 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
236 info.size = pci_resource_len(pdev, info.index);
237 if (!info.size) {
238 info.flags = 0;
239 break;
240 }
241
242 info.flags = VFIO_REGION_INFO_FLAG_READ |
243 VFIO_REGION_INFO_FLAG_WRITE;
244 if (pci_resource_flags(pdev, info.index) &
245 IORESOURCE_MEM && info.size >= PAGE_SIZE)
246 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
247 break;
248 case VFIO_PCI_ROM_REGION_INDEX:
249 {
250 void __iomem *io;
251 size_t size;
252
253 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
254 info.flags = 0;
255
256 /* Report the BAR size, not the ROM size */
257 info.size = pci_resource_len(pdev, info.index);
258 if (!info.size)
259 break;
260
261 /* Is it really there? */
262 io = pci_map_rom(pdev, &size);
263 if (!io || !size) {
264 info.size = 0;
265 break;
266 }
267 pci_unmap_rom(pdev, io);
268
269 info.flags = VFIO_REGION_INFO_FLAG_READ;
270 break;
271 }
272 default:
273 return -EINVAL;
274 }
275
276 return copy_to_user((void __user *)arg, &info, minsz);
277
278 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
279 struct vfio_irq_info info;
280
281 minsz = offsetofend(struct vfio_irq_info, count);
282
283 if (copy_from_user(&info, (void __user *)arg, minsz))
284 return -EFAULT;
285
286 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
287 return -EINVAL;
288
289 info.flags = VFIO_IRQ_INFO_EVENTFD;
290
291 info.count = vfio_pci_get_irq_count(vdev, info.index);
292
293 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
294 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
295 VFIO_IRQ_INFO_AUTOMASKED);
296 else
297 info.flags |= VFIO_IRQ_INFO_NORESIZE;
298
299 return copy_to_user((void __user *)arg, &info, minsz);
300
301 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
302 struct vfio_irq_set hdr;
303 u8 *data = NULL;
304 int ret = 0;
305
306 minsz = offsetofend(struct vfio_irq_set, count);
307
308 if (copy_from_user(&hdr, (void __user *)arg, minsz))
309 return -EFAULT;
310
311 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
312 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
313 VFIO_IRQ_SET_ACTION_TYPE_MASK))
314 return -EINVAL;
315
316 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
317 size_t size;
318
319 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
320 size = sizeof(uint8_t);
321 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
322 size = sizeof(int32_t);
323 else
324 return -EINVAL;
325
326 if (hdr.argsz - minsz < hdr.count * size ||
327 hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
328 return -EINVAL;
329
330 data = kmalloc(hdr.count * size, GFP_KERNEL);
331 if (!data)
332 return -ENOMEM;
333
334 if (copy_from_user(data, (void __user *)(arg + minsz),
335 hdr.count * size)) {
336 kfree(data);
337 return -EFAULT;
338 }
339 }
340
341 mutex_lock(&vdev->igate);
342
343 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
344 hdr.start, hdr.count, data);
345
346 mutex_unlock(&vdev->igate);
347 kfree(data);
348
349 return ret;
350
351 } else if (cmd == VFIO_DEVICE_RESET)
352 return vdev->reset_works ?
353 pci_reset_function(vdev->pdev) : -EINVAL;
354
355 return -ENOTTY;
356}
357
358static ssize_t vfio_pci_read(void *device_data, char __user *buf,
359 size_t count, loff_t *ppos)
360{
361 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
362 struct vfio_pci_device *vdev = device_data;
363 struct pci_dev *pdev = vdev->pdev;
364
365 if (index >= VFIO_PCI_NUM_REGIONS)
366 return -EINVAL;
367
368 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
369 return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
370 else if (index == VFIO_PCI_ROM_REGION_INDEX)
371 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
372 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
373 return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
374 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
375 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
376
377 return -EINVAL;
378}
379
380static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
381 size_t count, loff_t *ppos)
382{
383 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
384 struct vfio_pci_device *vdev = device_data;
385 struct pci_dev *pdev = vdev->pdev;
386
387 if (index >= VFIO_PCI_NUM_REGIONS)
388 return -EINVAL;
389
390 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
391 return vfio_pci_config_readwrite(vdev, (char __user *)buf,
392 count, ppos, true);
393 else if (index == VFIO_PCI_ROM_REGION_INDEX)
394 return -EINVAL;
395 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
396 return vfio_pci_io_readwrite(vdev, (char __user *)buf,
397 count, ppos, true);
398 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
399 return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
400 count, ppos, true);
401 }
402
403 return -EINVAL;
404}
405
406static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
407{
408 struct vfio_pci_device *vdev = device_data;
409 struct pci_dev *pdev = vdev->pdev;
410 unsigned int index;
411 u64 phys_len, req_len, pgoff, req_start, phys;
412 int ret;
413
414 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
415
416 if (vma->vm_end < vma->vm_start)
417 return -EINVAL;
418 if ((vma->vm_flags & VM_SHARED) == 0)
419 return -EINVAL;
420 if (index >= VFIO_PCI_ROM_REGION_INDEX)
421 return -EINVAL;
422 if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
423 return -EINVAL;
424
425 phys_len = pci_resource_len(pdev, index);
426 req_len = vma->vm_end - vma->vm_start;
427 pgoff = vma->vm_pgoff &
428 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
429 req_start = pgoff << PAGE_SHIFT;
430
431 if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
432 return -EINVAL;
433
434 if (index == vdev->msix_bar) {
435 /*
436 * Disallow mmaps overlapping the MSI-X table; users don't
437 * get to touch this directly. We could find somewhere
438 * else to map the overlap, but page granularity is only
439 * a recommendation, not a requirement, so the user needs
440 * to know which bits are real. Requiring them to mmap
441 * around the table makes that clear.
442 */
443
444 /* If neither entirely above nor below, then it overlaps */
445 if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
446 req_start + req_len <= vdev->msix_offset))
447 return -EINVAL;
448 }
449
450 /*
451 * Even though we don't make use of the barmap for the mmap,
452 * we need to request the region and the barmap tracks that.
453 */
454 if (!vdev->barmap[index]) {
455 ret = pci_request_selected_regions(pdev,
456 1 << index, "vfio-pci");
457 if (ret)
458 return ret;
459
460 vdev->barmap[index] = pci_iomap(pdev, index, 0);
461 }
462
463 vma->vm_private_data = vdev;
464 vma->vm_flags |= (VM_IO | VM_RESERVED);
465 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
466
467 phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
468
469 return remap_pfn_range(vma, vma->vm_start, phys,
470 req_len, vma->vm_page_prot);
471}
472
473static const struct vfio_device_ops vfio_pci_ops = {
474 .name = "vfio-pci",
475 .open = vfio_pci_open,
476 .release = vfio_pci_release,
477 .ioctl = vfio_pci_ioctl,
478 .read = vfio_pci_read,
479 .write = vfio_pci_write,
480 .mmap = vfio_pci_mmap,
481};
482
483static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
484{
485 u8 type;
486 struct vfio_pci_device *vdev;
487 struct iommu_group *group;
488 int ret;
489
490 pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
491 if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
492 return -EINVAL;
493
494 group = iommu_group_get(&pdev->dev);
495 if (!group)
496 return -EINVAL;
497
498 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
499 if (!vdev) {
500 iommu_group_put(group);
501 return -ENOMEM;
502 }
503
504 vdev->pdev = pdev;
505 vdev->irq_type = VFIO_PCI_NUM_IRQS;
506 mutex_init(&vdev->igate);
507 spin_lock_init(&vdev->irqlock);
508 atomic_set(&vdev->refcnt, 0);
509
510 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
511 if (ret) {
512 iommu_group_put(group);
513 kfree(vdev);
514 }
515
516 return ret;
517}
518
519static void vfio_pci_remove(struct pci_dev *pdev)
520{
521 struct vfio_pci_device *vdev;
522
523 vdev = vfio_del_group_dev(&pdev->dev);
524 if (!vdev)
525 return;
526
527 iommu_group_put(pdev->dev.iommu_group);
528 kfree(vdev);
529}
530
531static struct pci_driver vfio_pci_driver = {
532 .name = "vfio-pci",
533 .id_table = NULL, /* only dynamic ids */
534 .probe = vfio_pci_probe,
535 .remove = vfio_pci_remove,
536};
537
538static void __exit vfio_pci_cleanup(void)
539{
540 pci_unregister_driver(&vfio_pci_driver);
541 vfio_pci_virqfd_exit();
542 vfio_pci_uninit_perm_bits();
543}
544
545static int __init vfio_pci_init(void)
546{
547 int ret;
548
549	/* Allocate shared config space permission data used by all devices */
550 ret = vfio_pci_init_perm_bits();
551 if (ret)
552 return ret;
553
554 /* Start the virqfd cleanup handler */
555 ret = vfio_pci_virqfd_init();
556 if (ret)
557 goto out_virqfd;
558
559 /* Register and scan for devices */
560 ret = pci_register_driver(&vfio_pci_driver);
561 if (ret)
562 goto out_driver;
563
564 return 0;
565
566out_driver:
567	vfio_pci_virqfd_exit();
568out_virqfd:
569	vfio_pci_uninit_perm_bits();
570 return ret;
571}
572
573module_init(vfio_pci_init);
574module_exit(vfio_pci_cleanup);
575
576MODULE_VERSION(DRIVER_VERSION);
577MODULE_LICENSE("GPL v2");
578MODULE_AUTHOR(DRIVER_AUTHOR);
579MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 000000000000..8b8f7d11e102
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1540 @@
1/*
2 * VFIO PCI config space virtualization
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16/*
17 * This code handles reading and writing of PCI configuration registers.
18 * This is hairy because we want to allow a lot of flexibility to the
19 * user driver, but cannot trust it with all of the config fields.
20 * Tables determine which fields can be read and written, as well as
21 * which fields are 'virtualized' - special actions and translations to
22 * make it appear to the user that he has control, when in fact things
23 * must be negotiated with the underlying OS.
24 */
25
26#include <linux/fs.h>
27#include <linux/pci.h>
28#include <linux/uaccess.h>
29#include <linux/vfio.h>
30
31#include "vfio_pci_private.h"
32
33#define PCI_CFG_SPACE_SIZE 256
34
35/* Useful "pseudo" capabilities */
36#define PCI_CAP_ID_BASIC 0
37#define PCI_CAP_ID_INVALID 0xFF
38
39#define is_bar(offset) \
40 ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \
41 (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4))
42
43/*
44 * Lengths of PCI Config Capabilities
45 * 0: Removed from the user visible capability list
46 * FF: Variable length
47 */
48static u8 pci_cap_length[] = {
49 [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */
50 [PCI_CAP_ID_PM] = PCI_PM_SIZEOF,
51 [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF,
52 [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF,
53 [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */
54 [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */
55 [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */
56 [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */
57 [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */
58 [PCI_CAP_ID_VNDR] = 0xFF, /* variable */
59 [PCI_CAP_ID_DBG] = 0, /* debug - don't care */
60 [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */
61 [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */
62 [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */
63 [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */
64 [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */
65 [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */
66 [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF,
67 [PCI_CAP_ID_SATA] = 0xFF,
68 [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF,
69};
70
71/*
72 * Lengths of PCIe/PCI-X Extended Config Capabilities
73 * 0: Removed or masked from the user visible capability list
74 * FF: Variable length
75 */
76static u16 pci_ext_cap_length[] = {
77 [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND,
78 [PCI_EXT_CAP_ID_VC] = 0xFF,
79 [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF,
80 [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF,
81 [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */
82 [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */
83 [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */
84 [PCI_EXT_CAP_ID_MFVC] = 0xFF,
85 [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */
86 [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */
87 [PCI_EXT_CAP_ID_VNDR] = 0xFF,
88 [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */
89 [PCI_EXT_CAP_ID_ACS] = 0xFF,
90 [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF,
91 [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF,
92 [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF,
93 [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */
94 [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
95 [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF,
96 [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */
97 [PCI_EXT_CAP_ID_REBAR] = 0xFF,
98 [PCI_EXT_CAP_ID_DPA] = 0xFF,
99 [PCI_EXT_CAP_ID_TPH] = 0xFF,
100 [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF,
101 [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */
102 [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */
103 [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */
104};
105
106/*
107 * Read/Write Permission Bits - one bit for each bit in capability
108 * Any field can be read if it exists, but what is read depends on
109 * whether the field is 'virtualized', or just passed through to the
110 * hardware. Any virtualized field is also virtualized for writes.
111 * Writes are only permitted if they have a 1 bit here.
112 */
113struct perm_bits {
114 u8 *virt; /* read/write virtual data, not hw */
115 u8 *write; /* writeable bits */
116 int (*readfn)(struct vfio_pci_device *vdev, int pos, int count,
117 struct perm_bits *perm, int offset, __le32 *val);
118 int (*writefn)(struct vfio_pci_device *vdev, int pos, int count,
119 struct perm_bits *perm, int offset, __le32 val);
120};
121
122#define NO_VIRT 0
123#define ALL_VIRT 0xFFFFFFFFU
124#define NO_WRITE 0
125#define ALL_WRITE 0xFFFFFFFFU
126
127static int vfio_user_config_read(struct pci_dev *pdev, int offset,
128 __le32 *val, int count)
129{
130 int ret = -EINVAL;
131 u32 tmp_val = 0;
132
133 switch (count) {
134 case 1:
135 {
136 u8 tmp;
137 ret = pci_user_read_config_byte(pdev, offset, &tmp);
138 tmp_val = tmp;
139 break;
140 }
141 case 2:
142 {
143 u16 tmp;
144 ret = pci_user_read_config_word(pdev, offset, &tmp);
145 tmp_val = tmp;
146 break;
147 }
148 case 4:
149 ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
150 break;
151 }
152
153 *val = cpu_to_le32(tmp_val);
154
155 return pcibios_err_to_errno(ret);
156}
157
158static int vfio_user_config_write(struct pci_dev *pdev, int offset,
159 __le32 val, int count)
160{
161 int ret = -EINVAL;
162 u32 tmp_val = le32_to_cpu(val);
163
164 switch (count) {
165 case 1:
166 ret = pci_user_write_config_byte(pdev, offset, tmp_val);
167 break;
168 case 2:
169 ret = pci_user_write_config_word(pdev, offset, tmp_val);
170 break;
171 case 4:
172 ret = pci_user_write_config_dword(pdev, offset, tmp_val);
173 break;
174 }
175
176 return pcibios_err_to_errno(ret);
177}
178
179static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
180 int count, struct perm_bits *perm,
181 int offset, __le32 *val)
182{
183 __le32 virt = 0;
184
185 memcpy(val, vdev->vconfig + pos, count);
186
187 memcpy(&virt, perm->virt + offset, count);
188
189 /* Any non-virtualized bits? */
190 if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
191 struct pci_dev *pdev = vdev->pdev;
192 __le32 phys_val = 0;
193 int ret;
194
195 ret = vfio_user_config_read(pdev, pos, &phys_val, count);
196 if (ret)
197 return ret;
198
199 *val = (phys_val & ~virt) | (*val & virt);
200 }
201
202 return count;
203}
204
205static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos,
206 int count, struct perm_bits *perm,
207 int offset, __le32 val)
208{
209 __le32 virt = 0, write = 0;
210
211 memcpy(&write, perm->write + offset, count);
212
213 if (!write)
214 return count; /* drop, no writable bits */
215
216 memcpy(&virt, perm->virt + offset, count);
217
218 /* Virtualized and writable bits go to vconfig */
219 if (write & virt) {
220 __le32 virt_val = 0;
221
222 memcpy(&virt_val, vdev->vconfig + pos, count);
223
224 virt_val &= ~(write & virt);
225 virt_val |= (val & (write & virt));
226
227 memcpy(vdev->vconfig + pos, &virt_val, count);
228 }
229
230	/* Non-virtualized and writable bits go to hardware */
231 if (write & ~virt) {
232 struct pci_dev *pdev = vdev->pdev;
233 __le32 phys_val = 0;
234 int ret;
235
236 ret = vfio_user_config_read(pdev, pos, &phys_val, count);
237 if (ret)
238 return ret;
239
240 phys_val &= ~(write & ~virt);
241 phys_val |= (val & (write & ~virt));
242
243 ret = vfio_user_config_write(pdev, pos, phys_val, count);
244 if (ret)
245 return ret;
246 }
247
248 return count;
249}
250
251/* Allow direct read from hardware, except for capability next pointer */
252static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
253 int count, struct perm_bits *perm,
254 int offset, __le32 *val)
255{
256 int ret;
257
258 ret = vfio_user_config_read(vdev->pdev, pos, val, count);
259 if (ret)
260 return pcibios_err_to_errno(ret);
261
262 if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
263 if (offset < 4)
264 memcpy(val, vdev->vconfig + pos, count);
265 } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
266 if (offset == PCI_CAP_LIST_ID && count > 1)
267 memcpy(val, vdev->vconfig + pos,
268 min(PCI_CAP_FLAGS, count));
269 else if (offset == PCI_CAP_LIST_NEXT)
270 memcpy(val, vdev->vconfig + pos, 1);
271 }
272
273 return count;
274}
275
276static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
277 int count, struct perm_bits *perm,
278 int offset, __le32 val)
279{
280 int ret;
281
282 ret = vfio_user_config_write(vdev->pdev, pos, val, count);
283 if (ret)
284 return ret;
285
286 return count;
287}
288
289/* Default all regions to read-only, no-virtualization */
290static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
291 [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
292};
293static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
294 [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
295};
296
297static void free_perm_bits(struct perm_bits *perm)
298{
299 kfree(perm->virt);
300 kfree(perm->write);
301 perm->virt = NULL;
302 perm->write = NULL;
303}
304
305static int alloc_perm_bits(struct perm_bits *perm, int size)
306{
307 /*
308 * Round up all permission bits to the next dword, this lets us
309 * ignore whether a read/write exceeds the defined capability
310 * structure. We can do this because:
311 * - Standard config space is already dword aligned
312	 * - Capabilities are all dword aligned (bits 0:1 of next reserved)
313 * - Express capabilities defined as dword aligned
314 */
315 size = round_up(size, 4);
316
317 /*
318 * Zero state is
319 * - All Readable, None Writeable, None Virtualized
320 */
321 perm->virt = kzalloc(size, GFP_KERNEL);
322 perm->write = kzalloc(size, GFP_KERNEL);
323 if (!perm->virt || !perm->write) {
324 free_perm_bits(perm);
325 return -ENOMEM;
326 }
327
328 perm->readfn = vfio_default_config_read;
329 perm->writefn = vfio_default_config_write;
330
331 return 0;
332}
333
334/*
335 * Helper functions for filling in permission tables
336 */
337static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
338{
339 p->virt[off] = virt;
340 p->write[off] = write;
341}
342
343/* Handle endian-ness - pci and tables are little-endian */
344static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
345{
346 *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
347 *(__le16 *)(&p->write[off]) = cpu_to_le16(write);
348}
349
350/* Handle endian-ness - pci and tables are little-endian */
351static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
352{
353 *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
354 *(__le32 *)(&p->write[off]) = cpu_to_le32(write);
355}
356
357/*
358 * Restore the *real* BARs after we detect a FLR or backdoor reset.
359 * (backdoor = some device specific technique that we didn't catch)
360 */
361static void vfio_bar_restore(struct vfio_pci_device *vdev)
362{
363 struct pci_dev *pdev = vdev->pdev;
364 u32 *rbar = vdev->rbar;
365 int i;
366
367 if (pdev->is_virtfn)
368 return;
369
370 pr_info("%s: %s reset recovery - restoring bars\n",
371 __func__, dev_name(&pdev->dev));
372
373 for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
374 pci_user_write_config_dword(pdev, i, *rbar);
375
376 pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
377}
378
379static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
380{
381 unsigned long flags = pci_resource_flags(pdev, bar);
382 u32 val;
383
384 if (flags & IORESOURCE_IO)
385 return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
386
387 val = PCI_BASE_ADDRESS_SPACE_MEMORY;
388
389 if (flags & IORESOURCE_PREFETCH)
390 val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
391
392 if (flags & IORESOURCE_MEM_64)
393 val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
394
395 return cpu_to_le32(val);
396}
397
398/*
399 * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
400 * to reflect the hardware capabilities. This implements BAR sizing.
401 */
402static void vfio_bar_fixup(struct vfio_pci_device *vdev)
403{
404 struct pci_dev *pdev = vdev->pdev;
405 int i;
406 __le32 *bar;
407 u64 mask;
408
409 bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
410
411 for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
412 if (!pci_resource_start(pdev, i)) {
413 *bar = 0; /* Unmapped by host = unimplemented to user */
414 continue;
415 }
416
417 mask = ~(pci_resource_len(pdev, i) - 1);
418
419 *bar &= cpu_to_le32((u32)mask);
420 *bar |= vfio_generate_bar_flags(pdev, i);
421
422 if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
423 bar++;
424 *bar &= cpu_to_le32((u32)(mask >> 32));
425 i++;
426 }
427 }
428
429 bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
430
431 /*
432 * NB. we expose the actual BAR size here, regardless of whether
433 * we can read it. When we report the REGION_INFO for the ROM
434 * we report what PCI tells us is the actual ROM size.
435 */
436 if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
437 mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
438 mask |= PCI_ROM_ADDRESS_ENABLE;
439 *bar &= cpu_to_le32((u32)mask);
440 } else
441 *bar = 0;
442
443 vdev->bardirty = false;
444}
445
446static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
447 int count, struct perm_bits *perm,
448 int offset, __le32 *val)
449{
450 if (is_bar(offset)) /* pos == offset for basic config */
451 vfio_bar_fixup(vdev);
452
453 count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
454
455 /* Mask in virtual memory enable for SR-IOV devices */
456 if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
457 u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
458 u32 tmp_val = le32_to_cpu(*val);
459
460 tmp_val |= cmd & PCI_COMMAND_MEMORY;
461 *val = cpu_to_le32(tmp_val);
462 }
463
464 return count;
465}
466
467static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
468 int count, struct perm_bits *perm,
469 int offset, __le32 val)
470{
471 struct pci_dev *pdev = vdev->pdev;
472 __le16 *virt_cmd;
473 u16 new_cmd = 0;
474 int ret;
475
476 virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND];
477
478 if (offset == PCI_COMMAND) {
479 bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io;
480 u16 phys_cmd;
481
482 ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd);
483 if (ret)
484 return ret;
485
486 new_cmd = le32_to_cpu(val);
487
488 phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY);
489 virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
490 new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
491
492 phys_io = !!(phys_cmd & PCI_COMMAND_IO);
493 virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO);
494 new_io = !!(new_cmd & PCI_COMMAND_IO);
495
496 /*
497 * If the user is writing mem/io enable (new_mem/io) and we
498 * think it's already enabled (virt_mem/io), but the hardware
499	 * shows it disabled (phys_mem/io), then the device has
500 * undergone some kind of backdoor reset and needs to be
501 * restored before we allow it to enable the bars.
502 * SR-IOV devices will trigger this, but we catch them later
503 */
504 if ((new_mem && virt_mem && !phys_mem) ||
505 (new_io && virt_io && !phys_io))
506 vfio_bar_restore(vdev);
507 }
508
509 count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
510 if (count < 0)
511 return count;
512
513 /*
514 * Save current memory/io enable bits in vconfig to allow for
515 * the test above next time.
516 */
517 if (offset == PCI_COMMAND) {
518 u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO;
519
520 *virt_cmd &= cpu_to_le16(~mask);
521 *virt_cmd |= cpu_to_le16(new_cmd & mask);
522 }
523
524 /* Emulate INTx disable */
525 if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) {
526 bool virt_intx_disable;
527
528 virt_intx_disable = !!(le16_to_cpu(*virt_cmd) &
529 PCI_COMMAND_INTX_DISABLE);
530
531 if (virt_intx_disable && !vdev->virq_disabled) {
532 vdev->virq_disabled = true;
533 vfio_pci_intx_mask(vdev);
534 } else if (!virt_intx_disable && vdev->virq_disabled) {
535 vdev->virq_disabled = false;
536 vfio_pci_intx_unmask(vdev);
537 }
538 }
539
540 if (is_bar(offset))
541 vdev->bardirty = true;
542
543 return count;
544}
545
546/* Permissions for the Basic PCI Header */
547static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
548{
549 if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF))
550 return -ENOMEM;
551
552 perm->readfn = vfio_basic_config_read;
553 perm->writefn = vfio_basic_config_write;
554
555 /* Virtualized for SR-IOV functions, which just have FFFF */
556 p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE);
557 p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE);
558
559 /*
560 * Virtualize INTx disable, we use it internally for interrupt
561 * control and can emulate it for non-PCI 2.3 devices.
562 */
563 p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE);
564
565 /* Virtualize capability list, we might want to skip/disable */
566 p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE);
567
568 /* No harm to write */
569 p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE);
570 p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE);
571 p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE);
572
573 /* Virtualize all bars, can't touch the real ones */
574 p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE);
575 p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE);
576 p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE);
577 p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE);
578 p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE);
579 p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE);
580 p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE);
581
582 /* Allow us to adjust capability chain */
583 p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE);
584
585 /* Sometimes used by sw, just virtualize */
586 p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE);
587 return 0;
588}
589
590/* Permissions for the Power Management capability */
591static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
592{
593 if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
594 return -ENOMEM;
595
596 /*
597 * We always virtualize the next field so we can remove
598 * capabilities from the chain if we want to.
599 */
600 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
601
602 /*
603 * Power management is defined *per function*,
604 * so we let the user write this
605 */
606 p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE);
607 return 0;
608}
609
610/* Permissions for PCI-X capability */
611static int __init init_pci_cap_pcix_perm(struct perm_bits *perm)
612{
613 /* Alloc 24, but only 8 are used in v0 */
614 if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2))
615 return -ENOMEM;
616
617 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
618
619 p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE);
620 p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE);
621 return 0;
622}
623
624/* Permissions for PCI Express capability */
625static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
626{
627 /* Alloc larger of two possible sizes */
628 if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
629 return -ENOMEM;
630
631 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
632
633 /*
634 * Allow writes to device control fields (includes FLR!)
635 * but not to devctl_phantom which could confuse IOMMU
636 * or to the ARI bit in devctl2 which is set at probe time
637 */
638 p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM);
639 p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
640 return 0;
641}
642
643/* Permissions for Advanced Function capability */
644static int __init init_pci_cap_af_perm(struct perm_bits *perm)
645{
646 if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
647 return -ENOMEM;
648
649 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
650 p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR);
651 return 0;
652}
653
654/* Permissions for Advanced Error Reporting extended capability */
655static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm)
656{
657 u32 mask;
658
659 if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR]))
660 return -ENOMEM;
661
662 /*
663 * Virtualize the first dword of all express capabilities
664 * because it includes the next pointer. This lets us later
665 * remove capabilities from the chain if we need to.
666 */
667 p_setd(perm, 0, ALL_VIRT, NO_WRITE);
668
669 /* Writable bits mask */
670 mask = PCI_ERR_UNC_TRAIN | /* Training */
671 PCI_ERR_UNC_DLP | /* Data Link Protocol */
672 PCI_ERR_UNC_SURPDN | /* Surprise Down */
673 PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */
674 PCI_ERR_UNC_FCP | /* Flow Control Protocol */
675 PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */
676 PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */
677 PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */
678 PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */
679 PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */
680 PCI_ERR_UNC_ECRC | /* ECRC Error Status */
681 PCI_ERR_UNC_UNSUP | /* Unsupported Request */
682 PCI_ERR_UNC_ACSV | /* ACS Violation */
683 PCI_ERR_UNC_INTN | /* internal error */
684 PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */
685 PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */
686 PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */
687 p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask);
688 p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask);
689 p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask);
690
691 mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */
692 PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */
693 PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */
694 PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */
695 PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */
696 PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */
697 PCI_ERR_COR_INTERNAL | /* Corrected Internal */
698 PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */
699 p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask);
700 p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask);
701
702 mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */
703 PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */
704 p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask);
705 return 0;
706}
707
708/* Permissions for Power Budgeting extended capability */
709static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm)
710{
711 if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR]))
712 return -ENOMEM;
713
714 p_setd(perm, 0, ALL_VIRT, NO_WRITE);
715
716 /* Writing the data selector is OK, the info is still read-only */
717 p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE);
718 return 0;
719}
720
721/*
722 * Initialize and free the shared permission tables
723 */
724void vfio_pci_uninit_perm_bits(void)
725{
726 free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]);
727
728 free_perm_bits(&cap_perms[PCI_CAP_ID_PM]);
729 free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]);
730 free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]);
731 free_perm_bits(&cap_perms[PCI_CAP_ID_AF]);
732
733 free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
734 free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
735}
736
737int __init vfio_pci_init_perm_bits(void)
738{
739 int ret;
740
741 /* Basic config space */
742 ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]);
743
744 /* Capabilities */
745 ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
746 cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
747 ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
748 cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
749 ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
750 ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
751
752 /* Extended capabilities */
753 ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
754 ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
755 ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
756
757 if (ret)
758 vfio_pci_uninit_perm_bits();
759
760 return ret;
761}
762
763static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
764{
765 u8 cap;
766 int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
767 PCI_STD_HEADER_SIZEOF;
768 base /= 4;
769 pos /= 4;
770
771 cap = vdev->pci_config_map[pos];
772
773 if (cap == PCI_CAP_ID_BASIC)
774 return 0;
775
776 /* XXX Can we have two abutting capabilities of the same type? */
777 while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
778 pos--;
779
780 return pos * 4;
781}
782
783static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
784 int count, struct perm_bits *perm,
785 int offset, __le32 *val)
786{
787 /* Update max available queue size from msi_qmax */
788 if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
789 __le16 *flags;
790 int start;
791
792 start = vfio_find_cap_start(vdev, pos);
793
794 flags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
795
796 *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK);
797 *flags |= cpu_to_le16(vdev->msi_qmax << 1);
798 }
799
800 return vfio_default_config_read(vdev, pos, count, perm, offset, val);
801}
802
803static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos,
804 int count, struct perm_bits *perm,
805 int offset, __le32 val)
806{
807 count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
808 if (count < 0)
809 return count;
810
811 /* Fixup and write configured queue size and enable to hardware */
812 if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
813 __le16 *pflags;
814 u16 flags;
815 int start, ret;
816
817 start = vfio_find_cap_start(vdev, pos);
818
819 pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
820
821 flags = le16_to_cpu(*pflags);
822
823 /* MSI is enabled via ioctl */
824 if (!is_msi(vdev))
825 flags &= ~PCI_MSI_FLAGS_ENABLE;
826
827 /* Check queue size */
828 if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) {
829 flags &= ~PCI_MSI_FLAGS_QSIZE;
830 flags |= vdev->msi_qmax << 4;
831 }
832
833 /* Write back to virt and to hardware */
834 *pflags = cpu_to_le16(flags);
835 ret = pci_user_write_config_word(vdev->pdev,
836 start + PCI_MSI_FLAGS,
837 flags);
838 if (ret)
839 return pcibios_err_to_errno(ret);
840 }
841
842 return count;
843}
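A worked example of the queue size clamp above (values assumed for illustration): if the user writes Multiple Message Enable = 3, requesting 8 vectors, but only 2 vectors were allocated when MSI was enabled (msi_qmax = 1), then (flags & PCI_MSI_FLAGS_QSIZE) >> 4 evaluates to 3 > 1, so the field is rewritten to 1 (2 vectors) before the control word is pushed to hardware.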
844
845/*
846 * MSI determination is per-device, so this routine gets used beyond
847 * initialization time. Don't add __init
848 */
849static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags)
850{
851 if (alloc_perm_bits(perm, len))
852 return -ENOMEM;
853
854 perm->readfn = vfio_msi_config_read;
855 perm->writefn = vfio_msi_config_write;
856
857 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
858
859 /*
860 * The upper byte of the control register is reserved,
861 * just set up the lower byte.
862 */
863 p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE);
864 p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE);
865 if (flags & PCI_MSI_FLAGS_64BIT) {
866 p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE);
867 p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE);
868 if (flags & PCI_MSI_FLAGS_MASKBIT) {
869 p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE);
870 p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE);
871 }
872 } else {
873 p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE);
874 if (flags & PCI_MSI_FLAGS_MASKBIT) {
875 p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE);
876 p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE);
877 }
878 }
879 return 0;
880}
881
882/* Determine MSI CAP field length; initialize msi_perm on 1st call per vdev */
883static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos)
884{
885 struct pci_dev *pdev = vdev->pdev;
886 int len, ret;
887 u16 flags;
888
889 ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
890 if (ret)
891 return pcibios_err_to_errno(ret);
892
893 len = 10; /* Minimum size */
894 if (flags & PCI_MSI_FLAGS_64BIT)
895 len += 4;
896 if (flags & PCI_MSI_FLAGS_MASKBIT)
897 len += 10;
898
899 if (vdev->msi_perm)
900 return len;
901
902 vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL);
903 if (!vdev->msi_perm)
904 return -ENOMEM;
905
906 ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
907 if (ret)
908 return ret;
909
910 return len;
911}
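For reference, the lengths computed above match the MSI capability layouts: 10 bytes for the 32-bit format, 14 (10 + 4) for the 64-bit format, 20 (10 + 10) for 32-bit with per-vector masking, and 24 (10 + 4 + 10) for 64-bit with per-vector masking, where the extra 10 bytes cover the mask and pending registers. The same length sizes the perm_bits allocation done by init_pci_cap_msi_perm().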
912
913/* Determine extended capability length for VC (2 & 9) and MFVC */
914static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos)
915{
916 struct pci_dev *pdev = vdev->pdev;
917 u32 tmp;
918 int ret, evcc, phases, vc_arb;
919 int len = PCI_CAP_VC_BASE_SIZEOF;
920
921 ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp);
922 if (ret)
923 return pcibios_err_to_errno(ret);
924
925 evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */
926 ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp);
927 if (ret)
928 return pcibios_err_to_errno(ret);
929
930 if (tmp & PCI_VC_REG2_128_PHASE)
931 phases = 128;
932 else if (tmp & PCI_VC_REG2_64_PHASE)
933 phases = 64;
934 else if (tmp & PCI_VC_REG2_32_PHASE)
935 phases = 32;
936 else
937 phases = 0;
938
939 vc_arb = phases * 4;
940
941 /*
942 * Port arbitration tables are root & switch only;
943 * function arbitration tables are function 0 only.
944 * In either case, we'll never let the user write them, so
945 * we don't care how big they are
946 */
947 len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF;
948 if (vc_arb) {
949 len = round_up(len, 16);
950 len += vc_arb / 8;
951 }
952 return len;
953}
954
955static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
956{
957 struct pci_dev *pdev = vdev->pdev;
958 u16 word;
959 u8 byte;
960 int ret;
961
962 switch (cap) {
963 case PCI_CAP_ID_MSI:
964 return vfio_msi_cap_len(vdev, pos);
965 case PCI_CAP_ID_PCIX:
966 ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word);
967 if (ret)
968 return pcibios_err_to_errno(ret);
969
970 if (PCI_X_CMD_VERSION(word)) {
971 vdev->extended_caps = true;
972 return PCI_CAP_PCIX_SIZEOF_V2;
973 } else
974 return PCI_CAP_PCIX_SIZEOF_V0;
975 case PCI_CAP_ID_VNDR:
976 /* length follows next field */
977 ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte);
978 if (ret)
979 return pcibios_err_to_errno(ret);
980
981 return byte;
982 case PCI_CAP_ID_EXP:
983 /* length based on version */
984 ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
985 if (ret)
986 return pcibios_err_to_errno(ret);
987
988 if ((word & PCI_EXP_FLAGS_VERS) == 1)
989 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
990 else {
991 vdev->extended_caps = true;
992 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
993 }
994 case PCI_CAP_ID_HT:
995 ret = pci_read_config_byte(pdev, pos + 3, &byte);
996 if (ret)
997 return pcibios_err_to_errno(ret);
998
999 return (byte & HT_3BIT_CAP_MASK) ?
1000 HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG;
1001 case PCI_CAP_ID_SATA:
1002 ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte);
1003 if (ret)
1004 return pcibios_err_to_errno(ret);
1005
1006 byte &= PCI_SATA_REGS_MASK;
1007 if (byte == PCI_SATA_REGS_INLINE)
1008 return PCI_SATA_SIZEOF_LONG;
1009 else
1010 return PCI_SATA_SIZEOF_SHORT;
1011 default:
1012 pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n",
1013 dev_name(&pdev->dev), __func__, cap, pos);
1014 }
1015
1016 return 0;
1017}
1018
1019static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos)
1020{
1021 struct pci_dev *pdev = vdev->pdev;
1022 u8 byte;
1023 u32 dword;
1024 int ret;
1025
1026 switch (ecap) {
1027 case PCI_EXT_CAP_ID_VNDR:
1028 ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
1029 if (ret)
1030 return pcibios_err_to_errno(ret);
1031
1032 return dword >> PCI_VSEC_HDR_LEN_SHIFT;
1033 case PCI_EXT_CAP_ID_VC:
1034 case PCI_EXT_CAP_ID_VC9:
1035 case PCI_EXT_CAP_ID_MFVC:
1036 return vfio_vc_cap_len(vdev, epos);
1037 case PCI_EXT_CAP_ID_ACS:
1038 ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte);
1039 if (ret)
1040 return pcibios_err_to_errno(ret);
1041
1042 if (byte & PCI_ACS_EC) {
1043 int bits;
1044
1045 ret = pci_read_config_byte(pdev,
1046 epos + PCI_ACS_EGRESS_BITS,
1047 &byte);
1048 if (ret)
1049 return pcibios_err_to_errno(ret);
1050
1051 bits = byte ? round_up(byte, 32) : 256;
1052 return 8 + (bits / 8);
1053 }
1054 return 8;
1055
1056 case PCI_EXT_CAP_ID_REBAR:
1057 ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte);
1058 if (ret)
1059 return pcibios_err_to_errno(ret);
1060
1061 byte &= PCI_REBAR_CTRL_NBAR_MASK;
1062 byte >>= PCI_REBAR_CTRL_NBAR_SHIFT;
1063
1064 return 4 + (byte * 8);
1065 case PCI_EXT_CAP_ID_DPA:
1066 ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte);
1067 if (ret)
1068 return pcibios_err_to_errno(ret);
1069
1070 byte &= PCI_DPA_CAP_SUBSTATE_MASK;
1071 byte = round_up(byte + 1, 4);
1072 return PCI_DPA_BASE_SIZEOF + byte;
1073 case PCI_EXT_CAP_ID_TPH:
1074 ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword);
1075 if (ret)
1076 return pcibios_err_to_errno(ret);
1077
1078 if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) {
1079 int sts;
1080
1081 sts = dword & PCI_TPH_CAP_ST_MASK;
1082 sts >>= PCI_TPH_CAP_ST_SHIFT;
1083 return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4);
1084 }
1085 return PCI_TPH_BASE_SIZEOF;
1086 default:
1087 pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n",
1088 dev_name(&pdev->dev), __func__, ecap, epos);
1089 }
1090
1091 return 0;
1092}
1093
1094static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev,
1095 int offset, int size)
1096{
1097 struct pci_dev *pdev = vdev->pdev;
1098 int ret = 0;
1099
1100 /*
1101 * We try to read physical config space in the largest chunks
1102 * we can, assuming that all of the fields support dword access.
1103 * pci_save_state() makes this same assumption and seems to do ok.
1104 */
1105 while (size) {
1106 int filled;
1107
1108 if (size >= 4 && !(offset % 4)) {
1109 __le32 *dwordp = (__le32 *)&vdev->vconfig[offset];
1110 u32 dword;
1111
1112 ret = pci_read_config_dword(pdev, offset, &dword);
1113 if (ret)
1114 return ret;
1115 *dwordp = cpu_to_le32(dword);
1116 filled = 4;
1117 } else if (size >= 2 && !(offset % 2)) {
1118 __le16 *wordp = (__le16 *)&vdev->vconfig[offset];
1119 u16 word;
1120
1121 ret = pci_read_config_word(pdev, offset, &word);
1122 if (ret)
1123 return ret;
1124 *wordp = cpu_to_le16(word);
1125 filled = 2;
1126 } else {
1127 u8 *byte = &vdev->vconfig[offset];
1128 ret = pci_read_config_byte(pdev, offset, byte);
1129 if (ret)
1130 return ret;
1131 filled = 1;
1132 }
1133
1134 offset += filled;
1135 size -= filled;
1136 }
1137
1138 return ret;
1139}
1140
1141static int vfio_cap_init(struct vfio_pci_device *vdev)
1142{
1143 struct pci_dev *pdev = vdev->pdev;
1144 u8 *map = vdev->pci_config_map;
1145 u16 status;
1146 u8 pos, *prev, cap;
1147 int loops, ret, caps = 0;
1148
1149 /* Any capabilities? */
1150 ret = pci_read_config_word(pdev, PCI_STATUS, &status);
1151 if (ret)
1152 return ret;
1153
1154 if (!(status & PCI_STATUS_CAP_LIST))
1155 return 0; /* Done */
1156
1157 ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
1158 if (ret)
1159 return ret;
1160
1161 /* Mark the previous position in case we want to skip a capability */
1162 prev = &vdev->vconfig[PCI_CAPABILITY_LIST];
1163
1164 /* We can bound our loop, capabilities are dword aligned */
1165 loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF;
1166 while (pos && loops--) {
1167 u8 next;
1168 int i, len = 0;
1169
1170 ret = pci_read_config_byte(pdev, pos, &cap);
1171 if (ret)
1172 return ret;
1173
1174 ret = pci_read_config_byte(pdev,
1175 pos + PCI_CAP_LIST_NEXT, &next);
1176 if (ret)
1177 return ret;
1178
1179 if (cap <= PCI_CAP_ID_MAX) {
1180 len = pci_cap_length[cap];
1181 if (len == 0xFF) { /* Variable length */
1182 len = vfio_cap_len(vdev, cap, pos);
1183 if (len < 0)
1184 return len;
1185 }
1186 }
1187
1188 if (!len) {
1189 pr_info("%s: %s hiding cap 0x%x\n",
1190 __func__, dev_name(&pdev->dev), cap);
1191 *prev = next;
1192 pos = next;
1193 continue;
1194 }
1195
1196 /* Sanity check, do we overlap other capabilities? */
1197 for (i = 0; i < len; i += 4) {
1198 if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID))
1199 continue;
1200
1201 pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
1202 __func__, dev_name(&pdev->dev),
1203 pos + i, map[(pos + i) / 4], cap);
1204 }
1205
1206 memset(map + (pos / 4), cap, len / 4);
1207 ret = vfio_fill_vconfig_bytes(vdev, pos, len);
1208 if (ret)
1209 return ret;
1210
1211 prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT];
1212 pos = next;
1213 caps++;
1214 }
1215
1216 /* If we didn't fill any capabilities, clear the status flag */
1217 if (!caps) {
1218 __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS];
1219 *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST);
1220 }
1221
1222 return 0;
1223}
1224
1225static int vfio_ecap_init(struct vfio_pci_device *vdev)
1226{
1227 struct pci_dev *pdev = vdev->pdev;
1228 u8 *map = vdev->pci_config_map;
1229 u16 epos;
1230 __le32 *prev = NULL;
1231 int loops, ret, ecaps = 0;
1232
1233 if (!vdev->extended_caps)
1234 return 0;
1235
1236 epos = PCI_CFG_SPACE_SIZE;
1237
1238 loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF;
1239
1240 while (loops-- && epos >= PCI_CFG_SPACE_SIZE) {
1241 u32 header;
1242 u16 ecap;
1243 int i, len = 0;
1244 bool hidden = false;
1245
1246 ret = pci_read_config_dword(pdev, epos, &header);
1247 if (ret)
1248 return ret;
1249
1250 ecap = PCI_EXT_CAP_ID(header);
1251
1252 if (ecap <= PCI_EXT_CAP_ID_MAX) {
1253 len = pci_ext_cap_length[ecap];
1254 if (len == 0xFF) {
1255 len = vfio_ext_cap_len(vdev, ecap, epos);
1256 if (len < 0)
1257 return len;
1258 }
1259 }
1260
1261 if (!len) {
1262 pr_info("%s: %s hiding ecap 0x%x@0x%x\n",
1263 __func__, dev_name(&pdev->dev), ecap, epos);
1264
1265 /* If not the first in the chain, we can skip over it */
1266 if (prev) {
1267 u32 val = epos = PCI_EXT_CAP_NEXT(header);
1268 *prev &= cpu_to_le32(~(0xffcU << 20));
1269 *prev |= cpu_to_le32(val << 20);
1270 continue;
1271 }
1272
1273 /*
1274 * Otherwise, fill in a placeholder, the direct
1275 * readfn will virtualize this automatically
1276 */
1277 len = PCI_CAP_SIZEOF;
1278 hidden = true;
1279 }
1280
1281 for (i = 0; i < len; i += 4) {
1282 if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID))
1283 continue;
1284
1285 pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
1286 __func__, dev_name(&pdev->dev),
1287 epos + i, map[(epos + i) / 4], ecap);
1288 }
1289
1290 /*
1291 * Even though ecap IDs are 2 bytes, we're currently a long way
1292 * from exceeding 1-byte values. If they ever reach 0xFF we'll
1293 * need to grow the map to two bytes per dword.
1294 */
1295 BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
1296
1297 memset(map + (epos / 4), ecap, len / 4);
1298 ret = vfio_fill_vconfig_bytes(vdev, epos, len);
1299 if (ret)
1300 return ret;
1301
1302 /*
1303 * If we're just using this capability to anchor the list,
1304 * hide the real ID. Only count real ecaps. XXX The PCI spec
1305 * says to use cap id = 0, version = 0, next = 0 when no ecaps
1306 * are present; hopefully users check the next pointer too.
1307 */
1308 if (hidden)
1309 *(__le32 *)&vdev->vconfig[epos] &=
1310 cpu_to_le32((0xffcU << 20));
1311 else
1312 ecaps++;
1313
1314 prev = (__le32 *)&vdev->vconfig[epos];
1315 epos = PCI_EXT_CAP_NEXT(header);
1316 }
1317
1318 if (!ecaps)
1319 *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0;
1320
1321 return 0;
1322}
1323
1324/*
1325 * For each device we allocate a pci_config_map that indicates the
1326 * capability occupying each dword and thus the struct perm_bits we
1327 * use for read and write. We also allocate a virtualized config
1328 * space which tracks reads and writes to bits that we emulate for
1329 * the user. Initial values filled from device.
1330 *
1331 * Using shared struct perm_bits between all vfio-pci devices saves
1332 * us from allocating cfg_size buffers for virt and write for every
1333 * device. We could remove vconfig and allocate individual buffers
1334 * for each area requiring emulated bits, but the array of pointers
1335 * would be comparable in size (at least for standard config space).
1336 */
1337int vfio_config_init(struct vfio_pci_device *vdev)
1338{
1339 struct pci_dev *pdev = vdev->pdev;
1340 u8 *map, *vconfig;
1341 int ret;
1342
1343 /*
1344 * Config space, caps and ecaps are all dword aligned, so we can
1345 * use one byte per dword to record the type.
1346 */
1347 map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL);
1348 if (!map)
1349 return -ENOMEM;
1350
1351 vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL);
1352 if (!vconfig) {
1353 kfree(map);
1354 return -ENOMEM;
1355 }
1356
1357 vdev->pci_config_map = map;
1358 vdev->vconfig = vconfig;
1359
1360 memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4);
1361 memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID,
1362 (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4);
1363
1364 ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
1365 if (ret)
1366 goto out;
1367
1368 vdev->bardirty = true;
1369
1370 /*
1371 * XXX can we just pci_load_saved_state/pci_restore_state?
1372 * may need to rebuild vconfig after that
1373 */
1374
1375 /* For restore after reset */
1376 vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]);
1377 vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]);
1378 vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]);
1379 vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]);
1380 vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]);
1381 vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]);
1382 vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]);
1383
1384 if (pdev->is_virtfn) {
1385 *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor);
1386 *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
1387 }
1388
1389 ret = vfio_cap_init(vdev);
1390 if (ret)
1391 goto out;
1392
1393 ret = vfio_ecap_init(vdev);
1394 if (ret)
1395 goto out;
1396
1397 return 0;
1398
1399out:
1400 kfree(map);
1401 vdev->pci_config_map = NULL;
1402 kfree(vconfig);
1403 vdev->vconfig = NULL;
1404 return pcibios_err_to_errno(ret);
1405}
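The comment above vfio_config_init() describes the byte-per-dword pci_config_map; a hedged illustration for a hypothetical device with a 24-byte MSI capability (64-bit, per-vector masking) at config offset 0x50:

	/* map[0x00 .. 0x0f] = PCI_CAP_ID_BASIC    (standard header, 0x00-0x3f)
	 * map[0x14 .. 0x19] = PCI_CAP_ID_MSI      (config 0x50-0x67, six dwords)
	 * all other entries = PCI_CAP_ID_INVALID  (reads return 0, writes dropped)
	 */

vfio_config_do_rw() below indexes map[*ppos / 4] to pick the struct perm_bits applied to each access.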
1406
1407void vfio_config_free(struct vfio_pci_device *vdev)
1408{
1409 kfree(vdev->vconfig);
1410 vdev->vconfig = NULL;
1411 kfree(vdev->pci_config_map);
1412 vdev->pci_config_map = NULL;
1413 kfree(vdev->msi_perm);
1414 vdev->msi_perm = NULL;
1415}
1416
1417static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
1418 size_t count, loff_t *ppos, bool iswrite)
1419{
1420 struct pci_dev *pdev = vdev->pdev;
1421 struct perm_bits *perm;
1422 __le32 val = 0;
1423 int cap_start = 0, offset;
1424 u8 cap_id;
1425 ssize_t ret = count;
1426
1427 if (*ppos < 0 || *ppos + count > pdev->cfg_size)
1428 return -EFAULT;
1429
1430 /*
1431 * gcc can't seem to figure out that this static function is only called
1432 * with count of 1/2/4, and hits copy_from_user_overflow without this check.
1433 */
1434 if (count > sizeof(val))
1435 return -EINVAL;
1436
1437 cap_id = vdev->pci_config_map[*ppos / 4];
1438
1439 if (cap_id == PCI_CAP_ID_INVALID) {
1440 if (iswrite)
1441 return ret; /* drop */
1442
1443 /*
1444 * Per PCI spec 3.0, section 6.1, reads from reserved and
1445 * unimplemented registers return 0
1446 */
1447 if (copy_to_user(buf, &val, count))
1448 return -EFAULT;
1449
1450 return ret;
1451 }
1452
1453 /*
1454 * All capabilities are minimum 4 bytes and aligned on dword
1455 * boundaries. Since we don't support unaligned accesses, we're
1456 * only ever accessing a single capability.
1457 */
1458 if (*ppos >= PCI_CFG_SPACE_SIZE) {
1459 WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
1460
1461 perm = &ecap_perms[cap_id];
1462 cap_start = vfio_find_cap_start(vdev, *ppos);
1463
1464 } else {
1465 WARN_ON(cap_id > PCI_CAP_ID_MAX);
1466
1467 perm = &cap_perms[cap_id];
1468
1469 if (cap_id == PCI_CAP_ID_MSI)
1470 perm = vdev->msi_perm;
1471
1472 if (cap_id > PCI_CAP_ID_BASIC)
1473 cap_start = vfio_find_cap_start(vdev, *ppos);
1474 }
1475
1476 WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
1477 WARN_ON(cap_start > *ppos);
1478
1479 offset = *ppos - cap_start;
1480
1481 if (iswrite) {
1482 if (!perm->writefn)
1483 return ret;
1484
1485 if (copy_from_user(&val, buf, count))
1486 return -EFAULT;
1487
1488 ret = perm->writefn(vdev, *ppos, count, perm, offset, val);
1489 } else {
1490 if (perm->readfn) {
1491 ret = perm->readfn(vdev, *ppos, count,
1492 perm, offset, &val);
1493 if (ret < 0)
1494 return ret;
1495 }
1496
1497 if (copy_to_user(buf, &val, count))
1498 return -EFAULT;
1499 }
1500
1501 return ret;
1502}
1503
1504ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
1505 char __user *buf, size_t count,
1506 loff_t *ppos, bool iswrite)
1507{
1508 size_t done = 0;
1509 int ret = 0;
1510 loff_t pos = *ppos;
1511
1512 pos &= VFIO_PCI_OFFSET_MASK;
1513
1514 /*
1515 * We want both to preserve the access size the caller uses and to
1516 * support reading large chunks of config space in a single call.
1517 * PCI doesn't support unaligned accesses, so we can safely break
1518 * those apart.
1519 */
1520 while (count) {
1521 if (count >= 4 && !(pos % 4))
1522 ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
1523 else if (count >= 2 && !(pos % 2))
1524 ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
1525 else
1526 ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
1527
1528 if (ret < 0)
1529 return ret;
1530
1531 count -= ret;
1532 done += ret;
1533 buf += ret;
1534 pos += ret;
1535 }
1536
1537 *ppos += done;
1538
1539 return done;
1540}
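The config read/write path above is reached from userspace through the device file descriptor at the offset advertised for the config region. A minimal sketch, assuming the uapi added by this series in linux/vfio.h; device_fd is a hypothetical descriptor already obtained via VFIO_GROUP_GET_DEVICE_FD:

	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Read the vendor/device IDs through the virtualized config space. */
	static int read_ids(int device_fd)
	{
		struct vfio_region_info reg = {
			.argsz = sizeof(reg),
			.index = VFIO_PCI_CONFIG_REGION_INDEX,
		};
		uint32_t ids;

		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg))
			return -1;

		/* Aligned 4-byte read; vfio_pci_config_readwrite() splits larger
		 * or unaligned accesses into 1/2/4 byte chunks internally. */
		if (pread(device_fd, &ids, sizeof(ids), reg.offset) != sizeof(ids))
			return -1;

		printf("vendor 0x%04x device 0x%04x\n", ids & 0xffff, ids >> 16);
		return 0;
	}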
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
new file mode 100644
index 000000000000..211a4920b88a
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -0,0 +1,740 @@
1/*
2 * VFIO PCI interrupt handling
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/device.h>
17#include <linux/interrupt.h>
18#include <linux/eventfd.h>
19#include <linux/pci.h>
20#include <linux/file.h>
21#include <linux/poll.h>
22#include <linux/vfio.h>
23#include <linux/wait.h>
24#include <linux/workqueue.h>
25
26#include "vfio_pci_private.h"
27
28/*
29 * IRQfd - generic
30 */
31struct virqfd {
32 struct vfio_pci_device *vdev;
33 struct eventfd_ctx *eventfd;
34 int (*handler)(struct vfio_pci_device *, void *);
35 void (*thread)(struct vfio_pci_device *, void *);
36 void *data;
37 struct work_struct inject;
38 wait_queue_t wait;
39 poll_table pt;
40 struct work_struct shutdown;
41 struct virqfd **pvirqfd;
42};
43
44static struct workqueue_struct *vfio_irqfd_cleanup_wq;
45
46int __init vfio_pci_virqfd_init(void)
47{
48 vfio_irqfd_cleanup_wq =
49 create_singlethread_workqueue("vfio-irqfd-cleanup");
50 if (!vfio_irqfd_cleanup_wq)
51 return -ENOMEM;
52
53 return 0;
54}
55
56void vfio_pci_virqfd_exit(void)
57{
58 destroy_workqueue(vfio_irqfd_cleanup_wq);
59}
60
61static void virqfd_deactivate(struct virqfd *virqfd)
62{
63 queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
64}
65
66static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
67{
68 struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
69 unsigned long flags = (unsigned long)key;
70
71 if (flags & POLLIN) {
72 /* An event has been signaled, call function */
73 if ((!virqfd->handler ||
74 virqfd->handler(virqfd->vdev, virqfd->data)) &&
75 virqfd->thread)
76 schedule_work(&virqfd->inject);
77 }
78
79 if (flags & POLLHUP)
80 /* The eventfd is closing, detach from VFIO */
81 virqfd_deactivate(virqfd);
82
83 return 0;
84}
85
86static void virqfd_ptable_queue_proc(struct file *file,
87 wait_queue_head_t *wqh, poll_table *pt)
88{
89 struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
90 add_wait_queue(wqh, &virqfd->wait);
91}
92
93static void virqfd_shutdown(struct work_struct *work)
94{
95 struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
96 struct virqfd **pvirqfd = virqfd->pvirqfd;
97 u64 cnt;
98
99 eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
100 flush_work(&virqfd->inject);
101 eventfd_ctx_put(virqfd->eventfd);
102
103 kfree(virqfd);
104 *pvirqfd = NULL;
105}
106
107static void virqfd_inject(struct work_struct *work)
108{
109 struct virqfd *virqfd = container_of(work, struct virqfd, inject);
110 if (virqfd->thread)
111 virqfd->thread(virqfd->vdev, virqfd->data);
112}
113
114static int virqfd_enable(struct vfio_pci_device *vdev,
115 int (*handler)(struct vfio_pci_device *, void *),
116 void (*thread)(struct vfio_pci_device *, void *),
117 void *data, struct virqfd **pvirqfd, int fd)
118{
119 struct file *file = NULL;
120 struct eventfd_ctx *ctx = NULL;
121 struct virqfd *virqfd;
122 int ret = 0;
123 unsigned int events;
124
125 if (*pvirqfd)
126 return -EBUSY;
127
128 virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
129 if (!virqfd)
130 return -ENOMEM;
131
132 virqfd->pvirqfd = pvirqfd;
133 *pvirqfd = virqfd;
134 virqfd->vdev = vdev;
135 virqfd->handler = handler;
136 virqfd->thread = thread;
137 virqfd->data = data;
138
139 INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
140 INIT_WORK(&virqfd->inject, virqfd_inject);
141
142 file = eventfd_fget(fd);
143 if (IS_ERR(file)) {
144 ret = PTR_ERR(file);
145 goto fail;
146 }
147
148 ctx = eventfd_ctx_fileget(file);
149 if (IS_ERR(ctx)) {
150 ret = PTR_ERR(ctx);
151 goto fail;
152 }
153
154 virqfd->eventfd = ctx;
155
156 /*
157 * Install our own custom wake-up handling so we are notified via
158 * a callback whenever someone signals the underlying eventfd.
159 */
160 init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
161 init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
162
163 events = file->f_op->poll(file, &virqfd->pt);
164
165 /*
166 * Check if there was an event already pending on the eventfd
168 * before we registered, and relay it now so it isn't missed.
168 */
169 if (events & POLLIN) {
170 if ((!handler || handler(vdev, data)) && thread)
171 schedule_work(&virqfd->inject);
172 }
173
174 /*
175 * Do not drop the file until the irqfd is fully initialized,
176 * otherwise we might race against the POLLHUP.
177 */
178 fput(file);
179
180 return 0;
181
182fail:
183 if (ctx && !IS_ERR(ctx))
184 eventfd_ctx_put(ctx);
185
186 if (file && !IS_ERR(file))
187 fput(file);
188
189 kfree(virqfd);
190 *pvirqfd = NULL;
191
192 return ret;
193}
194
195static void virqfd_disable(struct virqfd *virqfd)
196{
197 if (!virqfd)
198 return;
199
200 virqfd_deactivate(virqfd);
201
202 /* Block until we know all outstanding shutdown jobs have completed. */
203 flush_workqueue(vfio_irqfd_cleanup_wq);
204}
205
206/*
207 * INTx
208 */
209static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
210{
211 if (likely(is_intx(vdev) && !vdev->virq_disabled))
212 eventfd_signal(vdev->ctx[0].trigger, 1);
213}
214
215void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
216{
217 struct pci_dev *pdev = vdev->pdev;
218 unsigned long flags;
219
220 spin_lock_irqsave(&vdev->irqlock, flags);
221
222 /*
223 * Masking can come from interrupt, ioctl, or config space
224 * via INTx disable. The latter means this can get called
225 * even when not using intx delivery. In this case, just
226 * try to have the physical bit follow the virtual bit.
227 */
228 if (unlikely(!is_intx(vdev))) {
229 if (vdev->pci_2_3)
230 pci_intx(pdev, 0);
231 } else if (!vdev->ctx[0].masked) {
232 /*
233 * Can't use check_and_mask here because we always want to
234 * mask, not just when something is pending.
235 */
236 if (vdev->pci_2_3)
237 pci_intx(pdev, 0);
238 else
239 disable_irq_nosync(pdev->irq);
240
241 vdev->ctx[0].masked = true;
242 }
243
244 spin_unlock_irqrestore(&vdev->irqlock, flags);
245}
246
247/*
248 * If this is triggered by an eventfd, we can't call eventfd_signal
249 * or else we'll deadlock on the eventfd wait queue. Return >0 when
250 * a signal is necessary, which can then be handled via a work queue
251 * or directly depending on the caller.
252 */
253int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
254{
255 struct pci_dev *pdev = vdev->pdev;
256 unsigned long flags;
257 int ret = 0;
258
259 spin_lock_irqsave(&vdev->irqlock, flags);
260
261 /*
262 * Unmasking comes from ioctl or config, so again, have the
263 * physical bit follow the virtual even when not using INTx.
264 */
265 if (unlikely(!is_intx(vdev))) {
266 if (vdev->pci_2_3)
267 pci_intx(pdev, 1);
268 } else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
269 /*
270 * A pending interrupt here would immediately trigger,
271 * but we can avoid that overhead by just re-sending
272 * the interrupt to the user.
273 */
274 if (vdev->pci_2_3) {
275 if (!pci_check_and_unmask_intx(pdev))
276 ret = 1;
277 } else
278 enable_irq(pdev->irq);
279
280 vdev->ctx[0].masked = (ret > 0);
281 }
282
283 spin_unlock_irqrestore(&vdev->irqlock, flags);
284
285 return ret;
286}
287
288void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
289{
290 if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
291 vfio_send_intx_eventfd(vdev, NULL);
292}
293
294static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
295{
296 struct vfio_pci_device *vdev = dev_id;
297 unsigned long flags;
298 int ret = IRQ_NONE;
299
300 spin_lock_irqsave(&vdev->irqlock, flags);
301
302 if (!vdev->pci_2_3) {
303 disable_irq_nosync(vdev->pdev->irq);
304 vdev->ctx[0].masked = true;
305 ret = IRQ_HANDLED;
306 } else if (!vdev->ctx[0].masked && /* may be shared */
307 pci_check_and_mask_intx(vdev->pdev)) {
308 vdev->ctx[0].masked = true;
309 ret = IRQ_HANDLED;
310 }
311
312 spin_unlock_irqrestore(&vdev->irqlock, flags);
313
314 if (ret == IRQ_HANDLED)
315 vfio_send_intx_eventfd(vdev, NULL);
316
317 return ret;
318}
319
320static int vfio_intx_enable(struct vfio_pci_device *vdev)
321{
322 if (!is_irq_none(vdev))
323 return -EINVAL;
324
325 if (!vdev->pdev->irq)
326 return -ENODEV;
327
328 vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
329 if (!vdev->ctx)
330 return -ENOMEM;
331
332 vdev->num_ctx = 1;
333 vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
334
335 return 0;
336}
337
338static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
339{
340 struct pci_dev *pdev = vdev->pdev;
341 unsigned long irqflags = IRQF_SHARED;
342 struct eventfd_ctx *trigger;
343 unsigned long flags;
344 int ret;
345
346 if (vdev->ctx[0].trigger) {
347 free_irq(pdev->irq, vdev);
348 kfree(vdev->ctx[0].name);
349 eventfd_ctx_put(vdev->ctx[0].trigger);
350 vdev->ctx[0].trigger = NULL;
351 }
352
353 if (fd < 0) /* Disable only */
354 return 0;
355
356 vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
357 pci_name(pdev));
358 if (!vdev->ctx[0].name)
359 return -ENOMEM;
360
361 trigger = eventfd_ctx_fdget(fd);
362 if (IS_ERR(trigger)) {
363 kfree(vdev->ctx[0].name);
364 return PTR_ERR(trigger);
365 }
366
367 if (!vdev->pci_2_3)
368 irqflags = 0;
369
370 ret = request_irq(pdev->irq, vfio_intx_handler,
371 irqflags, vdev->ctx[0].name, vdev);
372 if (ret) {
373 kfree(vdev->ctx[0].name);
374 eventfd_ctx_put(trigger);
375 return ret;
376 }
377
378 vdev->ctx[0].trigger = trigger;
379
380 /*
381 * INTx disable will stick across the new irq setup,
382 * disable_irq won't.
383 */
384 spin_lock_irqsave(&vdev->irqlock, flags);
385 if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled))
386 disable_irq_nosync(pdev->irq);
387 spin_unlock_irqrestore(&vdev->irqlock, flags);
388
389 return 0;
390}
391
392static void vfio_intx_disable(struct vfio_pci_device *vdev)
393{
394 vfio_intx_set_signal(vdev, -1);
395 virqfd_disable(vdev->ctx[0].unmask);
396 virqfd_disable(vdev->ctx[0].mask);
397 vdev->irq_type = VFIO_PCI_NUM_IRQS;
398 vdev->num_ctx = 0;
399 kfree(vdev->ctx);
400}
401
402/*
403 * MSI/MSI-X
404 */
405static irqreturn_t vfio_msihandler(int irq, void *arg)
406{
407 struct eventfd_ctx *trigger = arg;
408
409 eventfd_signal(trigger, 1);
410 return IRQ_HANDLED;
411}
412
413static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
414{
415 struct pci_dev *pdev = vdev->pdev;
416 int ret;
417
418 if (!is_irq_none(vdev))
419 return -EINVAL;
420
421 vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
422 if (!vdev->ctx)
423 return -ENOMEM;
424
425 if (msix) {
426 int i;
427
428 vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
429 GFP_KERNEL);
430 if (!vdev->msix) {
431 kfree(vdev->ctx);
432 return -ENOMEM;
433 }
434
435 for (i = 0; i < nvec; i++)
436 vdev->msix[i].entry = i;
437
438 ret = pci_enable_msix(pdev, vdev->msix, nvec);
439 if (ret) {
440 kfree(vdev->msix);
441 kfree(vdev->ctx);
442 return ret;
443 }
444 } else {
445 ret = pci_enable_msi_block(pdev, nvec);
446 if (ret) {
447 kfree(vdev->ctx);
448 return ret;
449 }
450 }
451
452 vdev->num_ctx = nvec;
453 vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
454 VFIO_PCI_MSI_IRQ_INDEX;
455
456 if (!msix) {
457 /*
458 * Compute the virtual hardware field for max msi vectors -
459 * it is the log base 2 of the number of vectors.
460 */
461 vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
462 }
463
464 return 0;
465}
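The fls() expression above rounds nvec up to the next power of two and stores its log2, which is what the MSI Multiple Message encoding expects: nvec = 1 gives fls(1) - 1 = 0 (one vector), nvec = 3 gives fls(5) - 1 = 2 (rounded up to four vectors), and nvec = 8 gives fls(15) - 1 = 3 (eight vectors).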
466
467static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
468 int vector, int fd, bool msix)
469{
470 struct pci_dev *pdev = vdev->pdev;
471 int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
472 char *name = msix ? "vfio-msix" : "vfio-msi";
473 struct eventfd_ctx *trigger;
474 int ret;
475
476 if (vector >= vdev->num_ctx)
477 return -EINVAL;
478
479 if (vdev->ctx[vector].trigger) {
480 free_irq(irq, vdev->ctx[vector].trigger);
481 kfree(vdev->ctx[vector].name);
482 eventfd_ctx_put(vdev->ctx[vector].trigger);
483 vdev->ctx[vector].trigger = NULL;
484 }
485
486 if (fd < 0)
487 return 0;
488
489 vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
490 name, vector, pci_name(pdev));
491 if (!vdev->ctx[vector].name)
492 return -ENOMEM;
493
494 trigger = eventfd_ctx_fdget(fd);
495 if (IS_ERR(trigger)) {
496 kfree(vdev->ctx[vector].name);
497 return PTR_ERR(trigger);
498 }
499
500 ret = request_irq(irq, vfio_msihandler, 0,
501 vdev->ctx[vector].name, trigger);
502 if (ret) {
503 kfree(vdev->ctx[vector].name);
504 eventfd_ctx_put(trigger);
505 return ret;
506 }
507
508 vdev->ctx[vector].trigger = trigger;
509
510 return 0;
511}
512
513static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
514 unsigned count, int32_t *fds, bool msix)
515{
516 int i, j, ret = 0;
517
518 if (start + count > vdev->num_ctx)
519 return -EINVAL;
520
521 for (i = 0, j = start; i < count && !ret; i++, j++) {
522 int fd = fds ? fds[i] : -1;
523 ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
524 }
525
526 if (ret) {
527 for (--j; j >= (int)start; j--)
528 vfio_msi_set_vector_signal(vdev, j, -1, msix);
529 }
530
531 return ret;
532}
533
534static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
535{
536 struct pci_dev *pdev = vdev->pdev;
537 int i;
538
539 vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
540
541 for (i = 0; i < vdev->num_ctx; i++) {
542 virqfd_disable(vdev->ctx[i].unmask);
543 virqfd_disable(vdev->ctx[i].mask);
544 }
545
546 if (msix) {
547 pci_disable_msix(vdev->pdev);
548 kfree(vdev->msix);
549 } else
550 pci_disable_msi(pdev);
551
552 vdev->irq_type = VFIO_PCI_NUM_IRQS;
553 vdev->num_ctx = 0;
554 kfree(vdev->ctx);
555}
556
557/*
558 * IOCTL support
559 */
560static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
561 unsigned index, unsigned start,
562 unsigned count, uint32_t flags, void *data)
563{
564 if (!is_intx(vdev) || start != 0 || count != 1)
565 return -EINVAL;
566
567 if (flags & VFIO_IRQ_SET_DATA_NONE) {
568 vfio_pci_intx_unmask(vdev);
569 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
570 uint8_t unmask = *(uint8_t *)data;
571 if (unmask)
572 vfio_pci_intx_unmask(vdev);
573 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
574 int32_t fd = *(int32_t *)data;
575 if (fd >= 0)
576 return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
577 vfio_send_intx_eventfd, NULL,
578 &vdev->ctx[0].unmask, fd);
579
580 virqfd_disable(vdev->ctx[0].unmask);
581 }
582
583 return 0;
584}
585
586static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
587 unsigned index, unsigned start,
588 unsigned count, uint32_t flags, void *data)
589{
590 if (!is_intx(vdev) || start != 0 || count != 1)
591 return -EINVAL;
592
593 if (flags & VFIO_IRQ_SET_DATA_NONE) {
594 vfio_pci_intx_mask(vdev);
595 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
596 uint8_t mask = *(uint8_t *)data;
597 if (mask)
598 vfio_pci_intx_mask(vdev);
599 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
600 return -ENOTTY; /* XXX implement me */
601 }
602
603 return 0;
604}
605
606static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
607 unsigned index, unsigned start,
608 unsigned count, uint32_t flags, void *data)
609{
610 if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
611 vfio_intx_disable(vdev);
612 return 0;
613 }
614
615 if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
616 return -EINVAL;
617
618 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
619 int32_t fd = *(int32_t *)data;
620 int ret;
621
622 if (is_intx(vdev))
623 return vfio_intx_set_signal(vdev, fd);
624
625 ret = vfio_intx_enable(vdev);
626 if (ret)
627 return ret;
628
629 ret = vfio_intx_set_signal(vdev, fd);
630 if (ret)
631 vfio_intx_disable(vdev);
632
633 return ret;
634 }
635
636 if (!is_intx(vdev))
637 return -EINVAL;
638
639 if (flags & VFIO_IRQ_SET_DATA_NONE) {
640 vfio_send_intx_eventfd(vdev, NULL);
641 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
642 uint8_t trigger = *(uint8_t *)data;
643 if (trigger)
644 vfio_send_intx_eventfd(vdev, NULL);
645 }
646 return 0;
647}
648
649static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
650 unsigned index, unsigned start,
651 unsigned count, uint32_t flags, void *data)
652{
653 int i;
654 bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX);
655
656 if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
657 vfio_msi_disable(vdev, msix);
658 return 0;
659 }
660
661 if (!(irq_is(vdev, index) || is_irq_none(vdev)))
662 return -EINVAL;
663
664 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
665 int32_t *fds = data;
666 int ret;
667
668 if (vdev->irq_type == index)
669 return vfio_msi_set_block(vdev, start, count,
670 fds, msix);
671
672 ret = vfio_msi_enable(vdev, start + count, msix);
673 if (ret)
674 return ret;
675
676 ret = vfio_msi_set_block(vdev, start, count, fds, msix);
677 if (ret)
678 vfio_msi_disable(vdev, msix);
679
680 return ret;
681 }
682
683 if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
684 return -EINVAL;
685
686 for (i = start; i < start + count; i++) {
687 if (!vdev->ctx[i].trigger)
688 continue;
689 if (flags & VFIO_IRQ_SET_DATA_NONE) {
690 eventfd_signal(vdev->ctx[i].trigger, 1);
691 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
692 uint8_t *bools = data;
693 if (bools[i - start])
694 eventfd_signal(vdev->ctx[i].trigger, 1);
695 }
696 }
697 return 0;
698}
699
700int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
701 unsigned index, unsigned start, unsigned count,
702 void *data)
703{
704 int (*func)(struct vfio_pci_device *vdev, unsigned index,
705 unsigned start, unsigned count, uint32_t flags,
706 void *data) = NULL;
707
708 switch (index) {
709 case VFIO_PCI_INTX_IRQ_INDEX:
710 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
711 case VFIO_IRQ_SET_ACTION_MASK:
712 func = vfio_pci_set_intx_mask;
713 break;
714 case VFIO_IRQ_SET_ACTION_UNMASK:
715 func = vfio_pci_set_intx_unmask;
716 break;
717 case VFIO_IRQ_SET_ACTION_TRIGGER:
718 func = vfio_pci_set_intx_trigger;
719 break;
720 }
721 break;
722 case VFIO_PCI_MSI_IRQ_INDEX:
723 case VFIO_PCI_MSIX_IRQ_INDEX:
724 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
725 case VFIO_IRQ_SET_ACTION_MASK:
726 case VFIO_IRQ_SET_ACTION_UNMASK:
727 /* XXX Need masking support exported */
728 break;
729 case VFIO_IRQ_SET_ACTION_TRIGGER:
730 func = vfio_pci_set_msi_trigger;
731 break;
732 }
733 break;
734 }
735
736 if (!func)
737 return -ENOTTY;
738
739 return func(vdev, index, start, count, flags, data);
740}
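vfio_pci_set_irqs_ioctl() is the backend of the VFIO_DEVICE_SET_IRQS ioctl. A minimal userspace sketch of attaching an eventfd as the trigger for MSI vector 0, assuming the uapi from this series; device_fd is hypothetical:

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int enable_msi_vector0(int device_fd)
	{
		struct vfio_irq_set *set;
		int32_t efd = eventfd(0, EFD_CLOEXEC);
		int ret;

		if (efd < 0)
			return -1;

		set = malloc(sizeof(*set) + sizeof(int32_t));
		if (!set) {
			close(efd);
			return -1;
		}

		set->argsz = sizeof(*set) + sizeof(int32_t);
		/* DATA_EVENTFD + ACTION_TRIGGER lands in vfio_pci_set_msi_trigger() */
		set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
		set->index = VFIO_PCI_MSI_IRQ_INDEX;
		set->start = 0;
		set->count = 1;
		memcpy(set->data, &efd, sizeof(efd));

		ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
		free(set);
		return ret;	/* on success, a read() on efd blocks until an MSI fires */
	}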
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
new file mode 100644
index 000000000000..611827cba8cd
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -0,0 +1,91 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
3 * Author: Alex Williamson <alex.williamson@redhat.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * Derived from original vfio:
10 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
11 * Author: Tom Lyon, pugs@cisco.com
12 */
13
14#include <linux/mutex.h>
15#include <linux/pci.h>
16
17#ifndef VFIO_PCI_PRIVATE_H
18#define VFIO_PCI_PRIVATE_H
19
20#define VFIO_PCI_OFFSET_SHIFT 40
21
22#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT)
23#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
24#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
25
26struct vfio_pci_irq_ctx {
27 struct eventfd_ctx *trigger;
28 struct virqfd *unmask;
29 struct virqfd *mask;
30 char *name;
31 bool masked;
32};
33
34struct vfio_pci_device {
35 struct pci_dev *pdev;
36 void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
37 u8 *pci_config_map;
38 u8 *vconfig;
39 struct perm_bits *msi_perm;
40 spinlock_t irqlock;
41 struct mutex igate;
42 struct msix_entry *msix;
43 struct vfio_pci_irq_ctx *ctx;
44 int num_ctx;
45 int irq_type;
46 u8 msi_qmax;
47 u8 msix_bar;
48 u16 msix_size;
49 u32 msix_offset;
50 u32 rbar[7];
51 bool pci_2_3;
52 bool virq_disabled;
53 bool reset_works;
54 bool extended_caps;
55 bool bardirty;
56 struct pci_saved_state *pci_saved_state;
57 atomic_t refcnt;
58};
59
60#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
61#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
62#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
63#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
64#define irq_is(vdev, type) (vdev->irq_type == type)
65
66extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev);
67extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev);
68
69extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
70 uint32_t flags, unsigned index,
71 unsigned start, unsigned count, void *data);
72
73extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
74 char __user *buf, size_t count,
75 loff_t *ppos, bool iswrite);
76extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev,
77 char __user *buf, size_t count,
78 loff_t *ppos, bool iswrite);
79extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev,
80 char __user *buf, size_t count,
81 loff_t *ppos, bool iswrite);
82
83extern int vfio_pci_init_perm_bits(void);
84extern void vfio_pci_uninit_perm_bits(void);
85
86extern int vfio_pci_virqfd_init(void);
87extern void vfio_pci_virqfd_exit(void);
88
89extern int vfio_config_init(struct vfio_pci_device *vdev);
90extern void vfio_config_free(struct vfio_pci_device *vdev);
91#endif /* VFIO_PCI_PRIVATE_H */
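The 40-bit split above gives each region (config space, BARs, ROM) its own window on the device file descriptor. A worked example with assumed values: for region index 2 (BAR2) and register offset 0x10, VFIO_PCI_INDEX_TO_OFFSET(2) + 0x10 = (2ULL << 40) | 0x10 = 0x20000000010, from which VFIO_PCI_OFFSET_TO_INDEX() recovers 2 and masking with VFIO_PCI_OFFSET_MASK recovers 0x10. These macros are private to the driver, so userspace should take region offsets from VFIO_DEVICE_GET_REGION_INFO rather than assuming this encoding.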
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
new file mode 100644
index 000000000000..4362d9e7baa3
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -0,0 +1,269 @@
1/*
2 * VFIO PCI I/O Port & MMIO access
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/fs.h>
17#include <linux/pci.h>
18#include <linux/uaccess.h>
19#include <linux/io.h>
20
21#include "vfio_pci_private.h"
22
23/* I/O Port BAR access */
24ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf,
25 size_t count, loff_t *ppos, bool iswrite)
26{
27 struct pci_dev *pdev = vdev->pdev;
28 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
29 int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
30 void __iomem *io;
31 size_t done = 0;
32
33 if (!pci_resource_start(pdev, bar))
34 return -EINVAL;
35
36 if (pos + count > pci_resource_len(pdev, bar))
37 return -EINVAL;
38
39 if (!vdev->barmap[bar]) {
40 int ret;
41
42 ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
43 if (ret)
44 return ret;
45
46 vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
47
48 if (!vdev->barmap[bar]) {
49 pci_release_selected_regions(pdev, 1 << bar);
50 return -EINVAL;
51 }
52 }
53
54 io = vdev->barmap[bar];
55
56 while (count) {
57 int filled;
58
59 if (count >= 4 && !(pos % 4)) {
60 __le32 val;
61
62 if (iswrite) {
63 if (copy_from_user(&val, buf, 4))
64 return -EFAULT;
65
66 iowrite32(le32_to_cpu(val), io + pos);
67 } else {
68 val = cpu_to_le32(ioread32(io + pos));
69
70 if (copy_to_user(buf, &val, 4))
71 return -EFAULT;
72 }
73
74 filled = 4;
75
76 } else if ((pos % 2) == 0 && count >= 2) {
77 __le16 val;
78
79 if (iswrite) {
80 if (copy_from_user(&val, buf, 2))
81 return -EFAULT;
82
83 iowrite16(le16_to_cpu(val), io + pos);
84 } else {
85 val = cpu_to_le16(ioread16(io + pos));
86
87 if (copy_to_user(buf, &val, 2))
88 return -EFAULT;
89 }
90
91 filled = 2;
92 } else {
93 u8 val;
94
95 if (iswrite) {
96 if (copy_from_user(&val, buf, 1))
97 return -EFAULT;
98
99 iowrite8(val, io + pos);
100 } else {
101 val = ioread8(io + pos);
102
103 if (copy_to_user(buf, &val, 1))
104 return -EFAULT;
105 }
106
107 filled = 1;
108 }
109
110 count -= filled;
111 done += filled;
112 buf += filled;
113 pos += filled;
114 }
115
116 *ppos += done;
117
118 return done;
119}
120
121/*
122 * MMIO BAR access
123 * We also handle two excluded ranges here: if the user reads the ROM
124 * beyond what PCI reports as available, or touches the MSI-X table
125 * region, reads return 0xFF and writes are dropped.
126 */
127ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf,
128 size_t count, loff_t *ppos, bool iswrite)
129{
130 struct pci_dev *pdev = vdev->pdev;
131 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
132 int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
133 void __iomem *io;
134 resource_size_t end;
135 size_t done = 0;
136 size_t x_start = 0, x_end = 0; /* excluded range */
137
138 if (!pci_resource_start(pdev, bar))
139 return -EINVAL;
140
141 end = pci_resource_len(pdev, bar);
142
143 if (pos > end)
144 return -EINVAL;
145
146 if (pos == end)
147 return 0;
148
149 if (pos + count > end)
150 count = end - pos;
151
152 if (bar == PCI_ROM_RESOURCE) {
153 io = pci_map_rom(pdev, &x_start);
154 x_end = end;
155 } else {
156 if (!vdev->barmap[bar]) {
157 int ret;
158
159 ret = pci_request_selected_regions(pdev, 1 << bar,
160 "vfio");
161 if (ret)
162 return ret;
163
164 vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
165
166 if (!vdev->barmap[bar]) {
167 pci_release_selected_regions(pdev, 1 << bar);
168 return -EINVAL;
169 }
170 }
171
172 io = vdev->barmap[bar];
173
174 if (bar == vdev->msix_bar) {
175 x_start = vdev->msix_offset;
176 x_end = vdev->msix_offset + vdev->msix_size;
177 }
178 }
179
180 if (!io)
181 return -EINVAL;
182
183 while (count) {
184 size_t fillable, filled;
185
186 if (pos < x_start)
187 fillable = x_start - pos;
188 else if (pos >= x_end)
189 fillable = end - pos;
190 else
191 fillable = 0;
192
193 if (fillable >= 4 && !(pos % 4) && (count >= 4)) {
194 __le32 val;
195
196 if (iswrite) {
197 if (copy_from_user(&val, buf, 4))
198 goto out;
199
200 iowrite32(le32_to_cpu(val), io + pos);
201 } else {
202 val = cpu_to_le32(ioread32(io + pos));
203
204 if (copy_to_user(buf, &val, 4))
205 goto out;
206 }
207
208 filled = 4;
209 } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) {
210 __le16 val;
211
212 if (iswrite) {
213 if (copy_from_user(&val, buf, 2))
214 goto out;
215
216 iowrite16(le16_to_cpu(val), io + pos);
217 } else {
218 val = cpu_to_le16(ioread16(io + pos));
219
220 if (copy_to_user(buf, &val, 2))
221 goto out;
222 }
223
224 filled = 2;
225 } else if (fillable) {
226 u8 val;
227
228 if (iswrite) {
229 if (copy_from_user(&val, buf, 1))
230 goto out;
231
232 iowrite8(val, io + pos);
233 } else {
234 val = ioread8(io + pos);
235
236 if (copy_to_user(buf, &val, 1))
237 goto out;
238 }
239
240 filled = 1;
241 } else {
242 /* Excluded range: drop writes, fill reads with FF */
243 filled = min(count, (size_t)(x_end - pos));
244
245 if (!iswrite) {
246 char val = 0xFF;
247 size_t i;
248
249 for (i = 0; i < filled; i++) {
250 if (put_user(val, buf + i))
251 goto out;
252 }
253 }
255
256 count -= filled;
257 done += filled;
258 buf += filled;
259 pos += filled;
260 }
261
262 *ppos += done;
263
264out:
265 if (bar == PCI_ROM_RESOURCE)
266 pci_unmap_rom(pdev, io);
267
268 return count ? -EFAULT : done;
269}
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
new file mode 100644
index 000000000000..9591e2b509d7
--- /dev/null
+++ b/drivers/vfio/vfio.c
@@ -0,0 +1,1420 @@
1/*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/cdev.h>
17#include <linux/compat.h>
18#include <linux/device.h>
19#include <linux/file.h>
20#include <linux/anon_inodes.h>
21#include <linux/fs.h>
22#include <linux/idr.h>
23#include <linux/iommu.h>
24#include <linux/list.h>
25#include <linux/module.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/string.h>
30#include <linux/uaccess.h>
31#include <linux/vfio.h>
32#include <linux/wait.h>
33
34#define DRIVER_VERSION "0.3"
35#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
36#define DRIVER_DESC "VFIO - User Level meta-driver"
37
38static struct vfio {
39 struct class *class;
40 struct list_head iommu_drivers_list;
41 struct mutex iommu_drivers_lock;
42 struct list_head group_list;
43 struct idr group_idr;
44 struct mutex group_lock;
45 struct cdev group_cdev;
46 struct device *dev;
47 dev_t devt;
48 struct cdev cdev;
49 wait_queue_head_t release_q;
50} vfio;
51
52struct vfio_iommu_driver {
53 const struct vfio_iommu_driver_ops *ops;
54 struct list_head vfio_next;
55};
56
57struct vfio_container {
58 struct kref kref;
59 struct list_head group_list;
60 struct mutex group_lock;
61 struct vfio_iommu_driver *iommu_driver;
62 void *iommu_data;
63};
64
65struct vfio_group {
66 struct kref kref;
67 int minor;
68 atomic_t container_users;
69 struct iommu_group *iommu_group;
70 struct vfio_container *container;
71 struct list_head device_list;
72 struct mutex device_lock;
73 struct device *dev;
74 struct notifier_block nb;
75 struct list_head vfio_next;
76 struct list_head container_next;
77};
78
79struct vfio_device {
80 struct kref kref;
81 struct device *dev;
82 const struct vfio_device_ops *ops;
83 struct vfio_group *group;
84 struct list_head group_next;
85 void *device_data;
86};
87
88/**
89 * IOMMU driver registration
90 */
91int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
92{
93 struct vfio_iommu_driver *driver, *tmp;
94
95 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
96 if (!driver)
97 return -ENOMEM;
98
99 driver->ops = ops;
100
101 mutex_lock(&vfio.iommu_drivers_lock);
102
103 /* Check for duplicates */
104 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
105 if (tmp->ops == ops) {
106 mutex_unlock(&vfio.iommu_drivers_lock);
107 kfree(driver);
108 return -EINVAL;
109 }
110 }
111
112 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
113
114 mutex_unlock(&vfio.iommu_drivers_lock);
115
116 return 0;
117}
118EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
119
120void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
121{
122 struct vfio_iommu_driver *driver;
123
124 mutex_lock(&vfio.iommu_drivers_lock);
125 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
126 if (driver->ops == ops) {
127 list_del(&driver->vfio_next);
128 mutex_unlock(&vfio.iommu_drivers_lock);
129 kfree(driver);
130 return;
131 }
132 }
133 mutex_unlock(&vfio.iommu_drivers_lock);
134}
135EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
136
137/**
138 * Group minor allocation/free - both called with vfio.group_lock held
139 */
140static int vfio_alloc_group_minor(struct vfio_group *group)
141{
142 int ret, minor;
143
144again:
145 if (unlikely(idr_pre_get(&vfio.group_idr, GFP_KERNEL) == 0))
146 return -ENOMEM;
147
148 /* index 0 is used by /dev/vfio/vfio */
149 ret = idr_get_new_above(&vfio.group_idr, group, 1, &minor);
150 if (ret == -EAGAIN)
151 goto again;
152 if (ret || minor > MINORMASK) {
153 if (minor > MINORMASK)
154 idr_remove(&vfio.group_idr, minor);
155 return -ENOSPC;
156 }
157
158 return minor;
159}
160
161static void vfio_free_group_minor(int minor)
162{
163 idr_remove(&vfio.group_idr, minor);
164}
165
166static int vfio_iommu_group_notifier(struct notifier_block *nb,
167 unsigned long action, void *data);
168static void vfio_group_get(struct vfio_group *group);
169
170/**
171 * Container objects - containers are created when /dev/vfio/vfio is
172 * opened, but their lifecycle extends until the last user is done, so
173 * it's freed via kref. Must support container/group/device being
174 * closed in any order.
175 */
176static void vfio_container_get(struct vfio_container *container)
177{
178 kref_get(&container->kref);
179}
180
181static void vfio_container_release(struct kref *kref)
182{
183 struct vfio_container *container;
184 container = container_of(kref, struct vfio_container, kref);
185
186 kfree(container);
187}
188
189static void vfio_container_put(struct vfio_container *container)
190{
191 kref_put(&container->kref, vfio_container_release);
192}
193
194/**
195 * Group objects - create, release, get, put, search
196 */
197static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
198{
199 struct vfio_group *group, *tmp;
200 struct device *dev;
201 int ret, minor;
202
203 group = kzalloc(sizeof(*group), GFP_KERNEL);
204 if (!group)
205 return ERR_PTR(-ENOMEM);
206
207 kref_init(&group->kref);
208 INIT_LIST_HEAD(&group->device_list);
209 mutex_init(&group->device_lock);
210 atomic_set(&group->container_users, 0);
211 group->iommu_group = iommu_group;
212
213 group->nb.notifier_call = vfio_iommu_group_notifier;
214
215 /*
216 * blocking notifiers acquire a rwsem around registering and hold
217 * it around callback. Therefore, need to register outside of
218 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
219 * do anything unless it can find the group in vfio.group_list, so
220 * no harm in registering early.
221 */
222 ret = iommu_group_register_notifier(iommu_group, &group->nb);
223 if (ret) {
224 kfree(group);
225 return ERR_PTR(ret);
226 }
227
228 mutex_lock(&vfio.group_lock);
229
230 minor = vfio_alloc_group_minor(group);
231 if (minor < 0) {
232 mutex_unlock(&vfio.group_lock);
233 kfree(group);
234 return ERR_PTR(minor);
235 }
236
237 /* Did we race creating this group? */
238 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
239 if (tmp->iommu_group == iommu_group) {
240 vfio_group_get(tmp);
241 vfio_free_group_minor(minor);
242 mutex_unlock(&vfio.group_lock);
243 kfree(group);
244 return tmp;
245 }
246 }
247
248 dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor),
249 group, "%d", iommu_group_id(iommu_group));
250 if (IS_ERR(dev)) {
251 vfio_free_group_minor(minor);
252 mutex_unlock(&vfio.group_lock);
253 kfree(group);
254 return (struct vfio_group *)dev; /* ERR_PTR */
255 }
256
257 group->minor = minor;
258 group->dev = dev;
259
260 list_add(&group->vfio_next, &vfio.group_list);
261
262 mutex_unlock(&vfio.group_lock);
263
264 return group;
265}
266
267static void vfio_group_release(struct kref *kref)
268{
269 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
270
271 WARN_ON(!list_empty(&group->device_list));
272
273 device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor));
274 list_del(&group->vfio_next);
275 vfio_free_group_minor(group->minor);
276
277 mutex_unlock(&vfio.group_lock);
278
279 /*
280 * Unregister outside of lock. A spurious callback is harmless now
281 * that the group is no longer in vfio.group_list.
282 */
283 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
284
285 kfree(group);
286}
287
288static void vfio_group_put(struct vfio_group *group)
289{
290 mutex_lock(&vfio.group_lock);
291 /*
292 * Release needs to unlock to unregister the notifier, so only
293 * unlock if not released.
294 */
295 if (!kref_put(&group->kref, vfio_group_release))
296 mutex_unlock(&vfio.group_lock);
297}
298
299/* Assume group_lock or group reference is held */
300static void vfio_group_get(struct vfio_group *group)
301{
302 kref_get(&group->kref);
303}
304
305/*
306 * Not really a try as we will sleep for mutex, but we need to make
307 * sure the group pointer is valid under lock and get a reference.
308 */
309static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
310{
311 struct vfio_group *target = group;
312
313 mutex_lock(&vfio.group_lock);
314 list_for_each_entry(group, &vfio.group_list, vfio_next) {
315 if (group == target) {
316 vfio_group_get(group);
317 mutex_unlock(&vfio.group_lock);
318 return group;
319 }
320 }
321 mutex_unlock(&vfio.group_lock);
322
323 return NULL;
324}
325
326static
327struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
328{
329 struct vfio_group *group;
330
331 mutex_lock(&vfio.group_lock);
332 list_for_each_entry(group, &vfio.group_list, vfio_next) {
333 if (group->iommu_group == iommu_group) {
334 vfio_group_get(group);
335 mutex_unlock(&vfio.group_lock);
336 return group;
337 }
338 }
339 mutex_unlock(&vfio.group_lock);
340
341 return NULL;
342}
343
344static struct vfio_group *vfio_group_get_from_minor(int minor)
345{
346 struct vfio_group *group;
347
348 mutex_lock(&vfio.group_lock);
349 group = idr_find(&vfio.group_idr, minor);
350 if (!group) {
351 mutex_unlock(&vfio.group_lock);
352 return NULL;
353 }
354 vfio_group_get(group);
355 mutex_unlock(&vfio.group_lock);
356
357 return group;
358}
359
360/**
361 * Device objects - create, release, get, put, search
362 */
363static
364struct vfio_device *vfio_group_create_device(struct vfio_group *group,
365 struct device *dev,
366 const struct vfio_device_ops *ops,
367 void *device_data)
368{
369 struct vfio_device *device;
370 int ret;
371
372 device = kzalloc(sizeof(*device), GFP_KERNEL);
373 if (!device)
374 return ERR_PTR(-ENOMEM);
375
376 kref_init(&device->kref);
377 device->dev = dev;
378 device->group = group;
379 device->ops = ops;
380 device->device_data = device_data;
381
382 ret = dev_set_drvdata(dev, device);
383 if (ret) {
384 kfree(device);
385 return ERR_PTR(ret);
386 }
387
388 /* No need to get group_lock, caller has group reference */
389 vfio_group_get(group);
390
391 mutex_lock(&group->device_lock);
392 list_add(&device->group_next, &group->device_list);
393 mutex_unlock(&group->device_lock);
394
395 return device;
396}
397
398static void vfio_device_release(struct kref *kref)
399{
400 struct vfio_device *device = container_of(kref,
401 struct vfio_device, kref);
402 struct vfio_group *group = device->group;
403
404 mutex_lock(&group->device_lock);
405 list_del(&device->group_next);
406 mutex_unlock(&group->device_lock);
407
408 dev_set_drvdata(device->dev, NULL);
409
410 kfree(device);
411
412 /* vfio_del_group_dev may be waiting for this device */
413 wake_up(&vfio.release_q);
414}
415
416/* Device reference always implies a group reference */
417static void vfio_device_put(struct vfio_device *device)
418{
419 kref_put(&device->kref, vfio_device_release);
420 vfio_group_put(device->group);
421}
422
423static void vfio_device_get(struct vfio_device *device)
424{
425 vfio_group_get(device->group);
426 kref_get(&device->kref);
427}
428
429static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
430 struct device *dev)
431{
432 struct vfio_device *device;
433
434 mutex_lock(&group->device_lock);
435 list_for_each_entry(device, &group->device_list, group_next) {
436 if (device->dev == dev) {
437 vfio_device_get(device);
438 mutex_unlock(&group->device_lock);
439 return device;
440 }
441 }
442 mutex_unlock(&group->device_lock);
443 return NULL;
444}
445
446/*
447 * Whitelist some drivers that we know are safe (no dma) or just sit on
448 * a device. It's not always practical to leave a device within a group
449 * driverless as it could get re-bound to something unsafe.
450 */
451static const char * const vfio_driver_whitelist[] = { "pci-stub" };
452
453static bool vfio_whitelisted_driver(struct device_driver *drv)
454{
455 int i;
456
457 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
458 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
459 return true;
460 }
461
462 return false;
463}
464
465/*
466 * A vfio group is viable for use by userspace if all devices are either
467 * driver-less or bound to a vfio or whitelisted driver. We test the
468 * latter by the existence of a struct vfio_device matching the dev.
469 */
470static int vfio_dev_viable(struct device *dev, void *data)
471{
472 struct vfio_group *group = data;
473 struct vfio_device *device;
474
475 if (!dev->driver || vfio_whitelisted_driver(dev->driver))
476 return 0;
477
478 device = vfio_group_get_device(group, dev);
479 if (device) {
480 vfio_device_put(device);
481 return 0;
482 }
483
484 return -EINVAL;
485}
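
This viability test backs the VFIO_GROUP_FLAGS_VIABLE bit reported by the VFIO_GROUP_GET_STATUS ioctl handled further down. A minimal userspace sketch of checking it; the helper name and the example group path are illustrative only and not part of this patch:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns 1 if viable, 0 if not, negative errno on failure. */
static int group_is_viable(const char *group_path)	/* e.g. "/dev/vfio/26" */
{
	struct vfio_group_status status = { .argsz = sizeof(status) };
	int ret, fd;

	/* Opening the group fails with -EBUSY if it already has a container */
	fd = open(group_path, O_RDWR);
	if (fd < 0)
		return -errno;

	ret = ioctl(fd, VFIO_GROUP_GET_STATUS, &status);
	if (ret)
		ret = -errno;
	else
		ret = !!(status.flags & VFIO_GROUP_FLAGS_VIABLE);

	close(fd);
	return ret;
}
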
486
487/**
488 * Async device support
489 */
490static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
491{
492 struct vfio_device *device;
493
494 /* Do we already know about it? We shouldn't */
495 device = vfio_group_get_device(group, dev);
496 if (WARN_ON_ONCE(device)) {
497 vfio_device_put(device);
498 return 0;
499 }
500
501 /* Nothing to do for idle groups */
502 if (!atomic_read(&group->container_users))
503 return 0;
504
505 /* TODO Prevent device auto probing */
506 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
507 iommu_group_id(group->iommu_group));
508
509 return 0;
510}
511
512static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev)
513{
514 struct vfio_device *device;
515
516 /*
517 * Expect to fall out here. If a device was in use, it would
518 * have been bound to a vfio sub-driver, which would have blocked
519 * in .remove at vfio_del_group_dev. Sanity check that we no
520 * longer track the device, so it's safe to remove.
521 */
522 device = vfio_group_get_device(group, dev);
523 if (likely(!device))
524 return 0;
525
526 WARN(1, "Device %s removed from live group %d!\n", dev_name(dev),
527 iommu_group_id(group->iommu_group));
528
529 vfio_device_put(device);
530 return 0;
531}
532
533static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
534{
535 /* We don't care what happens when the group isn't in use */
536 if (!atomic_read(&group->container_users))
537 return 0;
538
539 return vfio_dev_viable(dev, group);
540}
541
542static int vfio_iommu_group_notifier(struct notifier_block *nb,
543 unsigned long action, void *data)
544{
545 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
546 struct device *dev = data;
547
548 /*
549 * Need to go through a group_lock lookup to get a reference or
550 * we risk racing a group being removed. Leave a WARN_ON for
551 * debugging, but if the group no longer exists, a spurious notify
552 * is harmless.
553 */
554 group = vfio_group_try_get(group);
555 if (WARN_ON(!group))
556 return NOTIFY_OK;
557
558 switch (action) {
559 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
560 vfio_group_nb_add_dev(group, dev);
561 break;
562 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
563 vfio_group_nb_del_dev(group, dev);
564 break;
565 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
566 pr_debug("%s: Device %s, group %d binding to driver\n",
567 __func__, dev_name(dev),
568 iommu_group_id(group->iommu_group));
569 break;
570 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
571 pr_debug("%s: Device %s, group %d bound to driver %s\n",
572 __func__, dev_name(dev),
573 iommu_group_id(group->iommu_group), dev->driver->name);
574 BUG_ON(vfio_group_nb_verify(group, dev));
575 break;
576 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
577 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
578 __func__, dev_name(dev),
579 iommu_group_id(group->iommu_group), dev->driver->name);
580 break;
581 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
582 pr_debug("%s: Device %s, group %d unbound from driver\n",
583 __func__, dev_name(dev),
584 iommu_group_id(group->iommu_group));
585 /*
586 * XXX An unbound device in a live group is ok, but we'd
587 * really like to avoid the above BUG_ON by preventing other
588 * drivers from binding to it. Once that occurs, we have to
589 * stop the system to maintain isolation. At a minimum, we'd
590 * want a toggle to disable driver auto probe for this device.
591 */
592 break;
593 }
594
595 vfio_group_put(group);
596 return NOTIFY_OK;
597}
598
599/**
600 * VFIO driver API
601 */
602int vfio_add_group_dev(struct device *dev,
603 const struct vfio_device_ops *ops, void *device_data)
604{
605 struct iommu_group *iommu_group;
606 struct vfio_group *group;
607 struct vfio_device *device;
608
609 iommu_group = iommu_group_get(dev);
610 if (!iommu_group)
611 return -EINVAL;
612
613 group = vfio_group_get_from_iommu(iommu_group);
614 if (!group) {
615 group = vfio_create_group(iommu_group);
616 if (IS_ERR(group)) {
617 iommu_group_put(iommu_group);
618 return PTR_ERR(group);
619 }
620 }
621
622 device = vfio_group_get_device(group, dev);
623 if (device) {
624 WARN(1, "Device %s already exists on group %d\n",
625 dev_name(dev), iommu_group_id(iommu_group));
626 vfio_device_put(device);
627 vfio_group_put(group);
628 iommu_group_put(iommu_group);
629 return -EBUSY;
630 }
631
632 device = vfio_group_create_device(group, dev, ops, device_data);
633 if (IS_ERR(device)) {
634 vfio_group_put(group);
635 iommu_group_put(iommu_group);
636 return PTR_ERR(device);
637 }
638
639 /*
640 * Added device holds reference to iommu_group and vfio_device
641 * (which in turn holds reference to vfio_group). Drop extra
642 * group reference used while acquiring device.
643 */
644 vfio_group_put(group);
645
646 return 0;
647}
648EXPORT_SYMBOL_GPL(vfio_add_group_dev);
649
650/* Test whether a struct device is present in our tracking */
651static bool vfio_dev_present(struct device *dev)
652{
653 struct iommu_group *iommu_group;
654 struct vfio_group *group;
655 struct vfio_device *device;
656
657 iommu_group = iommu_group_get(dev);
658 if (!iommu_group)
659 return false;
660
661 group = vfio_group_get_from_iommu(iommu_group);
662 if (!group) {
663 iommu_group_put(iommu_group);
664 return false;
665 }
666
667 device = vfio_group_get_device(group, dev);
668 if (!device) {
669 vfio_group_put(group);
670 iommu_group_put(iommu_group);
671 return false;
672 }
673
674 vfio_device_put(device);
675 vfio_group_put(group);
676 iommu_group_put(iommu_group);
677 return true;
678}
679
680/*
681 * Decrement the device reference count and wait for the device to be
682 * removed. Open file descriptors for the device... */
683void *vfio_del_group_dev(struct device *dev)
684{
685 struct vfio_device *device = dev_get_drvdata(dev);
686 struct vfio_group *group = device->group;
687 struct iommu_group *iommu_group = group->iommu_group;
688 void *device_data = device->device_data;
689
690 vfio_device_put(device);
691
692 /* TODO send a signal to encourage this to be released */
693 wait_event(vfio.release_q, !vfio_dev_present(dev));
694
695 iommu_group_put(iommu_group);
696
697 return device_data;
698}
699EXPORT_SYMBOL_GPL(vfio_del_group_dev);
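
Taken together, vfio_add_group_dev() and vfio_del_group_dev() are the entire registration API a VFIO bus driver needs. A hedged sketch of how a PCI bus driver might wire them into its probe/remove path follows; my_device, my_vfio_ops and the stub callbacks are placeholders for illustration and are not part of this patch (vfio-pci in this same series is the real in-tree user):

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/vfio.h>

struct my_device {
	struct pci_dev *pdev;
};

static int my_open(void *device_data)
{
	return 0;	/* a real driver would enable the device here */
}

static void my_release(void *device_data)
{
	/* a real driver would quiesce and disable the device here */
}

static const struct vfio_device_ops my_vfio_ops = {
	.name		= "my-vfio-driver",
	.open		= my_open,
	.release	= my_release,
};

static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct my_device *mydev;
	int ret;

	mydev = kzalloc(sizeof(*mydev), GFP_KERNEL);
	if (!mydev)
		return -ENOMEM;
	mydev->pdev = pdev;

	/* Fails with -EINVAL if the device has no iommu_group */
	ret = vfio_add_group_dev(&pdev->dev, &my_vfio_ops, mydev);
	if (ret)
		kfree(mydev);
	return ret;
}

static void my_remove(struct pci_dev *pdev)
{
	/* Blocks until all open file descriptors for the device are released */
	struct my_device *mydev = vfio_del_group_dev(&pdev->dev);

	kfree(mydev);
}
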
700
701/**
702 * VFIO base fd, /dev/vfio/vfio
703 */
704static long vfio_ioctl_check_extension(struct vfio_container *container,
705 unsigned long arg)
706{
707 struct vfio_iommu_driver *driver = container->iommu_driver;
708 long ret = 0;
709
710 switch (arg) {
711 /* No base extensions yet */
712 default:
713 /*
714 * If no driver is set, poll all registered drivers for
715 * extensions and return the first positive result. If
716 * a driver is already set, further queries will be passed
717 * only to that driver.
718 */
719 if (!driver) {
720 mutex_lock(&vfio.iommu_drivers_lock);
721 list_for_each_entry(driver, &vfio.iommu_drivers_list,
722 vfio_next) {
723 if (!try_module_get(driver->ops->owner))
724 continue;
725
726 ret = driver->ops->ioctl(NULL,
727 VFIO_CHECK_EXTENSION,
728 arg);
729 module_put(driver->ops->owner);
730 if (ret > 0)
731 break;
732 }
733 mutex_unlock(&vfio.iommu_drivers_lock);
734 } else
735 ret = driver->ops->ioctl(container->iommu_data,
736 VFIO_CHECK_EXTENSION, arg);
737 }
738
739 return ret;
740}
741
742/* hold container->group_lock */
743static int __vfio_container_attach_groups(struct vfio_container *container,
744 struct vfio_iommu_driver *driver,
745 void *data)
746{
747 struct vfio_group *group;
748 int ret = -ENODEV;
749
750 list_for_each_entry(group, &container->group_list, container_next) {
751 ret = driver->ops->attach_group(data, group->iommu_group);
752 if (ret)
753 goto unwind;
754 }
755
756 return ret;
757
758unwind:
759 list_for_each_entry_continue_reverse(group, &container->group_list,
760 container_next) {
761 driver->ops->detach_group(data, group->iommu_group);
762 }
763
764 return ret;
765}
766
767static long vfio_ioctl_set_iommu(struct vfio_container *container,
768 unsigned long arg)
769{
770 struct vfio_iommu_driver *driver;
771 long ret = -ENODEV;
772
773 mutex_lock(&container->group_lock);
774
775 /*
776 * The container is designed to be an unprivileged interface while
777 * the group can be assigned to specific users. Therefore, only by
778 * adding a group to a container does the user get the privilege of
779 * enabling the iommu, which may allocate finite resources. There
780 * is no unset_iommu, but by removing all the groups from a container,
781 * the container is deprivileged and returns to an unset state.
782 */
783 if (list_empty(&container->group_list) || container->iommu_driver) {
784 mutex_unlock(&container->group_lock);
785 return -EINVAL;
786 }
787
788 mutex_lock(&vfio.iommu_drivers_lock);
789 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
790 void *data;
791
792 if (!try_module_get(driver->ops->owner))
793 continue;
794
795 /*
796 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
797 * so test which iommu driver reported support for this
798 * extension and call open on them. We also pass them the
799 * magic, allowing a single driver to support multiple
800 * interfaces if they'd like.
801 */
802 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
803 module_put(driver->ops->owner);
804 continue;
805 }
806
807 /* module reference holds the driver we're working on */
808 mutex_unlock(&vfio.iommu_drivers_lock);
809
810 data = driver->ops->open(arg);
811 if (IS_ERR(data)) {
812 ret = PTR_ERR(data);
813 module_put(driver->ops->owner);
814 goto skip_drivers_unlock;
815 }
816
817 ret = __vfio_container_attach_groups(container, driver, data);
818 if (!ret) {
819 container->iommu_driver = driver;
820 container->iommu_data = data;
821 } else {
822 driver->ops->release(data);
823 module_put(driver->ops->owner);
824 }
825
826 goto skip_drivers_unlock;
827 }
828
829 mutex_unlock(&vfio.iommu_drivers_lock);
830skip_drivers_unlock:
831 mutex_unlock(&container->group_lock);
832
833 return ret;
834}
835
836static long vfio_fops_unl_ioctl(struct file *filep,
837 unsigned int cmd, unsigned long arg)
838{
839 struct vfio_container *container = filep->private_data;
840 struct vfio_iommu_driver *driver;
841 void *data;
842 long ret = -EINVAL;
843
844 if (!container)
845 return ret;
846
847 driver = container->iommu_driver;
848 data = container->iommu_data;
849
850 switch (cmd) {
851 case VFIO_GET_API_VERSION:
852 ret = VFIO_API_VERSION;
853 break;
854 case VFIO_CHECK_EXTENSION:
855 ret = vfio_ioctl_check_extension(container, arg);
856 break;
857 case VFIO_SET_IOMMU:
858 ret = vfio_ioctl_set_iommu(container, arg);
859 break;
860 default:
861 if (driver) /* passthrough all unrecognized ioctls */
862 ret = driver->ops->ioctl(data, cmd, arg);
863 }
864
865 return ret;
866}
867
868#ifdef CONFIG_COMPAT
869static long vfio_fops_compat_ioctl(struct file *filep,
870 unsigned int cmd, unsigned long arg)
871{
872 arg = (unsigned long)compat_ptr(arg);
873 return vfio_fops_unl_ioctl(filep, cmd, arg);
874}
875#endif /* CONFIG_COMPAT */
876
877static int vfio_fops_open(struct inode *inode, struct file *filep)
878{
879 struct vfio_container *container;
880
881 container = kzalloc(sizeof(*container), GFP_KERNEL);
882 if (!container)
883 return -ENOMEM;
884
885 INIT_LIST_HEAD(&container->group_list);
886 mutex_init(&container->group_lock);
887 kref_init(&container->kref);
888
889 filep->private_data = container;
890
891 return 0;
892}
893
894static int vfio_fops_release(struct inode *inode, struct file *filep)
895{
896 struct vfio_container *container = filep->private_data;
897
898 filep->private_data = NULL;
899
900 vfio_container_put(container);
901
902 return 0;
903}
904
905/*
906 * Once an iommu driver is set, we optionally pass read/write/mmap
907 * on to the driver, allowing management interfaces beyond ioctl.
908 */
909static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
910 size_t count, loff_t *ppos)
911{
912 struct vfio_container *container = filep->private_data;
913 struct vfio_iommu_driver *driver = container->iommu_driver;
914
915 if (unlikely(!driver || !driver->ops->read))
916 return -EINVAL;
917
918 return driver->ops->read(container->iommu_data, buf, count, ppos);
919}
920
921static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
922 size_t count, loff_t *ppos)
923{
924 struct vfio_container *container = filep->private_data;
925 struct vfio_iommu_driver *driver = container->iommu_driver;
926
927 if (unlikely(!driver || !driver->ops->write))
928 return -EINVAL;
929
930 return driver->ops->write(container->iommu_data, buf, count, ppos);
931}
932
933static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
934{
935 struct vfio_container *container = filep->private_data;
936 struct vfio_iommu_driver *driver = container->iommu_driver;
937
938 if (unlikely(!driver || !driver->ops->mmap))
939 return -EINVAL;
940
941 return driver->ops->mmap(container->iommu_data, vma);
942}
943
944static const struct file_operations vfio_fops = {
945 .owner = THIS_MODULE,
946 .open = vfio_fops_open,
947 .release = vfio_fops_release,
948 .read = vfio_fops_read,
949 .write = vfio_fops_write,
950 .unlocked_ioctl = vfio_fops_unl_ioctl,
951#ifdef CONFIG_COMPAT
952 .compat_ioctl = vfio_fops_compat_ioctl,
953#endif
954 .mmap = vfio_fops_mmap,
955};
956
957/**
958 * VFIO Group fd, /dev/vfio/$GROUP
959 */
960static void __vfio_group_unset_container(struct vfio_group *group)
961{
962 struct vfio_container *container = group->container;
963 struct vfio_iommu_driver *driver;
964
965 mutex_lock(&container->group_lock);
966
967 driver = container->iommu_driver;
968 if (driver)
969 driver->ops->detach_group(container->iommu_data,
970 group->iommu_group);
971
972 group->container = NULL;
973 list_del(&group->container_next);
974
975 /* Detaching the last group deprivileges a container, remove iommu */
976 if (driver && list_empty(&container->group_list)) {
977 driver->ops->release(container->iommu_data);
978 module_put(driver->ops->owner);
979 container->iommu_driver = NULL;
980 container->iommu_data = NULL;
981 }
982
983 mutex_unlock(&container->group_lock);
984
985 vfio_container_put(container);
986}
987
988/*
989 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
990 * if there was no container to unset. Since the ioctl is called on
991 * the group, we know the group still exists, therefore the only valid
992 * transition here is 1->0.
993 */
994static int vfio_group_unset_container(struct vfio_group *group)
995{
996 int users = atomic_cmpxchg(&group->container_users, 1, 0);
997
998 if (!users)
999 return -EINVAL;
1000 if (users != 1)
1001 return -EBUSY;
1002
1003 __vfio_group_unset_container(group);
1004
1005 return 0;
1006}
1007
1008/*
1009 * When removing container users, anything that removes the last user
1010 * implicitly removes the group from the container. That is, if the
1011 * group file descriptor is closed, as well as any device file descriptors,
1012 * the group is free.
1013 */
1014static void vfio_group_try_dissolve_container(struct vfio_group *group)
1015{
1016 if (0 == atomic_dec_if_positive(&group->container_users))
1017 __vfio_group_unset_container(group);
1018}
1019
1020static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1021{
1022 struct file *filep;
1023 struct vfio_container *container;
1024 struct vfio_iommu_driver *driver;
1025 int ret = 0;
1026
1027 if (atomic_read(&group->container_users))
1028 return -EINVAL;
1029
1030 filep = fget(container_fd);
1031 if (!filep)
1032 return -EBADF;
1033
1034 /* Sanity check, is this really our fd? */
1035 if (filep->f_op != &vfio_fops) {
1036 fput(filep);
1037 return -EINVAL;
1038 }
1039
1040 container = filep->private_data;
1041 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1042
1043 mutex_lock(&container->group_lock);
1044
1045 driver = container->iommu_driver;
1046 if (driver) {
1047 ret = driver->ops->attach_group(container->iommu_data,
1048 group->iommu_group);
1049 if (ret)
1050 goto unlock_out;
1051 }
1052
1053 group->container = container;
1054 list_add(&group->container_next, &container->group_list);
1055
1056 /* Get a reference on the container and mark a user within the group */
1057 vfio_container_get(container);
1058 atomic_inc(&group->container_users);
1059
1060unlock_out:
1061 mutex_unlock(&container->group_lock);
1062 fput(filep);
1063
1064 return ret;
1065}
1066
1067static bool vfio_group_viable(struct vfio_group *group)
1068{
1069 return (iommu_group_for_each_dev(group->iommu_group,
1070 group, vfio_dev_viable) == 0);
1071}
1072
1073static const struct file_operations vfio_device_fops;
1074
1075static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1076{
1077 struct vfio_device *device;
1078 struct file *filep;
1079 int ret = -ENODEV;
1080
1081 if (0 == atomic_read(&group->container_users) ||
1082 !group->container->iommu_driver || !vfio_group_viable(group))
1083 return -EINVAL;
1084
1085 mutex_lock(&group->device_lock);
1086 list_for_each_entry(device, &group->device_list, group_next) {
1087 if (strcmp(dev_name(device->dev), buf))
1088 continue;
1089
1090 ret = device->ops->open(device->device_data);
1091 if (ret)
1092 break;
1093 /*
1094 * We can't use anon_inode_getfd() because we need to modify
1095 * the f_mode flags directly to allow more than just ioctls
1096 */
1097 ret = get_unused_fd();
1098 if (ret < 0) {
1099 device->ops->release(device->device_data);
1100 break;
1101 }
1102
1103 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1104 device, O_RDWR);
1105 if (IS_ERR(filep)) {
1106 put_unused_fd(ret);
1107 ret = PTR_ERR(filep);
1108 device->ops->release(device->device_data);
1109 break;
1110 }
1111
1112 /*
1113 * TODO: add an anon_inode interface to do this.
1114 * Appears to be missing by lack of need rather than
1115 * explicitly prevented. Now there's need.
1116 */
1117 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1118
1119 fd_install(ret, filep);
1120
1121 vfio_device_get(device);
1122 atomic_inc(&group->container_users);
1123 break;
1124 }
1125 mutex_unlock(&group->device_lock);
1126
1127 return ret;
1128}
1129
1130static long vfio_group_fops_unl_ioctl(struct file *filep,
1131 unsigned int cmd, unsigned long arg)
1132{
1133 struct vfio_group *group = filep->private_data;
1134 long ret = -ENOTTY;
1135
1136 switch (cmd) {
1137 case VFIO_GROUP_GET_STATUS:
1138 {
1139 struct vfio_group_status status;
1140 unsigned long minsz;
1141
1142 minsz = offsetofend(struct vfio_group_status, flags);
1143
1144 if (copy_from_user(&status, (void __user *)arg, minsz))
1145 return -EFAULT;
1146
1147 if (status.argsz < minsz)
1148 return -EINVAL;
1149
1150 status.flags = 0;
1151
1152 if (vfio_group_viable(group))
1153 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1154
1155 if (group->container)
1156 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1157
1158 if (copy_to_user((void __user *)arg, &status, minsz))
1159 return -EFAULT;
1160
1161 ret = 0;
1162 break;
1163 }
1164 case VFIO_GROUP_SET_CONTAINER:
1165 {
1166 int fd;
1167
1168 if (get_user(fd, (int __user *)arg))
1169 return -EFAULT;
1170
1171 if (fd < 0)
1172 return -EINVAL;
1173
1174 ret = vfio_group_set_container(group, fd);
1175 break;
1176 }
1177 case VFIO_GROUP_UNSET_CONTAINER:
1178 ret = vfio_group_unset_container(group);
1179 break;
1180 case VFIO_GROUP_GET_DEVICE_FD:
1181 {
1182 char *buf;
1183
1184 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1185 if (IS_ERR(buf))
1186 return PTR_ERR(buf);
1187
1188 ret = vfio_group_get_device_fd(group, buf);
1189 kfree(buf);
1190 break;
1191 }
1192 }
1193
1194 return ret;
1195}
1196
1197#ifdef CONFIG_COMPAT
1198static long vfio_group_fops_compat_ioctl(struct file *filep,
1199 unsigned int cmd, unsigned long arg)
1200{
1201 arg = (unsigned long)compat_ptr(arg);
1202 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1203}
1204#endif /* CONFIG_COMPAT */
1205
1206static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1207{
1208 struct vfio_group *group;
1209
1210 group = vfio_group_get_from_minor(iminor(inode));
1211 if (!group)
1212 return -ENODEV;
1213
1214 if (group->container) {
1215 vfio_group_put(group);
1216 return -EBUSY;
1217 }
1218
1219 filep->private_data = group;
1220
1221 return 0;
1222}
1223
1224static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1225{
1226 struct vfio_group *group = filep->private_data;
1227
1228 filep->private_data = NULL;
1229
1230 vfio_group_try_dissolve_container(group);
1231
1232 vfio_group_put(group);
1233
1234 return 0;
1235}
1236
1237static const struct file_operations vfio_group_fops = {
1238 .owner = THIS_MODULE,
1239 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1240#ifdef CONFIG_COMPAT
1241 .compat_ioctl = vfio_group_fops_compat_ioctl,
1242#endif
1243 .open = vfio_group_fops_open,
1244 .release = vfio_group_fops_release,
1245};
1246
1247/**
1248 * VFIO Device fd
1249 */
1250static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1251{
1252 struct vfio_device *device = filep->private_data;
1253
1254 device->ops->release(device->device_data);
1255
1256 vfio_group_try_dissolve_container(device->group);
1257
1258 vfio_device_put(device);
1259
1260 return 0;
1261}
1262
1263static long vfio_device_fops_unl_ioctl(struct file *filep,
1264 unsigned int cmd, unsigned long arg)
1265{
1266 struct vfio_device *device = filep->private_data;
1267
1268 if (unlikely(!device->ops->ioctl))
1269 return -EINVAL;
1270
1271 return device->ops->ioctl(device->device_data, cmd, arg);
1272}
1273
1274static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1275 size_t count, loff_t *ppos)
1276{
1277 struct vfio_device *device = filep->private_data;
1278
1279 if (unlikely(!device->ops->read))
1280 return -EINVAL;
1281
1282 return device->ops->read(device->device_data, buf, count, ppos);
1283}
1284
1285static ssize_t vfio_device_fops_write(struct file *filep,
1286 const char __user *buf,
1287 size_t count, loff_t *ppos)
1288{
1289 struct vfio_device *device = filep->private_data;
1290
1291 if (unlikely(!device->ops->write))
1292 return -EINVAL;
1293
1294 return device->ops->write(device->device_data, buf, count, ppos);
1295}
1296
1297static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1298{
1299 struct vfio_device *device = filep->private_data;
1300
1301 if (unlikely(!device->ops->mmap))
1302 return -EINVAL;
1303
1304 return device->ops->mmap(device->device_data, vma);
1305}
1306
1307#ifdef CONFIG_COMPAT
1308static long vfio_device_fops_compat_ioctl(struct file *filep,
1309 unsigned int cmd, unsigned long arg)
1310{
1311 arg = (unsigned long)compat_ptr(arg);
1312 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1313}
1314#endif /* CONFIG_COMPAT */
1315
1316static const struct file_operations vfio_device_fops = {
1317 .owner = THIS_MODULE,
1318 .release = vfio_device_fops_release,
1319 .read = vfio_device_fops_read,
1320 .write = vfio_device_fops_write,
1321 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1322#ifdef CONFIG_COMPAT
1323 .compat_ioctl = vfio_device_fops_compat_ioctl,
1324#endif
1325 .mmap = vfio_device_fops_mmap,
1326};
1327
1328/**
1329 * Module/class support
1330 */
1331static char *vfio_devnode(struct device *dev, umode_t *mode)
1332{
1333 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1334}
1335
1336static int __init vfio_init(void)
1337{
1338 int ret;
1339
1340 idr_init(&vfio.group_idr);
1341 mutex_init(&vfio.group_lock);
1342 mutex_init(&vfio.iommu_drivers_lock);
1343 INIT_LIST_HEAD(&vfio.group_list);
1344 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
1345 init_waitqueue_head(&vfio.release_q);
1346
1347 vfio.class = class_create(THIS_MODULE, "vfio");
1348 if (IS_ERR(vfio.class)) {
1349 ret = PTR_ERR(vfio.class);
1350 goto err_class;
1351 }
1352
1353 vfio.class->devnode = vfio_devnode;
1354
1355 ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
1356 if (ret)
1357 goto err_base_chrdev;
1358
1359 cdev_init(&vfio.cdev, &vfio_fops);
1360 ret = cdev_add(&vfio.cdev, vfio.devt, 1);
1361 if (ret)
1362 goto err_base_cdev;
1363
1364 vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio");
1365 if (IS_ERR(vfio.dev)) {
1366 ret = PTR_ERR(vfio.dev);
1367 goto err_base_dev;
1368 }
1369
1370 /* /dev/vfio/$GROUP */
1371 cdev_init(&vfio.group_cdev, &vfio_group_fops);
1372 ret = cdev_add(&vfio.group_cdev,
1373 MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1);
1374 if (ret)
1375 goto err_groups_cdev;
1376
1377 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1378
1379 /*
1380 * Attempt to load known iommu-drivers. This gives us a working
1381 * environment without the user needing to explicitly load iommu
1382 * drivers.
1383 */
1384 request_module_nowait("vfio_iommu_type1");
1385
1386 return 0;
1387
1388err_groups_cdev:
1389 device_destroy(vfio.class, vfio.devt);
1390err_base_dev:
1391 cdev_del(&vfio.cdev);
1392err_base_cdev:
1393 unregister_chrdev_region(vfio.devt, MINORMASK);
1394err_base_chrdev:
1395 class_destroy(vfio.class);
1396 vfio.class = NULL;
1397err_class:
1398 return ret;
1399}
1400
1401static void __exit vfio_cleanup(void)
1402{
1403 WARN_ON(!list_empty(&vfio.group_list));
1404
1405 idr_destroy(&vfio.group_idr);
1406 cdev_del(&vfio.group_cdev);
1407 device_destroy(vfio.class, vfio.devt);
1408 cdev_del(&vfio.cdev);
1409 unregister_chrdev_region(vfio.devt, MINORMASK);
1410 class_destroy(vfio.class);
1411 vfio.class = NULL;
1412}
1413
1414module_init(vfio_init);
1415module_exit(vfio_cleanup);
1416
1417MODULE_VERSION(DRIVER_VERSION);
1418MODULE_LICENSE("GPL v2");
1419MODULE_AUTHOR(DRIVER_AUTHOR);
1420MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
new file mode 100644
index 000000000000..6f3fbc48a6c7
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -0,0 +1,753 @@
1/*
2 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 *
15 * We arbitrarily define a Type1 IOMMU as one matching the below code.
16 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
17 * VT-d, but that makes it harder to re-use as theoretically anyone
18 * implementing a similar IOMMU could make use of this. We expect the
19 * IOMMU to support the IOMMU API and have few to no restrictions around
20 * the IOVA range that can be mapped. The Type1 IOMMU is currently
21 * optimized for relatively static mappings of a userspace process with
22 * userspace pages pinned into memory. We also assume devices and IOMMU
23 * domains are PCI based as the IOMMU API is still centered around a
24 * device/bus interface rather than a group interface.
25 */
26
27#include <linux/compat.h>
28#include <linux/device.h>
29#include <linux/fs.h>
30#include <linux/iommu.h>
31#include <linux/module.h>
32#include <linux/mm.h>
33#include <linux/pci.h> /* pci_bus_type */
34#include <linux/sched.h>
35#include <linux/slab.h>
36#include <linux/uaccess.h>
37#include <linux/vfio.h>
38#include <linux/workqueue.h>
39
40#define DRIVER_VERSION "0.2"
41#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
42#define DRIVER_DESC "Type1 IOMMU driver for VFIO"
43
44static bool allow_unsafe_interrupts;
45module_param_named(allow_unsafe_interrupts,
46 allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
47MODULE_PARM_DESC(allow_unsafe_interrupts,
48 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
49
50struct vfio_iommu {
51 struct iommu_domain *domain;
52 struct mutex lock;
53 struct list_head dma_list;
54 struct list_head group_list;
55 bool cache;
56};
57
58struct vfio_dma {
59 struct list_head next;
60 dma_addr_t iova; /* Device address */
61 unsigned long vaddr; /* Process virtual addr */
62 long npage; /* Number of pages */
63 int prot; /* IOMMU_READ/WRITE */
64};
65
66struct vfio_group {
67 struct iommu_group *iommu_group;
68 struct list_head next;
69};
70
71/*
72 * This code handles mapping and unmapping of user data buffers
73 * into DMA'ble space using the IOMMU
74 */
75
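
The user-visible entry point for this path is the VFIO_IOMMU_MAP_DMA ioctl handled at the bottom of this file. A minimal userspace sketch, assuming the container already has a group attached and VFIO_SET_IOMMU has selected the type1 backend; the buffer size and IOVA below are examples only:

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static int map_buffer(int container_fd)
{
	struct vfio_iommu_type1_dma_map map;
	size_t len = 1024 * 1024;
	void *buf;

	/* vaddr, iova and size must all be page aligned (see the mask checks) */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (unsigned long)buf;	/* process virtual address */
	map.iova  = 0x100000;		/* device (IOVA) address, example */
	map.size  = len;

	return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}
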
76#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
77
78struct vwork {
79 struct mm_struct *mm;
80 long npage;
81 struct work_struct work;
82};
83
84/* delayed decrement/increment for locked_vm */
85static void vfio_lock_acct_bg(struct work_struct *work)
86{
87 struct vwork *vwork = container_of(work, struct vwork, work);
88 struct mm_struct *mm;
89
90 mm = vwork->mm;
91 down_write(&mm->mmap_sem);
92 mm->locked_vm += vwork->npage;
93 up_write(&mm->mmap_sem);
94 mmput(mm);
95 kfree(vwork);
96}
97
98static void vfio_lock_acct(long npage)
99{
100 struct vwork *vwork;
101 struct mm_struct *mm;
102
103 if (!current->mm)
104 return; /* process exited */
105
106 if (down_write_trylock(&current->mm->mmap_sem)) {
107 current->mm->locked_vm += npage;
108 up_write(&current->mm->mmap_sem);
109 return;
110 }
111
112 /*
113 * Couldn't get mmap_sem lock, so must set up to update
114 * mm->locked_vm later. If locked_vm were atomic, we
115 * wouldn't need this silliness
116 */
117 vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
118 if (!vwork)
119 return;
120 mm = get_task_mm(current);
121 if (!mm) {
122 kfree(vwork);
123 return;
124 }
125 INIT_WORK(&vwork->work, vfio_lock_acct_bg);
126 vwork->mm = mm;
127 vwork->npage = npage;
128 schedule_work(&vwork->work);
129}
130
131/*
132 * Some mappings aren't backed by a struct page, for example an mmap'd
133 * MMIO range for our own or another device. These use a different
134 * pfn conversion and shouldn't be tracked as locked pages.
135 */
136static bool is_invalid_reserved_pfn(unsigned long pfn)
137{
138 if (pfn_valid(pfn)) {
139 bool reserved;
140 struct page *tail = pfn_to_page(pfn);
141 struct page *head = compound_trans_head(tail);
142 reserved = !!(PageReserved(head));
143 if (head != tail) {
144 /*
145 * "head" is not a dangling pointer
146 * (compound_trans_head takes care of that)
147 * but the hugepage may have been split
148 * from under us (and we may not hold a
149 * reference count on the head page so it can
150 * be reused before we run PageReferenced), so
151 * we have to check PageTail before returning
152 * what we just read.
153 */
154 smp_rmb();
155 if (PageTail(tail))
156 return reserved;
157 }
158 return PageReserved(tail);
159 }
160
161 return true;
162}
163
164static int put_pfn(unsigned long pfn, int prot)
165{
166 if (!is_invalid_reserved_pfn(pfn)) {
167 struct page *page = pfn_to_page(pfn);
168 if (prot & IOMMU_WRITE)
169 SetPageDirty(page);
170 put_page(page);
171 return 1;
172 }
173 return 0;
174}
175
176/* Unmap DMA region */
177static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
178 long npage, int prot)
179{
180 long i, unlocked = 0;
181
182 for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
183 unsigned long pfn;
184
185 pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
186 if (pfn) {
187 iommu_unmap(iommu->domain, iova, PAGE_SIZE);
188 unlocked += put_pfn(pfn, prot);
189 }
190 }
191 return unlocked;
192}
193
194static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
195 long npage, int prot)
196{
197 long unlocked;
198
199 unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
200 vfio_lock_acct(-unlocked);
201}
202
203static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
204{
205 struct page *page[1];
206 struct vm_area_struct *vma;
207 int ret = -EFAULT;
208
209 if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
210 *pfn = page_to_pfn(page[0]);
211 return 0;
212 }
213
214 down_read(&current->mm->mmap_sem);
215
216 vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
217
218 if (vma && vma->vm_flags & VM_PFNMAP) {
219 *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
220 if (is_invalid_reserved_pfn(*pfn))
221 ret = 0;
222 }
223
224 up_read(&current->mm->mmap_sem);
225
226 return ret;
227}
228
229/* Map DMA region */
230static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
231 unsigned long vaddr, long npage, int prot)
232{
233 dma_addr_t start = iova;
234 long i, locked = 0;
235 int ret;
236
237 /* Verify that pages are not already mapped */
238 for (i = 0; i < npage; i++, iova += PAGE_SIZE)
239 if (iommu_iova_to_phys(iommu->domain, iova))
240 return -EBUSY;
241
242 iova = start;
243
244 if (iommu->cache)
245 prot |= IOMMU_CACHE;
246
247 /*
248 * XXX We break mappings into pages and use get_user_pages_fast to
249 * pin the pages in memory. It's been suggested that mlock might
250 * provide a more efficient mechanism, but nothing prevents the
251 * user from munlocking the pages, which could then allow the user
252 * access to random host memory. We also have no guarantee from the
253 * IOMMU API that the iommu driver can unmap sub-pages of previous
254 * mappings. This means we might lose an entire range if a single
255 * page within it is unmapped. Single page mappings are inefficient,
256 * but provide the most flexibility for now.
257 */
258 for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
259 unsigned long pfn = 0;
260
261 ret = vaddr_get_pfn(vaddr, prot, &pfn);
262 if (ret) {
263 __vfio_dma_do_unmap(iommu, start, i, prot);
264 return ret;
265 }
266
267 /*
268 * Only add actual locked pages to accounting
269 * XXX We're effectively marking a page locked for every
270 * IOVA page even though it's possible the user could be
271 * backing multiple IOVAs with the same vaddr. This over-
272 * penalizes the user process, but we currently have no
273 * easy way to do this properly.
274 */
275 if (!is_invalid_reserved_pfn(pfn))
276 locked++;
277
278 ret = iommu_map(iommu->domain, iova,
279 (phys_addr_t)pfn << PAGE_SHIFT,
280 PAGE_SIZE, prot);
281 if (ret) {
282 /* Back out mappings on error */
283 put_pfn(pfn, prot);
284 __vfio_dma_do_unmap(iommu, start, i, prot);
285 return ret;
286 }
287 }
288 vfio_lock_acct(locked);
289 return 0;
290}
291
292static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
293 dma_addr_t start2, size_t size2)
294{
295 if (start1 < start2)
296 return (start2 - start1 < size1);
297 else if (start2 < start1)
298 return (start1 - start2 < size2);
299 return (size1 > 0 && size2 > 0);
300}
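
A quick worked check with illustrative numbers: a range at start1 = 0x1000 with size1 = 0x2000 covers [0x1000, 0x3000). For start2 = 0x2000, size2 = 0x1000 the first branch applies and start2 - start1 = 0x1000 is less than size1, so the helper reports an overlap; for start2 = 0x3000 the difference equals size1, the ranges merely abut, and it returns false.
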
301
302static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
303 dma_addr_t start, size_t size)
304{
305 struct vfio_dma *dma;
306
307 list_for_each_entry(dma, &iommu->dma_list, next) {
308 if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
309 start, size))
310 return dma;
311 }
312 return NULL;
313}
314
315static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
316 size_t size, struct vfio_dma *dma)
317{
318 struct vfio_dma *split;
319 long npage_lo, npage_hi;
320
321 /* Existing dma region is completely covered, unmap all */
322 if (start <= dma->iova &&
323 start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
324 vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
325 list_del(&dma->next);
326 npage_lo = dma->npage;
327 kfree(dma);
328 return npage_lo;
329 }
330
331 /* Overlap low address of existing range */
332 if (start <= dma->iova) {
333 size_t overlap;
334
335 overlap = start + size - dma->iova;
336 npage_lo = overlap >> PAGE_SHIFT;
337
338 vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
339 dma->iova += overlap;
340 dma->vaddr += overlap;
341 dma->npage -= npage_lo;
342 return npage_lo;
343 }
344
345 /* Overlap high address of existing range */
346 if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
347 size_t overlap;
348
349 overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
350 npage_hi = overlap >> PAGE_SHIFT;
351
352 vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
353 dma->npage -= npage_hi;
354 return npage_hi;
355 }
356
357 /* Split existing */
358 npage_lo = (start - dma->iova) >> PAGE_SHIFT;
359 npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
360
361 split = kzalloc(sizeof *split, GFP_KERNEL);
362 if (!split)
363 return -ENOMEM;
364
365 vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
366
367 dma->npage = npage_lo;
368
369 split->npage = npage_hi;
370 split->iova = start + size;
371 split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
372 split->prot = dma->prot;
373 list_add(&split->next, &iommu->dma_list);
374 return size >> PAGE_SHIFT;
375}
376
377static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
378 struct vfio_iommu_type1_dma_unmap *unmap)
379{
380 long ret = 0, npage = unmap->size >> PAGE_SHIFT;
381 struct vfio_dma *dma, *tmp;
382 uint64_t mask;
383
384 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
385
386 if (unmap->iova & mask)
387 return -EINVAL;
388 if (unmap->size & mask)
389 return -EINVAL;
390
391 /* XXX We still break these down into PAGE_SIZE */
392 WARN_ON(mask & PAGE_MASK);
393
394 mutex_lock(&iommu->lock);
395
396 list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) {
397 if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
398 unmap->iova, unmap->size)) {
399 ret = vfio_remove_dma_overlap(iommu, unmap->iova,
400 unmap->size, dma);
401 if (ret > 0)
402 npage -= ret;
403 if (ret < 0 || npage == 0)
404 break;
405 }
406 }
407 mutex_unlock(&iommu->lock);
408 return ret > 0 ? 0 : (int)ret;
409}
410
411static int vfio_dma_do_map(struct vfio_iommu *iommu,
412 struct vfio_iommu_type1_dma_map *map)
413{
414 struct vfio_dma *dma, *pdma = NULL;
415 dma_addr_t iova = map->iova;
416 unsigned long locked, lock_limit, vaddr = map->vaddr;
417 size_t size = map->size;
418 int ret = 0, prot = 0;
419 uint64_t mask;
420 long npage;
421
422 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
423
424 /* READ/WRITE from device perspective */
425 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
426 prot |= IOMMU_WRITE;
427 if (map->flags & VFIO_DMA_MAP_FLAG_READ)
428 prot |= IOMMU_READ;
429
430 if (!prot)
431 return -EINVAL; /* No READ/WRITE? */
432
433 if (vaddr & mask)
434 return -EINVAL;
435 if (iova & mask)
436 return -EINVAL;
437 if (size & mask)
438 return -EINVAL;
439
440 /* XXX We still break these down into PAGE_SIZE */
441 WARN_ON(mask & PAGE_MASK);
442
443 /* Don't allow IOVA wrap */
444 if (iova + size && iova + size < iova)
445 return -EINVAL;
446
447 /* Don't allow virtual address wrap */
448 if (vaddr + size && vaddr + size < vaddr)
449 return -EINVAL;
450
451 npage = size >> PAGE_SHIFT;
452 if (!npage)
453 return -EINVAL;
454
455 mutex_lock(&iommu->lock);
456
457 if (vfio_find_dma(iommu, iova, size)) {
458 ret = -EBUSY;
459 goto out_lock;
460 }
461
462 /* account for locked pages */
463 locked = current->mm->locked_vm + npage;
464 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
465 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
466 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
467 __func__, rlimit(RLIMIT_MEMLOCK));
468 ret = -ENOMEM;
469 goto out_lock;
470 }
471
472 ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
473 if (ret)
474 goto out_lock;
475
476 /* Check if we abut a region below - nothing below 0 */
477 if (iova) {
478 dma = vfio_find_dma(iommu, iova - 1, 1);
479 if (dma && dma->prot == prot &&
480 dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) {
481
482 dma->npage += npage;
483 iova = dma->iova;
484 vaddr = dma->vaddr;
485 npage = dma->npage;
486 size = NPAGE_TO_SIZE(npage);
487
488 pdma = dma;
489 }
490 }
491
492 /* Check if we abut a region above - nothing above ~0 + 1 */
493 if (iova + size) {
494 dma = vfio_find_dma(iommu, iova + size, 1);
495 if (dma && dma->prot == prot &&
496 dma->vaddr == vaddr + size) {
497
498 dma->npage += npage;
499 dma->iova = iova;
500 dma->vaddr = vaddr;
501
502 /*
503 * If merged above and below, remove previously
504 * merged entry. New entry covers it.
505 */
506 if (pdma) {
507 list_del(&pdma->next);
508 kfree(pdma);
509 }
510 pdma = dma;
511 }
512 }
513
514 /* Isolated, new region */
515 if (!pdma) {
516 dma = kzalloc(sizeof *dma, GFP_KERNEL);
517 if (!dma) {
518 ret = -ENOMEM;
519 vfio_dma_unmap(iommu, iova, npage, prot);
520 goto out_lock;
521 }
522
523 dma->npage = npage;
524 dma->iova = iova;
525 dma->vaddr = vaddr;
526 dma->prot = prot;
527 list_add(&dma->next, &iommu->dma_list);
528 }
529
530out_lock:
531 mutex_unlock(&iommu->lock);
532 return ret;
533}
534
535static int vfio_iommu_type1_attach_group(void *iommu_data,
536 struct iommu_group *iommu_group)
537{
538 struct vfio_iommu *iommu = iommu_data;
539 struct vfio_group *group, *tmp;
540 int ret;
541
542 group = kzalloc(sizeof(*group), GFP_KERNEL);
543 if (!group)
544 return -ENOMEM;
545
546 mutex_lock(&iommu->lock);
547
548 list_for_each_entry(tmp, &iommu->group_list, next) {
549 if (tmp->iommu_group == iommu_group) {
550 mutex_unlock(&iommu->lock);
551 kfree(group);
552 return -EINVAL;
553 }
554 }
555
556 /*
557 * TODO: Domains have capabilities that might change as we add
558 * groups (see iommu->cache, currently never set). Check for
559 * them and potentially disallow groups to be attached when it
560 * would change capabilities (ugh).
561 */
562 ret = iommu_attach_group(iommu->domain, iommu_group);
563 if (ret) {
564 mutex_unlock(&iommu->lock);
565 kfree(group);
566 return ret;
567 }
568
569 group->iommu_group = iommu_group;
570 list_add(&group->next, &iommu->group_list);
571
572 mutex_unlock(&iommu->lock);
573
574 return 0;
575}
576
577static void vfio_iommu_type1_detach_group(void *iommu_data,
578 struct iommu_group *iommu_group)
579{
580 struct vfio_iommu *iommu = iommu_data;
581 struct vfio_group *group;
582
583 mutex_lock(&iommu->lock);
584
585 list_for_each_entry(group, &iommu->group_list, next) {
586 if (group->iommu_group == iommu_group) {
587 iommu_detach_group(iommu->domain, iommu_group);
588 list_del(&group->next);
589 kfree(group);
590 break;
591 }
592 }
593
594 mutex_unlock(&iommu->lock);
595}
596
597static void *vfio_iommu_type1_open(unsigned long arg)
598{
599 struct vfio_iommu *iommu;
600
601 if (arg != VFIO_TYPE1_IOMMU)
602 return ERR_PTR(-EINVAL);
603
604 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
605 if (!iommu)
606 return ERR_PTR(-ENOMEM);
607
608 INIT_LIST_HEAD(&iommu->group_list);
609 INIT_LIST_HEAD(&iommu->dma_list);
610 mutex_init(&iommu->lock);
611
612 /*
613 * Wish we didn't have to know about bus_type here.
614 */
615 iommu->domain = iommu_domain_alloc(&pci_bus_type);
616 if (!iommu->domain) {
617 kfree(iommu);
618 return ERR_PTR(-EIO);
619 }
620
621 /*
622 * Wish we could specify required capabilities rather than create
623 * a domain, see what comes out and hope it doesn't change along
624 * the way. Fortunately we know interrupt remapping is global for
625 * our iommus.
626 */
627 if (!allow_unsafe_interrupts &&
628 !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
629 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
630 __func__);
631 iommu_domain_free(iommu->domain);
632 kfree(iommu);
633 return ERR_PTR(-EPERM);
634 }
635
636 return iommu;
637}
638
639static void vfio_iommu_type1_release(void *iommu_data)
640{
641 struct vfio_iommu *iommu = iommu_data;
642 struct vfio_group *group, *group_tmp;
643 struct vfio_dma *dma, *dma_tmp;
644
645 list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
646 iommu_detach_group(iommu->domain, group->iommu_group);
647 list_del(&group->next);
648 kfree(group);
649 }
650
651 list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) {
652 vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
653 list_del(&dma->next);
654 kfree(dma);
655 }
656
657 iommu_domain_free(iommu->domain);
658 iommu->domain = NULL;
659 kfree(iommu);
660}
661
662static long vfio_iommu_type1_ioctl(void *iommu_data,
663 unsigned int cmd, unsigned long arg)
664{
665 struct vfio_iommu *iommu = iommu_data;
666 unsigned long minsz;
667
668 if (cmd == VFIO_CHECK_EXTENSION) {
669 switch (arg) {
670 case VFIO_TYPE1_IOMMU:
671 return 1;
672 default:
673 return 0;
674 }
675 } else if (cmd == VFIO_IOMMU_GET_INFO) {
676 struct vfio_iommu_type1_info info;
677
678 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
679
680 if (copy_from_user(&info, (void __user *)arg, minsz))
681 return -EFAULT;
682
683 if (info.argsz < minsz)
684 return -EINVAL;
685
686 info.flags = 0;
687
688 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
689
690 return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
691
692 } else if (cmd == VFIO_IOMMU_MAP_DMA) {
693 struct vfio_iommu_type1_dma_map map;
694 uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
695 VFIO_DMA_MAP_FLAG_WRITE;
696
697 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
698
699 if (copy_from_user(&map, (void __user *)arg, minsz))
700 return -EFAULT;
701
702 if (map.argsz < minsz || map.flags & ~mask)
703 return -EINVAL;
704
705 return vfio_dma_do_map(iommu, &map);
706
707 } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
708 struct vfio_iommu_type1_dma_unmap unmap;
709
710 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
711
712 if (copy_from_user(&unmap, (void __user *)arg, minsz))
713 return -EFAULT;
714
715 if (unmap.argsz < minsz || unmap.flags)
716 return -EINVAL;
717
718 return vfio_dma_do_unmap(iommu, &unmap);
719 }
720
721 return -ENOTTY;
722}
723
724static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
725 .name = "vfio-iommu-type1",
726 .owner = THIS_MODULE,
727 .open = vfio_iommu_type1_open,
728 .release = vfio_iommu_type1_release,
729 .ioctl = vfio_iommu_type1_ioctl,
730 .attach_group = vfio_iommu_type1_attach_group,
731 .detach_group = vfio_iommu_type1_detach_group,
732};
733
734static int __init vfio_iommu_type1_init(void)
735{
736 if (!iommu_present(&pci_bus_type))
737 return -ENODEV;
738
739 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
740}
741
742static void __exit vfio_iommu_type1_cleanup(void)
743{
744 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
745}
746
747module_init(vfio_iommu_type1_init);
748module_exit(vfio_iommu_type1_cleanup);
749
750MODULE_VERSION(DRIVER_VERSION);
751MODULE_LICENSE("GPL v2");
752MODULE_AUTHOR(DRIVER_AUTHOR);
753MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
new file mode 100644
index 000000000000..0a4f180a11d8
--- /dev/null
+++ b/include/linux/vfio.h
@@ -0,0 +1,445 @@
1/*
2 * VFIO API definition
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef VFIO_H
12#define VFIO_H
13
14#include <linux/types.h>
15#include <linux/ioctl.h>
16
17#define VFIO_API_VERSION 0
18
19#ifdef __KERNEL__ /* Internal VFIO-core/bus driver API */
20
21#include <linux/iommu.h>
22#include <linux/mm.h>
23
24/**
25 * struct vfio_device_ops - VFIO bus driver device callbacks
26 *
27 * @open: Called when userspace creates new file descriptor for device
28 * @release: Called when userspace releases file descriptor for device
29 * @read: Perform read(2) on device file descriptor
30 * @write: Perform write(2) on device file descriptor
31 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
32 * operations documented below
33 * @mmap: Perform mmap(2) on a region of the device file descriptor
34 */
35struct vfio_device_ops {
36 char *name;
37 int (*open)(void *device_data);
38 void (*release)(void *device_data);
39 ssize_t (*read)(void *device_data, char __user *buf,
40 size_t count, loff_t *ppos);
41 ssize_t (*write)(void *device_data, const char __user *buf,
42 size_t count, loff_t *size);
43 long (*ioctl)(void *device_data, unsigned int cmd,
44 unsigned long arg);
45 int (*mmap)(void *device_data, struct vm_area_struct *vma);
46};
47
48extern int vfio_add_group_dev(struct device *dev,
49 const struct vfio_device_ops *ops,
50 void *device_data);
51
52extern void *vfio_del_group_dev(struct device *dev);
53
54/**
55 * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
56 */
57struct vfio_iommu_driver_ops {
58 char *name;
59 struct module *owner;
60 void *(*open)(unsigned long arg);
61 void (*release)(void *iommu_data);
62 ssize_t (*read)(void *iommu_data, char __user *buf,
63 size_t count, loff_t *ppos);
64 ssize_t (*write)(void *iommu_data, const char __user *buf,
65 size_t count, loff_t *size);
66 long (*ioctl)(void *iommu_data, unsigned int cmd,
67 unsigned long arg);
68 int (*mmap)(void *iommu_data, struct vm_area_struct *vma);
69 int (*attach_group)(void *iommu_data,
70 struct iommu_group *group);
71 void (*detach_group)(void *iommu_data,
72 struct iommu_group *group);
73
74};
75
76extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
77
78extern void vfio_unregister_iommu_driver(
79 const struct vfio_iommu_driver_ops *ops);
80
81/**
82 * offsetofend(TYPE, MEMBER)
83 *
84 * @TYPE: The type of the structure
85 * @MEMBER: The member within the structure to get the end offset of
86 *
87 * Simple helper macro for dealing with variable sized structures passed
88 * from user space. This allows us to easily determine if the provided
89 * structure is sized to include various fields.
90 */
91#define offsetofend(TYPE, MEMBER) ({ \
92 TYPE tmp; \
93 offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); })
94
95#endif /* __KERNEL__ */
96
97/* Kernel & User level defines for VFIO IOCTLs. */
98
99/* Extensions */
100
101#define VFIO_TYPE1_IOMMU 1
102
103/*
104 * The IOCTL interface is designed for extensibility by embedding the
105 * structure length (argsz) and flags into structures passed between
106 * kernel and userspace. We therefore use the _IO() macro for these
107 * defines to avoid implicitly embedding a size into the ioctl request.
108 * As structure fields are added, argsz will increase to match and flag
109 * bits will be defined to indicate additional fields with valid data.
110 * It's *always* the caller's responsibility to indicate the size of
111 * the structure passed by setting argsz appropriately.
112 */
113
114#define VFIO_TYPE (';')
115#define VFIO_BASE 100
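
/*
 * A minimal sketch of the argsz convention from userspace (illustrative
 * only; error handling is omitted and the 'device' file descriptor is
 * assumed to come from VFIO_GROUP_GET_DEVICE_FD below):
 *
 *    struct vfio_device_info info = { .argsz = sizeof(info) };
 *
 *    ioctl(device, VFIO_DEVICE_GET_INFO, &info);
 *
 * Flag bits returned in info.flags indicate which fields contain valid
 * data, allowing the structure to grow in future API revisions while
 * the caller always reports the size it allocated via argsz.
 */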
116
117/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
118
119/**
120 * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
121 *
122 * Report the version of the VFIO API. This allows us to bump the entire
123 * API version should we later need to add or change features in incompatible
124 * ways.
125 * Return: VFIO_API_VERSION
126 * Availability: Always
127 */
128#define VFIO_GET_API_VERSION _IO(VFIO_TYPE, VFIO_BASE + 0)
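
/*
 * Usage sketch (illustrative only): the container is simply an open
 * file descriptor on the character device named above.
 *
 *    int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *    if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *        return -1;
 */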
129
130/**
131 * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
132 *
133 * Check whether an extension is supported.
134 * Return: 0 if not supported, 1 (or some other positive integer) if supported.
135 * Availability: Always
136 */
137#define VFIO_CHECK_EXTENSION _IO(VFIO_TYPE, VFIO_BASE + 1)
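
/*
 * Usage sketch (illustrative only), probing for the Type1 IOMMU backend
 * on the container opened above:
 *
 *    if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *        return -1;
 */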
138
139/**
140 * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
141 *
142 * Set the iommu to the given type. The type must be supported by an
143 * iommu driver as verified by calling CHECK_EXTENSION using the same
144 * type. A group must be attached to this container before this
145 * ioctl is available. The IOMMU interfaces enabled by this call are
146 * specific to the value set.
147 * Return: 0 on success, -errno on failure
148 * Availability: When VFIO group attached
149 */
150#define VFIO_SET_IOMMU _IO(VFIO_TYPE, VFIO_BASE + 2)
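
/*
 * Usage sketch (illustrative only). Note the ordering: at least one
 * group must have been added to the container (VFIO_GROUP_SET_CONTAINER
 * below) before this call succeeds.
 *
 *    if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
 *        return -1;
 */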
151
152/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
153
154/**
155 * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
156 * struct vfio_group_status)
157 *
158 * Retrieve information about the group. Fills in provided
159 * struct vfio_group_status. Caller sets argsz.
160 * Return: 0 on success, -errno on failure.
161 * Availability: Always
162 */
163struct vfio_group_status {
164 __u32 argsz;
165 __u32 flags;
166#define VFIO_GROUP_FLAGS_VIABLE (1 << 0)
167#define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1)
168};
169#define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3)
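
/*
 * Usage sketch (illustrative only; the group number is hypothetical and
 * would normally be discovered through the device's iommu_group sysfs
 * link):
 *
 *    struct vfio_group_status status = { .argsz = sizeof(status) };
 *    int group = open("/dev/vfio/26", O_RDWR);
 *
 *    ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *        return -1;
 */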
170
171/**
172 * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
173 *
174 * Set the container for the VFIO group to the open VFIO file
175 * descriptor provided. Groups may only belong to a single
176 * container. Containers may, at their discretion, support multiple
177 * groups. Only when a container is set are all of the interfaces
178 * of the VFIO file descriptor and the VFIO group file descriptor
179 * available to the user.
180 * Return: 0 on success, -errno on failure.
181 * Availability: Always
182 */
183#define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4)
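
/*
 * Usage sketch (illustrative only): the argument is a pointer to the
 * open container file descriptor obtained earlier.
 *
 *    if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container))
 *        return -1;
 */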
184
185/**
186 * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
187 *
188 * Remove the group from the attached container. This is the
189 * opposite of the SET_CONTAINER call and returns the group to
190 * an initial state. All device file descriptors must be released
191 * prior to calling this interface. When removing the last group
192 * from a container, the IOMMU will be disabled and all state lost,
193 * effectively also returning the VFIO file descriptor to an initial
194 * state.
195 * Return: 0 on success, -errno on failure.
196 * Availability: When attached to container
197 */
198#define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5)
199
200/**
201 * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
202 *
203 * Return a new file descriptor for the device object described by
204 * the provided string. The string should match a device listed in
205 * the devices subdirectory of the IOMMU group sysfs entry. The
206 * group containing the device must already be added to this context.
207 * Return: new file descriptor on success, -errno on failure.
208 * Availability: When attached to container
209 */
210#define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6)
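
/*
 * Usage sketch (illustrative only; the PCI address is a hypothetical
 * example of a device listed in the group's sysfs devices directory):
 *
 *    int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 *
 *    if (device < 0)
 *        return -1;
 */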
211
212/* --------------- IOCTLs for DEVICE file descriptors --------------- */
213
214/**
215 * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
216 * struct vfio_device_info)
217 *
218 * Retrieve information about the device. Fills in provided
219 * struct vfio_device_info. Caller sets argsz.
220 * Return: 0 on success, -errno on failure.
221 */
222struct vfio_device_info {
223 __u32 argsz;
224 __u32 flags;
225#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */
226#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
227 __u32 num_regions; /* Max region index + 1 */
228 __u32 num_irqs; /* Max IRQ index + 1 */
229};
230#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
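
/*
 * Usage sketch (illustrative only), using the device fd returned by
 * VFIO_GROUP_GET_DEVICE_FD:
 *
 *    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 *
 *    ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
 *
 * num_regions and num_irqs bound the index loops for the region and IRQ
 * info ioctls below.
 */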
231
232/**
233 * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
234 * struct vfio_region_info)
235 *
236 * Retrieve information about a device region. Caller provides
237 * struct vfio_region_info with index value set. Caller sets argsz.
238 * Implementation of region mapping is bus driver specific. This is
239 * intended to describe MMIO, I/O port, as well as bus specific
240 * regions (ex. PCI config space). Zero sized regions may be used
241 * to describe unimplemented regions (ex. unimplemented PCI BARs).
242 * Return: 0 on success, -errno on failure.
243 */
244struct vfio_region_info {
245 __u32 argsz;
246 __u32 flags;
247#define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */
248#define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */
249#define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */
250 __u32 index; /* Region index */
251 __u32 resv; /* Reserved for alignment */
252 __u64 size; /* Region size (bytes) */
253 __u64 offset; /* Region offset from start of device fd */
254};
255#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8)
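
/*
 * Usage sketch (illustrative only; device_info comes from the previous
 * example and error handling is omitted):
 *
 *    struct vfio_region_info reg = { .argsz = sizeof(reg) };
 *    unsigned int i;
 *
 *    for (i = 0; i < device_info.num_regions; i++) {
 *        reg.index = i;
 *        if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg))
 *            continue;
 *        if (reg.flags & VFIO_REGION_INFO_FLAG_MMAP)
 *            mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
 *                 MAP_SHARED, device, reg.offset);
 *    }
 */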
256
257/**
258 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
259 * struct vfio_irq_info)
260 *
261 * Retrieve information about a device IRQ. Caller provides
262 * struct vfio_irq_info with index value set. Caller sets argsz.
263 * Implementation of IRQ mapping is bus driver specific. Indexes
264 * using multiple IRQs are primarily intended to support MSI-like
265 * interrupt blocks. Zero count IRQ blocks may be used to describe
266 * unimplemented interrupt types.
267 *
268 * The EVENTFD flag indicates the interrupt index supports eventfd based
269 * signaling.
270 *
271 * The MASKABLE flag indicates the index supports the MASK and UNMASK
272 * actions described below.
273 *
274 * AUTOMASKED indicates that after signaling, the interrupt line is
275 * automatically masked by VFIO and the user needs to unmask the line
276 * to receive new interrupts. This is primarily intended to distinguish
277 * level triggered interrupts.
278 *
279 * The NORESIZE flag indicates that the interrupt lines within the index
280 * are configured as a set and new subindexes cannot be enabled without first
281 * disabling the entire index. This is used for interrupts like PCI MSI
282 * and MSI-X where the driver may only use a subset of the available
283 * indexes, but VFIO needs to enable a specific number of vectors
284 * upfront. In the case of MSI-X, where the user can enable MSI-X and
285 * then add and unmask vectors, it's up to userspace to make the decision
286 * whether to allocate the maximum supported number of vectors or tear
287 * down the setup and incrementally increase the vectors as each is enabled.
288 */
289struct vfio_irq_info {
290 __u32 argsz;
291 __u32 flags;
292#define VFIO_IRQ_INFO_EVENTFD (1 << 0)
293#define VFIO_IRQ_INFO_MASKABLE (1 << 1)
294#define VFIO_IRQ_INFO_AUTOMASKED (1 << 2)
295#define VFIO_IRQ_INFO_NORESIZE (1 << 3)
296 __u32 index; /* IRQ index */
297 __u32 count; /* Number of IRQs within this index */
298};
299#define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9)
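
/*
 * Usage sketch (illustrative only; device_info comes from the earlier
 * example):
 *
 *    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
 *    unsigned int i;
 *
 *    for (i = 0; i < device_info.num_irqs; i++) {
 *        irq.index = i;
 *        ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);
 *    }
 *
 * A count of zero means the interrupt type is unimplemented; the flags
 * report eventfd, masking, and automask behaviour for each index.
 */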
300
301/**
302 * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
303 *
304 * Set signaling, masking, and unmasking of interrupts. Caller provides
305 * struct vfio_irq_set with all fields set. 'start' and 'count' indicate
306 * the range of subindexes being specified.
307 *
308 * The DATA flags specify the type of data provided. If DATA_NONE, the
309 * operation performs the specified action immediately on the specified
310 * interrupt(s). For example, to unmask AUTOMASKED interrupt [0,0]:
311 * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
312 *
313 * DATA_BOOL applies the same actions sparsely across an array of interrupts.
314 * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
315 * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
316 * data = {1,0,1}
317 *
318 * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
319 * A value of -1 can be used to either de-assign interrupts if already
320 * assigned or skip un-assigned interrupts. For example, to set eventfds
321 * to be triggered for interrupts [0,0] and [0,2]:
322 * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
323 * data = {fd1, -1, fd2}
324 * If index [0,1] was previously set, two count = 1 ioctl calls would be
325 * required to set [0,0] and [0,2] without changing [0,1].
326 *
327 * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
328 * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
329 * from userspace (ie. simulate hardware triggering).
330 *
331 * Setting an event triggering mechanism to userspace for ACTION_TRIGGER
332 * enables the interrupt index for the device. Individual subindex interrupts
333 * can be disabled using the -1 value for DATA_EVENTFD or the index can be
334 * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
335 *
336 * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
337 * ACTION_TRIGGER specifies kernel->user signaling.
338 */
339struct vfio_irq_set {
340 __u32 argsz;
341 __u32 flags;
342#define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */
343#define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */
344#define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */
345#define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */
346#define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */
347#define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */
348 __u32 index;
349 __u32 start;
350 __u32 count;
351 __u8 data[];
352};
353#define VFIO_DEVICE_SET_IRQS _IO(VFIO_TYPE, VFIO_BASE + 10)
354
355#define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \
356 VFIO_IRQ_SET_DATA_BOOL | \
357 VFIO_IRQ_SET_DATA_EVENTFD)
358#define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \
359 VFIO_IRQ_SET_ACTION_UNMASK | \
360 VFIO_IRQ_SET_ACTION_TRIGGER)
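
/*
 * A hedged sketch of signaling subindex 0 of index 0 to an eventfd
 * (illustrative only; for vfio-pci index 0 happens to be INTx, see the
 * enum below, and error handling and headers are omitted):
 *
 *    struct vfio_irq_set *irq_set;
 *    size_t sz = sizeof(*irq_set) + sizeof(__s32);
 *    int fd = eventfd(0, 0);
 *
 *    irq_set = malloc(sz);
 *    irq_set->argsz = sz;
 *    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *    irq_set->index = 0;
 *    irq_set->start = 0;
 *    irq_set->count = 1;
 *    memcpy(&irq_set->data, &fd, sizeof(fd));
 *    ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
 *
 * Reads from 'fd' then report interrupts; passing -1 with the same
 * flags, or DATA_NONE/ACTION_TRIGGER with count = 0, tears it down.
 */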
361/**
362 * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
363 *
364 * Reset a device.
365 */
366#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
367
368/*
369 * The VFIO-PCI bus driver makes use of the following fixed region and
370 * IRQ index mapping. Unimplemented regions return a size of zero.
371 * Unimplemented IRQ types return a count of zero.
372 */
373
374enum {
375 VFIO_PCI_BAR0_REGION_INDEX,
376 VFIO_PCI_BAR1_REGION_INDEX,
377 VFIO_PCI_BAR2_REGION_INDEX,
378 VFIO_PCI_BAR3_REGION_INDEX,
379 VFIO_PCI_BAR4_REGION_INDEX,
380 VFIO_PCI_BAR5_REGION_INDEX,
381 VFIO_PCI_ROM_REGION_INDEX,
382 VFIO_PCI_CONFIG_REGION_INDEX,
383 VFIO_PCI_NUM_REGIONS
384};
385
386enum {
387 VFIO_PCI_INTX_IRQ_INDEX,
388 VFIO_PCI_MSI_IRQ_INDEX,
389 VFIO_PCI_MSIX_IRQ_INDEX,
390 VFIO_PCI_NUM_IRQS
391};
392
393/* -------- API for Type1 VFIO IOMMU -------- */
394
395/**
396 * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_type1_info)
397 *
398 * Retrieve information about the IOMMU object. Fills in provided
399 * struct vfio_iommu_type1_info. Caller sets argsz.
400 *
401 * XXX Should we do these by CHECK_EXTENSION too?
402 */
403struct vfio_iommu_type1_info {
404 __u32 argsz;
405 __u32 flags;
406#define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
407 __u64 iova_pgsizes; /* Bitmap of supported page sizes */
408};
409
410#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
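
/*
 * Usage sketch (illustrative only), issued on the container after
 * VFIO_SET_IOMMU:
 *
 *    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
 *
 *    ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
 *
 * If VFIO_IOMMU_INFO_PGSIZES is set in flags, iova_pgsizes is a bitmap
 * of the IOMMU page sizes available for mappings.
 */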
411
412/**
413 * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_iommu_type1_dma_map)
414 *
415 * Map process virtual addresses to IO virtual addresses using the provided
416 * struct vfio_iommu_type1_dma_map. Caller sets argsz. READ and/or WRITE required.
417 */
418struct vfio_iommu_type1_dma_map {
419 __u32 argsz;
420 __u32 flags;
421#define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */
422#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */
423 __u64 vaddr; /* Process virtual address */
424 __u64 iova; /* IO virtual address */
425 __u64 size; /* Size of mapping (bytes) */
426};
427
428#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
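
/*
 * Usage sketch (illustrative only; the 1MB size and iova of 0 are
 * arbitrary choices and error handling is omitted):
 *
 *    struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
 *
 *    dma_map.vaddr = (__u64)(unsigned long)mmap(NULL, 1024 * 1024,
 *                        PROT_READ | PROT_WRITE,
 *                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *    dma_map.size = 1024 * 1024;
 *    dma_map.iova = 0;
 *    dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *    ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
 */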
429
430/**
431 * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_iommu_type1_dma_unmap)
432 *
433 * Unmap IO virtual addresses using the provided
434 * struct vfio_iommu_type1_dma_unmap. Caller sets argsz.
435 */
436struct vfio_iommu_type1_dma_unmap {
437 __u32 argsz;
438 __u32 flags;
439 __u64 iova; /* IO virtual address */
440 __u64 size; /* Size of mapping (bytes) */
441};
442
443#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
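
/*
 * Usage sketch (illustrative only), undoing the mapping created in the
 * previous example:
 *
 *    struct vfio_iommu_type1_dma_unmap dma_unmap = { .argsz = sizeof(dma_unmap) };
 *
 *    dma_unmap.iova = dma_map.iova;
 *    dma_unmap.size = dma_map.size;
 *    ioctl(container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
 */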
444
445#endif /* VFIO_H */