 Documentation/ioctl/ioctl-number.txt |    1
 Documentation/vfio.txt               |  314
 MAINTAINERS                          |    8
 drivers/Kconfig                      |    2
 drivers/Makefile                     |    1
 drivers/vfio/Kconfig                 |   16
 drivers/vfio/Makefile                |    3
 drivers/vfio/pci/Kconfig             |    8
 drivers/vfio/pci/Makefile            |    4
 drivers/vfio/pci/vfio_pci.c          |  579
 drivers/vfio/pci/vfio_pci_config.c   | 1540
 drivers/vfio/pci/vfio_pci_intrs.c    |  740
 drivers/vfio/pci/vfio_pci_private.h  |   91
 drivers/vfio/pci/vfio_pci_rdwr.c     |  269
 drivers/vfio/vfio.c                  | 1420
 drivers/vfio/vfio_iommu_type1.c      |  753
 include/linux/vfio.h                 |  445
 17 files changed, 6194 insertions(+), 0 deletions(-)
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 915f28c470e9..849b771c5e03 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -88,6 +88,7 @@ Code  Seq#(hex)	Include File		Comments
 		and kernel/power/user.c
 '8'	all	SNP8023 advanced NIC card
 		<mailto:mcr@solidum.com>
+';'	64-7F	linux/vfio.h
 '@'	00-0F	linux/radeonfb.h	conflict!
 '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
 'A'	00-1F	linux/apm_bios.h	conflict!
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
new file mode 100644
index 000000000000..0cb6685c8029
--- /dev/null
+++ b/Documentation/vfio.txt
@@ -0,0 +1,314 @@
1 | VFIO - "Virtual Function I/O"[1] | ||
2 | ------------------------------------------------------------------------------- | ||
3 | Many modern systems now provide DMA and interrupt remapping facilities
4 | to help ensure I/O devices behave within the boundaries they've been | ||
5 | allotted. This includes x86 hardware with AMD-Vi and Intel VT-d, | ||
6 | POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC | ||
7 | systems using the Freescale PAMU. The VFIO driver is an IOMMU/device
8 | agnostic framework for exposing direct device access to userspace, in | ||
9 | a secure, IOMMU protected environment. In other words, this allows | ||
10 | safe[2], non-privileged, userspace drivers. | ||
11 | |||
12 | Why do we want that? Virtual machines often make use of direct device | ||
13 | access ("device assignment") when configured for the highest possible | ||
14 | I/O performance. From a device and host perspective, this simply | ||
15 | turns the VM into a userspace driver, with the benefits of | ||
16 | significantly reduced latency, higher bandwidth, and direct use of | ||
17 | bare-metal device drivers[3]. | ||
18 | |||
19 | Some applications, particularly in the high performance computing | ||
20 | field, also benefit from low-overhead, direct device access from | ||
21 | userspace. Examples include network adapters (often non-TCP/IP based) | ||
22 | and compute accelerators. Prior to VFIO, these drivers had to either | ||
23 | go through the full development cycle to become a proper upstream
24 | driver, be maintained out of tree, or make use of the UIO framework,
25 | which has no notion of IOMMU protection, limited interrupt support, | ||
26 | and requires root privileges to access things like PCI configuration | ||
27 | space. | ||
28 | |||
29 | The VFIO driver framework intends to unify these, replacing the
30 | KVM PCI-specific device assignment code and providing a more
31 | secure, more featureful userspace driver environment than UIO.
32 | |||
33 | Groups, Devices, and IOMMUs | ||
34 | ------------------------------------------------------------------------------- | ||
35 | |||
36 | Devices are the main target of any I/O driver. Devices typically | ||
37 | create a programming interface made up of I/O access, interrupts, | ||
38 | and DMA. Without going into the details of each of these, DMA is | ||
39 | by far the most critical aspect for maintaining a secure environment | ||
40 | as allowing a device read-write access to system memory imposes the | ||
41 | greatest risk to the overall system integrity. | ||
42 | |||
43 | To help mitigate this risk, many modern IOMMUs now incorporate | ||
44 | isolation properties into what was, in many cases, an interface only | ||
45 | meant for translation (i.e. solving the addressing problems of devices
46 | with limited address spaces). With this, devices can now be isolated | ||
47 | from each other and from arbitrary memory access, thus allowing | ||
48 | things like secure direct assignment of devices into virtual machines. | ||
49 | |||
50 | This isolation is not always at the granularity of a single device | ||
51 | though. Even when an IOMMU is capable of this, properties of devices, | ||
52 | interconnects, and IOMMU topologies can each reduce this isolation. | ||
53 | For instance, an individual device may be part of a larger multi- | ||
54 | function enclosure. While the IOMMU may be able to distinguish | ||
55 | between devices within the enclosure, the enclosure may not require | ||
56 | transactions between devices to reach the IOMMU. Examples of this | ||
57 | could be anything from a multi-function PCI device with backdoors | ||
58 | between functions to a non-PCI-ACS (Access Control Services) capable | ||
59 | bridge allowing redirection without reaching the IOMMU. Topology | ||
60 | can also play a factor in terms of hiding devices. A PCIe-to-PCI | ||
61 | bridge masks the devices behind it, making transactions appear as if
62 | they come from the bridge itself. Obviously, IOMMU design is a major
63 | factor as well.
64 | |||
65 | Therefore, while for the most part an IOMMU may have device level | ||
66 | granularity, any system is susceptible to reduced granularity. The | ||
67 | IOMMU API therefore supports a notion of IOMMU groups. A group is | ||
68 | a set of devices which is isolatable from all other devices in the | ||
69 | system. Groups are therefore the unit of ownership used by VFIO. | ||
70 | |||
71 | While the group is the minimum granularity that must be used to | ||
72 | ensure secure user access, it's not necessarily the preferred | ||
73 | granularity. In IOMMUs which make use of page tables, it may be | ||
74 | possible to share a set of page tables between different groups, | ||
75 | reducing the overhead both to the platform (reduced TLB thrashing, | ||
76 | reduced duplicate page tables), and to the user (programming only | ||
77 | a single set of translations). For this reason, VFIO makes use of | ||
78 | a container class, which may hold one or more groups. A container | ||
79 | is created by simply opening the /dev/vfio/vfio character device. | ||
80 | |||
81 | On its own, the container provides little functionality, with all | ||
82 | but a couple of version and extension query interfaces locked away.
83 | The user needs to add a group into the container for the next level | ||
84 | of functionality. To do this, the user first needs to identify the | ||
85 | group associated with the desired device. This can be done using | ||
86 | the sysfs links described in the example below. By unbinding the | ||
87 | device from the host driver and binding it to a VFIO driver, a new | ||
88 | VFIO group will appear for the group as /dev/vfio/$GROUP, where | ||
89 | $GROUP is the IOMMU group number of which the device is a member. | ||
90 | If the IOMMU group contains multiple devices, each will need to | ||
91 | be bound to a VFIO driver before operations on the VFIO group | ||
92 | are allowed (it's also sufficient to only unbind the device from | ||
93 | host drivers if a VFIO driver is unavailable; this will make the | ||
94 | group available, but not that particular device). TBD - interface | ||
95 | for disabling driver probing/locking a device. | ||
96 | |||
97 | Once the group is ready, it may be added to the container by opening | ||
98 | the VFIO group character device (/dev/vfio/$GROUP) and using the | ||
99 | VFIO_GROUP_SET_CONTAINER ioctl, passing the file descriptor of the | ||
100 | previously opened container file. If desired and if the IOMMU driver | ||
101 | supports sharing the IOMMU context between groups, multiple groups may | ||
102 | be set to the same container. If a group fails to be set to a
103 | container with existing groups, a new empty container will need to
104 | be used instead.
105 | |||
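As a minimal sketch (re-using the container and group descriptors from the
example further below, and assuming a purely illustrative second group 27
whose IOMMU context can be shared), two groups in one container would look
like:

	container = open("/dev/vfio/vfio", O_RDWR);

	group = open("/dev/vfio/26", O_RDWR);
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);

	group2 = open("/dev/vfio/27", O_RDWR);
	ioctl(group2, VFIO_GROUP_SET_CONTAINER, &container);
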
106 | With a group (or groups) attached to a container, the remaining | ||
107 | ioctls become available, enabling access to the VFIO IOMMU interfaces. | ||
108 | Additionally, it now becomes possible to get file descriptors for each | ||
109 | device within a group using an ioctl on the VFIO group file descriptor. | ||
110 | |||
111 | The VFIO device API includes ioctls for describing the device, the I/O | ||
112 | regions and their read/write/mmap offsets on the device descriptor, as | ||
113 | well as mechanisms for describing and registering interrupt | ||
114 | notifications. | ||
115 | |||
116 | VFIO Usage Example | ||
117 | ------------------------------------------------------------------------------- | ||
118 | |||
119 | Assume the user wants to access PCI device 0000:06:0d.0
120 | |||
121 | $ readlink /sys/bus/pci/devices/0000:06:0d.0/iommu_group | ||
122 | ../../../../kernel/iommu_groups/26 | ||
123 | |||
124 | This device is therefore in IOMMU group 26. It is on the PCI
125 | bus, therefore the user will make use of vfio-pci to manage the
126 | group:
127 | |||
128 | # modprobe vfio-pci | ||
129 | |||
130 | Binding this device to the vfio-pci driver creates the VFIO group | ||
131 | character devices for this group: | ||
132 | |||
133 | $ lspci -n -s 0000:06:0d.0 | ||
134 | 06:0d.0 0401: 1102:0002 (rev 08) | ||
135 | # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind | ||
136 | # echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
137 | |||
138 | Now we need to look at what other devices are in the group to free | ||
139 | it for use by VFIO: | ||
140 | |||
141 | $ ls -l /sys/bus/pci/devices/0000:06:0d.0/iommu_group/devices | ||
142 | total 0 | ||
143 | lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:00:1e.0 -> | ||
144 | ../../../../devices/pci0000:00/0000:00:1e.0 | ||
145 | lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.0 -> | ||
146 | ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.0 | ||
147 | lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.1 -> | ||
148 | ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.1 | ||
149 | |||
150 | This device is behind a PCIe-to-PCI bridge[4], therefore we also | ||
151 | need to add device 0000:06:0d.1 to the group following the same | ||
152 | procedure as above. Device 0000:00:1e.0 is a bridge that does | ||
153 | not currently have a host driver, therefore it's not required to | ||
154 | bind this device to the vfio-pci driver (vfio-pci does not currently | ||
155 | support PCI bridges). | ||
156 | |||
157 | The final step is to provide the user with access to the group if | ||
158 | unprivileged operation is desired (note that /dev/vfio/vfio provides | ||
159 | no capabilities on its own and is therefore expected to be set to | ||
160 | mode 0666 by the system). | ||
161 | |||
162 | # chown user:user /dev/vfio/26 | ||
163 | |||
164 | The user now has full access to all the devices and the IOMMU for this
165 | group and can access them as follows: | ||
166 | |||
167 | int container, group, device, i; | ||
168 | struct vfio_group_status group_status = | ||
169 | { .argsz = sizeof(group_status) }; | ||
170 | struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
171 | struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
172 | struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; | ||
173 | |||
174 | /* Create a new container */ | ||
175 | container = open("/dev/vfio/vfio", O_RDWR);
176 | |||
177 | if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) | ||
178 | /* Unknown API version */ | ||
179 | |||
180 | if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
181 | /* Doesn't support the IOMMU driver we want. */ | ||
182 | |||
183 | /* Open the group */ | ||
184 | group = open("/dev/vfio/26", O_RDWR); | ||
185 | |||
186 | /* Test the group is viable and available */ | ||
187 | ioctl(group, VFIO_GROUP_GET_STATUS, &group_status); | ||
188 | |||
189 | if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) | ||
190 | /* Group is not viable (ie, not all devices bound for vfio) */ | ||
191 | |||
192 | /* Add the group to the container */ | ||
193 | ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); | ||
194 | |||
195 | /* Enable the IOMMU model we want */ | ||
196 | ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
197 | |||
198 | /* Get additional IOMMU info */
199 | ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); | ||
200 | |||
201 | /* Allocate some space and setup a DMA mapping */ | ||
202 | dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, | ||
203 | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); | ||
204 | dma_map.size = 1024 * 1024; | ||
205 | dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ | ||
206 | dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; | ||
207 | |||
208 | ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); | ||
209 | |||
210 | /* Get a file descriptor for the device */ | ||
211 | device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); | ||
212 | |||
213 | /* Test and setup the device */ | ||
214 | ioctl(device, VFIO_DEVICE_GET_INFO, &device_info); | ||
215 | |||
216 | for (i = 0; i < device_info.num_regions; i++) { | ||
217 | struct vfio_region_info reg = { .argsz = sizeof(reg) }; | ||
218 | |||
219 | reg.index = i; | ||
220 | |||
221 | ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);
222 | |||
223 | /* Setup mappings... read/write offsets, mmaps | ||
224 | * For PCI devices, config space is a region */ | ||
225 | } | ||
226 | |||
227 | for (i = 0; i < device_info.num_irqs; i++) { | ||
228 | struct vfio_irq_info irq = { .argsz = sizeof(irq) }; | ||
229 | |||
230 | irq.index = i; | ||
231 | |||
232 | ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);
233 | |||
234 | /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */ | ||
235 | } | ||
236 | |||
237 | /* Gratuitous device reset and go... */ | ||
238 | ioctl(device, VFIO_DEVICE_RESET); | ||
239 | |||
240 | VFIO User API | ||
241 | ------------------------------------------------------------------------------- | ||
242 | |||
243 | Please see include/linux/vfio.h for complete API documentation. | ||
244 | |||
245 | VFIO bus driver API | ||
246 | ------------------------------------------------------------------------------- | ||
247 | |||
248 | VFIO bus drivers, such as vfio-pci, make use of only a few interfaces
249 | into VFIO core. When devices are bound to and unbound from the driver,
250 | the driver should call vfio_add_group_dev() and vfio_del_group_dev()
251 | respectively:
252 | |||
253 | extern int vfio_add_group_dev(struct device *dev,
254 | 			      const struct vfio_device_ops *ops,
255 | 			      void *device_data);
257 | |||
258 | extern void *vfio_del_group_dev(struct device *dev); | ||
259 | |||
260 | vfio_add_group_dev() indicates to the core to begin tracking the
261 | iommu_group of the specified dev and register the dev as owned by
262 | a VFIO bus driver. The driver provides an ops structure for callbacks
263 | similar to a file operations structure:
264 | |||
265 | struct vfio_device_ops { | ||
266 | int (*open)(void *device_data); | ||
267 | void (*release)(void *device_data); | ||
268 | ssize_t (*read)(void *device_data, char __user *buf, | ||
269 | size_t count, loff_t *ppos); | ||
270 | ssize_t (*write)(void *device_data, const char __user *buf, | ||
271 | size_t size, loff_t *ppos); | ||
272 | long (*ioctl)(void *device_data, unsigned int cmd, | ||
273 | unsigned long arg); | ||
274 | int (*mmap)(void *device_data, struct vm_area_struct *vma); | ||
275 | }; | ||
276 | |||
277 | Each function is passed the device_data that was originally registered | ||
278 | in the vfio_add_group_dev() call above. This allows the bus driver | ||
279 | an easy place to store its opaque, private data. The open/release | ||
280 | callbacks are issued when a new file descriptor is created for a | ||
281 | device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides | ||
282 | a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap | ||
283 | interfaces implement the device region access defined by the device's | ||
284 | own VFIO_DEVICE_GET_REGION_INFO ioctl. | ||
285 | |||
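A minimal sketch of how a bus driver might wire these together (error
handling trimmed; the vfio_foo_* names are purely illustrative, see
drivers/vfio/pci/vfio_pci.c for the real vfio-pci implementation):

	struct vfio_foo_device {
		struct device *dev;
		/* ... bus specific state ... */
	};

	static const struct vfio_device_ops vfio_foo_ops = {
		.open		= vfio_foo_open,
		.release	= vfio_foo_release,
		.ioctl		= vfio_foo_ioctl,
		.read		= vfio_foo_read,
		.write		= vfio_foo_write,
		.mmap		= vfio_foo_mmap,
	};

	static int vfio_foo_probe(struct device *dev)
	{
		struct vfio_foo_device *vdev;

		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
		if (!vdev)
			return -ENOMEM;

		vdev->dev = dev;

		/* vdev becomes the device_data passed to each callback */
		return vfio_add_group_dev(dev, &vfio_foo_ops, vdev);
	}

	static void vfio_foo_remove(struct device *dev)
	{
		struct vfio_foo_device *vdev = vfio_del_group_dev(dev);

		kfree(vdev);
	}
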
286 | ------------------------------------------------------------------------------- | ||
287 | |||
288 | [1] VFIO was originally an acronym for "Virtual Function I/O" in its | ||
289 | initial implementation by Tom Lyon while at Cisco. We've since
290 | outgrown the acronym, but it's catchy. | ||
291 | |||
292 | [2] "safe" also depends upon a device being "well behaved". It's | ||
293 | possible for multi-function devices to have backdoors between | ||
294 | functions and even for single function devices to have alternative | ||
295 | access to things like PCI config space through MMIO registers. To | ||
296 | guard against the former we can include additional precautions in the | ||
297 | IOMMU driver to group multi-function PCI devices together | ||
298 | (iommu=group_mf). The latter we can't prevent, but the IOMMU should | ||
299 | still provide isolation. For PCI, SR-IOV Virtual Functions are the | ||
300 | best indicator of "well behaved", as these are designed for | ||
301 | virtualization usage models. | ||
302 | |||
303 | [3] As always there are trade-offs to virtual machine device | ||
304 | assignment that are beyond the scope of VFIO. It's expected that | ||
305 | future IOMMU technologies will reduce some, but maybe not all, of | ||
306 | these trade-offs. | ||
307 | |||
308 | [4] In this case the device is below a PCI bridge, so transactions | ||
309 | from either function of the device are indistinguishable to the IOMMU:
310 | |||
311 | -[0000:00]-+-1e.0-[06]--+-0d.0 | ||
312 | \-0d.1 | ||
313 | |||
314 | 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90) | ||
diff --git a/MAINTAINERS b/MAINTAINERS
index 36ed8a14e8e2..6720018bc674 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7414,6 +7414,14 @@ S:	Maintained
 F:	Documentation/filesystems/vfat.txt
 F:	fs/fat/
 
+VFIO DRIVER
+M:	Alex Williamson <alex.williamson@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	Documentation/vfio.txt
+F:	drivers/vfio/
+F:	include/linux/vfio.h
+
 VIDEOBUF2 FRAMEWORK
 M:	Pawel Osciak <pawel@osciak.com>
 M:	Marek Szyprowski <m.szyprowski@samsung.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 805c432c9439..ece958d3762e 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
 
 source "drivers/uio/Kconfig"
 
+source "drivers/vfio/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index bd36f09f2246..5b421840c48d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_FUSION)		+= message/
 obj-y				+= firewire/
 obj-$(CONFIG_UIO)		+= uio/
+obj-$(CONFIG_VFIO)		+= vfio/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 000000000000..7cd5dec0abd1
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,16 @@
1 | config VFIO_IOMMU_TYPE1 | ||
2 | tristate | ||
3 | depends on VFIO | ||
4 | default n | ||
5 | |||
6 | menuconfig VFIO | ||
7 | tristate "VFIO Non-Privileged userspace driver framework" | ||
8 | depends on IOMMU_API | ||
9 | select VFIO_IOMMU_TYPE1 if X86 | ||
10 | help | ||
11 | VFIO provides a framework for secure userspace device drivers. | ||
12 | See Documentation/vfio.txt for more details. | ||
13 | |||
14 | If you don't know what to do here, say N. | ||
15 | |||
16 | source "drivers/vfio/pci/Kconfig" | ||
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 000000000000..2398d4a0e38b
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,3 @@
1 | obj-$(CONFIG_VFIO) += vfio.o | ||
2 | obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o | ||
3 | obj-$(CONFIG_VFIO_PCI) += pci/ | ||
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 000000000000..5980758563eb
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,8 @@
1 | config VFIO_PCI | ||
2 | tristate "VFIO support for PCI devices" | ||
3 | depends on VFIO && PCI && EVENTFD | ||
4 | help | ||
5 | Support for the PCI VFIO bus driver. This is required to make | ||
6 | use of PCI drivers using the VFIO framework. | ||
7 | |||
8 | If you don't know what to do here, say N. | ||
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 000000000000..131079255fd9
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,4 @@
1 | |||
2 | vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o | ||
3 | |||
4 | obj-$(CONFIG_VFIO_PCI) += vfio-pci.o | ||
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
3 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * Derived from original vfio: | ||
10 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
11 | * Author: Tom Lyon, pugs@cisco.com | ||
12 | */ | ||
13 | |||
14 | #include <linux/device.h> | ||
15 | #include <linux/eventfd.h> | ||
16 | #include <linux/interrupt.h> | ||
17 | #include <linux/iommu.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mutex.h> | ||
20 | #include <linux/notifier.h> | ||
21 | #include <linux/pci.h> | ||
22 | #include <linux/pm_runtime.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/uaccess.h> | ||
26 | #include <linux/vfio.h> | ||
27 | |||
28 | #include "vfio_pci_private.h" | ||
29 | |||
30 | #define DRIVER_VERSION "0.2" | ||
31 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
32 | #define DRIVER_DESC "VFIO PCI - User Level meta-driver" | ||
33 | |||
34 | static bool nointxmask; | ||
35 | module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); | ||
36 | MODULE_PARM_DESC(nointxmask, | ||
37 | "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); | ||
38 | |||
39 | static int vfio_pci_enable(struct vfio_pci_device *vdev) | ||
40 | { | ||
41 | struct pci_dev *pdev = vdev->pdev; | ||
42 | int ret; | ||
43 | u16 cmd; | ||
44 | u8 msix_pos; | ||
45 | |||
46 | vdev->reset_works = (pci_reset_function(pdev) == 0); | ||
47 | pci_save_state(pdev); | ||
48 | vdev->pci_saved_state = pci_store_saved_state(pdev); | ||
49 | if (!vdev->pci_saved_state) | ||
50 | pr_debug("%s: Couldn't store %s saved state\n", | ||
51 | __func__, dev_name(&pdev->dev)); | ||
52 | |||
53 | ret = vfio_config_init(vdev); | ||
54 | if (ret) | ||
55 | goto out; | ||
56 | |||
57 | if (likely(!nointxmask)) | ||
58 | vdev->pci_2_3 = pci_intx_mask_supported(pdev); | ||
59 | |||
60 | pci_read_config_word(pdev, PCI_COMMAND, &cmd); | ||
61 | if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { | ||
62 | cmd &= ~PCI_COMMAND_INTX_DISABLE; | ||
63 | pci_write_config_word(pdev, PCI_COMMAND, cmd); | ||
64 | } | ||
65 | |||
66 | msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); | ||
67 | if (msix_pos) { | ||
68 | u16 flags; | ||
69 | u32 table; | ||
70 | |||
71 | pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); | ||
72 | pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); | ||
73 | |||
74 | vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK; | ||
75 | vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; | ||
76 | vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; | ||
77 | } else | ||
78 | vdev->msix_bar = 0xFF; | ||
79 | |||
80 | ret = pci_enable_device(pdev); | ||
81 | if (ret) | ||
82 | goto out; | ||
83 | |||
84 | return ret; | ||
85 | |||
86 | out: | ||
87 | kfree(vdev->pci_saved_state); | ||
88 | vdev->pci_saved_state = NULL; | ||
89 | vfio_config_free(vdev); | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | static void vfio_pci_disable(struct vfio_pci_device *vdev) | ||
94 | { | ||
95 | int bar; | ||
96 | |||
97 | pci_disable_device(vdev->pdev); | ||
98 | |||
99 | vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | | ||
100 | VFIO_IRQ_SET_ACTION_TRIGGER, | ||
101 | vdev->irq_type, 0, 0, NULL); | ||
102 | |||
103 | vdev->virq_disabled = false; | ||
104 | |||
105 | vfio_config_free(vdev); | ||
106 | |||
107 | pci_reset_function(vdev->pdev); | ||
108 | |||
109 | if (pci_load_and_free_saved_state(vdev->pdev, | ||
110 | &vdev->pci_saved_state) == 0) | ||
111 | pci_restore_state(vdev->pdev); | ||
112 | else | ||
113 | pr_info("%s: Couldn't reload %s saved state\n", | ||
114 | __func__, dev_name(&vdev->pdev->dev)); | ||
115 | |||
116 | for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { | ||
117 | if (!vdev->barmap[bar]) | ||
118 | continue; | ||
119 | pci_iounmap(vdev->pdev, vdev->barmap[bar]); | ||
120 | pci_release_selected_regions(vdev->pdev, 1 << bar); | ||
121 | vdev->barmap[bar] = NULL; | ||
122 | } | ||
123 | } | ||
124 | |||
125 | static void vfio_pci_release(void *device_data) | ||
126 | { | ||
127 | struct vfio_pci_device *vdev = device_data; | ||
128 | |||
129 | if (atomic_dec_and_test(&vdev->refcnt)) | ||
130 | vfio_pci_disable(vdev); | ||
131 | |||
132 | module_put(THIS_MODULE); | ||
133 | } | ||
134 | |||
135 | static int vfio_pci_open(void *device_data) | ||
136 | { | ||
137 | struct vfio_pci_device *vdev = device_data; | ||
138 | |||
139 | if (!try_module_get(THIS_MODULE)) | ||
140 | return -ENODEV; | ||
141 | |||
142 | if (atomic_inc_return(&vdev->refcnt) == 1) { | ||
143 | int ret = vfio_pci_enable(vdev); | ||
144 | if (ret) { | ||
145 | module_put(THIS_MODULE); | ||
146 | return ret; | ||
147 | } | ||
148 | } | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) | ||
154 | { | ||
155 | if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { | ||
156 | u8 pin; | ||
157 | pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); | ||
158 | if (pin) | ||
159 | return 1; | ||
160 | |||
161 | } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { | ||
162 | u8 pos; | ||
163 | u16 flags; | ||
164 | |||
165 | pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI); | ||
166 | if (pos) { | ||
167 | pci_read_config_word(vdev->pdev, | ||
168 | pos + PCI_MSI_FLAGS, &flags); | ||
169 | |||
170 | return 1 << (flags & PCI_MSI_FLAGS_QMASK); | ||
171 | } | ||
172 | } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { | ||
173 | u8 pos; | ||
174 | u16 flags; | ||
175 | |||
176 | pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX); | ||
177 | if (pos) { | ||
178 | pci_read_config_word(vdev->pdev, | ||
179 | pos + PCI_MSIX_FLAGS, &flags); | ||
180 | |||
181 | return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; | ||
182 | } | ||
183 | } | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static long vfio_pci_ioctl(void *device_data, | ||
189 | unsigned int cmd, unsigned long arg) | ||
190 | { | ||
191 | struct vfio_pci_device *vdev = device_data; | ||
192 | unsigned long minsz; | ||
193 | |||
194 | if (cmd == VFIO_DEVICE_GET_INFO) { | ||
195 | struct vfio_device_info info; | ||
196 | |||
197 | minsz = offsetofend(struct vfio_device_info, num_irqs); | ||
198 | |||
199 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
200 | return -EFAULT; | ||
201 | |||
202 | if (info.argsz < minsz) | ||
203 | return -EINVAL; | ||
204 | |||
205 | info.flags = VFIO_DEVICE_FLAGS_PCI; | ||
206 | |||
207 | if (vdev->reset_works) | ||
208 | info.flags |= VFIO_DEVICE_FLAGS_RESET; | ||
209 | |||
210 | info.num_regions = VFIO_PCI_NUM_REGIONS; | ||
211 | info.num_irqs = VFIO_PCI_NUM_IRQS; | ||
212 | |||
213 | return copy_to_user((void __user *)arg, &info, minsz); | ||
214 | |||
215 | } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { | ||
216 | struct pci_dev *pdev = vdev->pdev; | ||
217 | struct vfio_region_info info; | ||
218 | |||
219 | minsz = offsetofend(struct vfio_region_info, offset); | ||
220 | |||
221 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
222 | return -EFAULT; | ||
223 | |||
224 | if (info.argsz < minsz) | ||
225 | return -EINVAL; | ||
226 | |||
227 | switch (info.index) { | ||
228 | case VFIO_PCI_CONFIG_REGION_INDEX: | ||
229 | info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); | ||
230 | info.size = pdev->cfg_size; | ||
231 | info.flags = VFIO_REGION_INFO_FLAG_READ | | ||
232 | VFIO_REGION_INFO_FLAG_WRITE; | ||
233 | break; | ||
234 | case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: | ||
235 | info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); | ||
236 | info.size = pci_resource_len(pdev, info.index); | ||
237 | if (!info.size) { | ||
238 | info.flags = 0; | ||
239 | break; | ||
240 | } | ||
241 | |||
242 | info.flags = VFIO_REGION_INFO_FLAG_READ | | ||
243 | VFIO_REGION_INFO_FLAG_WRITE; | ||
244 | if (pci_resource_flags(pdev, info.index) & | ||
245 | IORESOURCE_MEM && info.size >= PAGE_SIZE) | ||
246 | info.flags |= VFIO_REGION_INFO_FLAG_MMAP; | ||
247 | break; | ||
248 | case VFIO_PCI_ROM_REGION_INDEX: | ||
249 | { | ||
250 | void __iomem *io; | ||
251 | size_t size; | ||
252 | |||
253 | info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); | ||
254 | info.flags = 0; | ||
255 | |||
256 | /* Report the BAR size, not the ROM size */ | ||
257 | info.size = pci_resource_len(pdev, info.index); | ||
258 | if (!info.size) | ||
259 | break; | ||
260 | |||
261 | /* Is it really there? */ | ||
262 | io = pci_map_rom(pdev, &size); | ||
263 | if (!io || !size) { | ||
264 | info.size = 0; | ||
265 | break; | ||
266 | } | ||
267 | pci_unmap_rom(pdev, io); | ||
268 | |||
269 | info.flags = VFIO_REGION_INFO_FLAG_READ; | ||
270 | break; | ||
271 | } | ||
272 | default: | ||
273 | return -EINVAL; | ||
274 | } | ||
275 | |||
276 | return copy_to_user((void __user *)arg, &info, minsz); | ||
277 | |||
278 | } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { | ||
279 | struct vfio_irq_info info; | ||
280 | |||
281 | minsz = offsetofend(struct vfio_irq_info, count); | ||
282 | |||
283 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
284 | return -EFAULT; | ||
285 | |||
286 | if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) | ||
287 | return -EINVAL; | ||
288 | |||
289 | info.flags = VFIO_IRQ_INFO_EVENTFD; | ||
290 | |||
291 | info.count = vfio_pci_get_irq_count(vdev, info.index); | ||
292 | |||
293 | if (info.index == VFIO_PCI_INTX_IRQ_INDEX) | ||
294 | info.flags |= (VFIO_IRQ_INFO_MASKABLE | | ||
295 | VFIO_IRQ_INFO_AUTOMASKED); | ||
296 | else | ||
297 | info.flags |= VFIO_IRQ_INFO_NORESIZE; | ||
298 | |||
299 | return copy_to_user((void __user *)arg, &info, minsz); | ||
300 | |||
301 | } else if (cmd == VFIO_DEVICE_SET_IRQS) { | ||
302 | struct vfio_irq_set hdr; | ||
303 | u8 *data = NULL; | ||
304 | int ret = 0; | ||
305 | |||
306 | minsz = offsetofend(struct vfio_irq_set, count); | ||
307 | |||
308 | if (copy_from_user(&hdr, (void __user *)arg, minsz)) | ||
309 | return -EFAULT; | ||
310 | |||
311 | if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || | ||
312 | hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | | ||
313 | VFIO_IRQ_SET_ACTION_TYPE_MASK)) | ||
314 | return -EINVAL; | ||
315 | |||
316 | if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
317 | size_t size; | ||
318 | |||
319 | if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) | ||
320 | size = sizeof(uint8_t); | ||
321 | else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) | ||
322 | size = sizeof(int32_t); | ||
323 | else | ||
324 | return -EINVAL; | ||
325 | |||
326 | if (hdr.argsz - minsz < hdr.count * size || | ||
327 | hdr.count > vfio_pci_get_irq_count(vdev, hdr.index)) | ||
328 | return -EINVAL; | ||
329 | |||
330 | data = kmalloc(hdr.count * size, GFP_KERNEL); | ||
331 | if (!data) | ||
332 | return -ENOMEM; | ||
333 | |||
334 | if (copy_from_user(data, (void __user *)(arg + minsz), | ||
335 | hdr.count * size)) { | ||
336 | kfree(data); | ||
337 | return -EFAULT; | ||
338 | } | ||
339 | } | ||
340 | |||
341 | mutex_lock(&vdev->igate); | ||
342 | |||
343 | ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, | ||
344 | hdr.start, hdr.count, data); | ||
345 | |||
346 | mutex_unlock(&vdev->igate); | ||
347 | kfree(data); | ||
348 | |||
349 | return ret; | ||
350 | |||
351 | } else if (cmd == VFIO_DEVICE_RESET) | ||
352 | return vdev->reset_works ? | ||
353 | pci_reset_function(vdev->pdev) : -EINVAL; | ||
354 | |||
355 | return -ENOTTY; | ||
356 | } | ||
357 | |||
358 | static ssize_t vfio_pci_read(void *device_data, char __user *buf, | ||
359 | size_t count, loff_t *ppos) | ||
360 | { | ||
361 | unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
362 | struct vfio_pci_device *vdev = device_data; | ||
363 | struct pci_dev *pdev = vdev->pdev; | ||
364 | |||
365 | if (index >= VFIO_PCI_NUM_REGIONS) | ||
366 | return -EINVAL; | ||
367 | |||
368 | if (index == VFIO_PCI_CONFIG_REGION_INDEX) | ||
369 | return vfio_pci_config_readwrite(vdev, buf, count, ppos, false); | ||
370 | else if (index == VFIO_PCI_ROM_REGION_INDEX) | ||
371 | return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false); | ||
372 | else if (pci_resource_flags(pdev, index) & IORESOURCE_IO) | ||
373 | return vfio_pci_io_readwrite(vdev, buf, count, ppos, false); | ||
374 | else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) | ||
375 | return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false); | ||
376 | |||
377 | return -EINVAL; | ||
378 | } | ||
379 | |||
380 | static ssize_t vfio_pci_write(void *device_data, const char __user *buf, | ||
381 | size_t count, loff_t *ppos) | ||
382 | { | ||
383 | unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
384 | struct vfio_pci_device *vdev = device_data; | ||
385 | struct pci_dev *pdev = vdev->pdev; | ||
386 | |||
387 | if (index >= VFIO_PCI_NUM_REGIONS) | ||
388 | return -EINVAL; | ||
389 | |||
390 | if (index == VFIO_PCI_CONFIG_REGION_INDEX) | ||
391 | return vfio_pci_config_readwrite(vdev, (char __user *)buf, | ||
392 | count, ppos, true); | ||
393 | else if (index == VFIO_PCI_ROM_REGION_INDEX) | ||
394 | return -EINVAL; | ||
395 | else if (pci_resource_flags(pdev, index) & IORESOURCE_IO) | ||
396 | return vfio_pci_io_readwrite(vdev, (char __user *)buf, | ||
397 | count, ppos, true); | ||
398 | else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) { | ||
399 | return vfio_pci_mem_readwrite(vdev, (char __user *)buf, | ||
400 | count, ppos, true); | ||
401 | } | ||
402 | |||
403 | return -EINVAL; | ||
404 | } | ||
405 | |||
406 | static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) | ||
407 | { | ||
408 | struct vfio_pci_device *vdev = device_data; | ||
409 | struct pci_dev *pdev = vdev->pdev; | ||
410 | unsigned int index; | ||
411 | u64 phys_len, req_len, pgoff, req_start, phys; | ||
412 | int ret; | ||
413 | |||
414 | index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); | ||
415 | |||
416 | if (vma->vm_end < vma->vm_start) | ||
417 | return -EINVAL; | ||
418 | if ((vma->vm_flags & VM_SHARED) == 0) | ||
419 | return -EINVAL; | ||
420 | if (index >= VFIO_PCI_ROM_REGION_INDEX) | ||
421 | return -EINVAL; | ||
422 | if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) | ||
423 | return -EINVAL; | ||
424 | |||
425 | phys_len = pci_resource_len(pdev, index); | ||
426 | req_len = vma->vm_end - vma->vm_start; | ||
427 | pgoff = vma->vm_pgoff & | ||
428 | ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); | ||
429 | req_start = pgoff << PAGE_SHIFT; | ||
430 | |||
431 | if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) | ||
432 | return -EINVAL; | ||
433 | |||
434 | if (index == vdev->msix_bar) { | ||
435 | /* | ||
436 | * Disallow mmaps overlapping the MSI-X table; users don't | ||
437 | * get to touch this directly. We could find somewhere | ||
438 | * else to map the overlap, but page granularity is only | ||
439 | * a recommendation, not a requirement, so the user needs | ||
440 | * to know which bits are real. Requiring them to mmap | ||
441 | * around the table makes that clear. | ||
442 | */ | ||
443 | |||
444 | /* If neither entirely above nor below, then it overlaps */ | ||
445 | if (!(req_start >= vdev->msix_offset + vdev->msix_size || | ||
446 | req_start + req_len <= vdev->msix_offset)) | ||
447 | return -EINVAL; | ||
448 | } | ||
449 | |||
450 | /* | ||
451 | * Even though we don't make use of the barmap for the mmap, | ||
452 | * we need to request the region and the barmap tracks that. | ||
453 | */ | ||
454 | if (!vdev->barmap[index]) { | ||
455 | ret = pci_request_selected_regions(pdev, | ||
456 | 1 << index, "vfio-pci"); | ||
457 | if (ret) | ||
458 | return ret; | ||
459 | |||
460 | vdev->barmap[index] = pci_iomap(pdev, index, 0); | ||
461 | } | ||
462 | |||
463 | vma->vm_private_data = vdev; | ||
464 | vma->vm_flags |= (VM_IO | VM_RESERVED); | ||
465 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
466 | |||
467 | phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; | ||
468 | |||
469 | return remap_pfn_range(vma, vma->vm_start, phys, | ||
470 | req_len, vma->vm_page_prot); | ||
471 | } | ||
472 | |||
473 | static const struct vfio_device_ops vfio_pci_ops = { | ||
474 | .name = "vfio-pci", | ||
475 | .open = vfio_pci_open, | ||
476 | .release = vfio_pci_release, | ||
477 | .ioctl = vfio_pci_ioctl, | ||
478 | .read = vfio_pci_read, | ||
479 | .write = vfio_pci_write, | ||
480 | .mmap = vfio_pci_mmap, | ||
481 | }; | ||
482 | |||
483 | static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) | ||
484 | { | ||
485 | u8 type; | ||
486 | struct vfio_pci_device *vdev; | ||
487 | struct iommu_group *group; | ||
488 | int ret; | ||
489 | |||
490 | pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type); | ||
491 | if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) | ||
492 | return -EINVAL; | ||
493 | |||
494 | group = iommu_group_get(&pdev->dev); | ||
495 | if (!group) | ||
496 | return -EINVAL; | ||
497 | |||
498 | vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); | ||
499 | if (!vdev) { | ||
500 | iommu_group_put(group); | ||
501 | return -ENOMEM; | ||
502 | } | ||
503 | |||
504 | vdev->pdev = pdev; | ||
505 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
506 | mutex_init(&vdev->igate); | ||
507 | spin_lock_init(&vdev->irqlock); | ||
508 | atomic_set(&vdev->refcnt, 0); | ||
509 | |||
510 | ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); | ||
511 | if (ret) { | ||
512 | iommu_group_put(group); | ||
513 | kfree(vdev); | ||
514 | } | ||
515 | |||
516 | return ret; | ||
517 | } | ||
518 | |||
519 | static void vfio_pci_remove(struct pci_dev *pdev) | ||
520 | { | ||
521 | struct vfio_pci_device *vdev; | ||
522 | |||
523 | vdev = vfio_del_group_dev(&pdev->dev); | ||
524 | if (!vdev) | ||
525 | return; | ||
526 | |||
527 | iommu_group_put(pdev->dev.iommu_group); | ||
528 | kfree(vdev); | ||
529 | } | ||
530 | |||
531 | static struct pci_driver vfio_pci_driver = { | ||
532 | .name = "vfio-pci", | ||
533 | .id_table = NULL, /* only dynamic ids */ | ||
534 | .probe = vfio_pci_probe, | ||
535 | .remove = vfio_pci_remove, | ||
536 | }; | ||
537 | |||
538 | static void __exit vfio_pci_cleanup(void) | ||
539 | { | ||
540 | pci_unregister_driver(&vfio_pci_driver); | ||
541 | vfio_pci_virqfd_exit(); | ||
542 | vfio_pci_uninit_perm_bits(); | ||
543 | } | ||
544 | |||
545 | static int __init vfio_pci_init(void) | ||
546 | { | ||
547 | int ret; | ||
548 | |||
549 | /* Allocate shared config space permission data used by all devices */
550 | ret = vfio_pci_init_perm_bits(); | ||
551 | if (ret) | ||
552 | return ret; | ||
553 | |||
554 | /* Start the virqfd cleanup handler */ | ||
555 | ret = vfio_pci_virqfd_init(); | ||
556 | if (ret) | ||
557 | goto out_virqfd; | ||
558 | |||
559 | /* Register and scan for devices */ | ||
560 | ret = pci_register_driver(&vfio_pci_driver); | ||
561 | if (ret) | ||
562 | goto out_driver; | ||
563 | |||
564 | return 0; | ||
565 | |||
566 | out_virqfd: | ||
567 | vfio_pci_virqfd_exit(); | ||
568 | out_driver: | ||
569 | vfio_pci_uninit_perm_bits(); | ||
570 | return ret; | ||
571 | } | ||
572 | |||
573 | module_init(vfio_pci_init); | ||
574 | module_exit(vfio_pci_cleanup); | ||
575 | |||
576 | MODULE_VERSION(DRIVER_VERSION); | ||
577 | MODULE_LICENSE("GPL v2"); | ||
578 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
579 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 000000000000..8b8f7d11e102
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1540 @@
1 | /* | ||
2 | * VFIO PCI config space virtualization | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | * This code handles reading and writing of PCI configuration registers. | ||
18 | * This is hairy because we want to allow a lot of flexibility to the | ||
19 | * user driver, but cannot trust it with all of the config fields. | ||
20 | * Tables determine which fields can be read and written, as well as | ||
21 | * which fields are 'virtualized' - special actions and translations to | ||
22 | * make it appear to the user that he has control, when in fact things | ||
23 | * must be negotiated with the underlying OS. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/pci.h> | ||
28 | #include <linux/uaccess.h> | ||
29 | #include <linux/vfio.h> | ||
30 | |||
31 | #include "vfio_pci_private.h" | ||
32 | |||
33 | #define PCI_CFG_SPACE_SIZE 256 | ||
34 | |||
35 | /* Useful "pseudo" capabilities */ | ||
36 | #define PCI_CAP_ID_BASIC 0 | ||
37 | #define PCI_CAP_ID_INVALID 0xFF | ||
38 | |||
39 | #define is_bar(offset) \ | ||
40 | ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \ | ||
41 | (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4)) | ||
42 | |||
43 | /* | ||
44 | * Lengths of PCI Config Capabilities | ||
45 | * 0: Removed from the user visible capability list | ||
46 | * FF: Variable length | ||
47 | */ | ||
48 | static u8 pci_cap_length[] = { | ||
49 | [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */ | ||
50 | [PCI_CAP_ID_PM] = PCI_PM_SIZEOF, | ||
51 | [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF, | ||
52 | [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF, | ||
53 | [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */ | ||
54 | [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */ | ||
55 | [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */ | ||
56 | [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */ | ||
57 | [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */ | ||
58 | [PCI_CAP_ID_VNDR] = 0xFF, /* variable */ | ||
59 | [PCI_CAP_ID_DBG] = 0, /* debug - don't care */ | ||
60 | [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */ | ||
61 | [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */ | ||
62 | [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */ | ||
63 | [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */ | ||
64 | [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */ | ||
65 | [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */ | ||
66 | [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF, | ||
67 | [PCI_CAP_ID_SATA] = 0xFF, | ||
68 | [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF, | ||
69 | }; | ||
70 | |||
71 | /* | ||
72 | * Lengths of PCIe/PCI-X Extended Config Capabilities | ||
73 | * 0: Removed or masked from the user visible capability list
74 | * FF: Variable length | ||
75 | */ | ||
76 | static u16 pci_ext_cap_length[] = { | ||
77 | [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND, | ||
78 | [PCI_EXT_CAP_ID_VC] = 0xFF, | ||
79 | [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF, | ||
80 | [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF, | ||
81 | [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */ | ||
82 | [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */ | ||
83 | [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */ | ||
84 | [PCI_EXT_CAP_ID_MFVC] = 0xFF, | ||
85 | [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */ | ||
86 | [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */ | ||
87 | [PCI_EXT_CAP_ID_VNDR] = 0xFF, | ||
88 | [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */ | ||
89 | [PCI_EXT_CAP_ID_ACS] = 0xFF, | ||
90 | [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF, | ||
91 | [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF, | ||
92 | [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF, | ||
93 | [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */ | ||
94 | [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF, | ||
95 | [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF, | ||
96 | [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */ | ||
97 | [PCI_EXT_CAP_ID_REBAR] = 0xFF, | ||
98 | [PCI_EXT_CAP_ID_DPA] = 0xFF, | ||
99 | [PCI_EXT_CAP_ID_TPH] = 0xFF, | ||
100 | [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, | ||
101 | [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ | ||
102 | [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ | ||
103 | [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ | ||
104 | }; | ||
105 | |||
106 | /* | ||
107 | * Read/Write Permission Bits - one bit for each bit in capability | ||
108 | * Any field can be read if it exists, but what is read depends on | ||
109 | * whether the field is 'virtualized', or just pass thru to the | ||
110 | * hardware. Any virtualized field is also virtualized for writes. | ||
111 | * Writes are only permitted if they have a 1 bit here. | ||
112 | */ | ||
113 | struct perm_bits { | ||
114 | u8 *virt; /* read/write virtual data, not hw */ | ||
115 | u8 *write; /* writeable bits */ | ||
116 | int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, | ||
117 | struct perm_bits *perm, int offset, __le32 *val); | ||
118 | int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, | ||
119 | struct perm_bits *perm, int offset, __le32 val); | ||
120 | }; | ||
121 | |||
122 | #define NO_VIRT 0 | ||
123 | #define ALL_VIRT 0xFFFFFFFFU | ||
124 | #define NO_WRITE 0 | ||
125 | #define ALL_WRITE 0xFFFFFFFFU | ||
126 | |||
127 | static int vfio_user_config_read(struct pci_dev *pdev, int offset, | ||
128 | __le32 *val, int count) | ||
129 | { | ||
130 | int ret = -EINVAL; | ||
131 | u32 tmp_val = 0; | ||
132 | |||
133 | switch (count) { | ||
134 | case 1: | ||
135 | { | ||
136 | u8 tmp; | ||
137 | ret = pci_user_read_config_byte(pdev, offset, &tmp); | ||
138 | tmp_val = tmp; | ||
139 | break; | ||
140 | } | ||
141 | case 2: | ||
142 | { | ||
143 | u16 tmp; | ||
144 | ret = pci_user_read_config_word(pdev, offset, &tmp); | ||
145 | tmp_val = tmp; | ||
146 | break; | ||
147 | } | ||
148 | case 4: | ||
149 | ret = pci_user_read_config_dword(pdev, offset, &tmp_val); | ||
150 | break; | ||
151 | } | ||
152 | |||
153 | *val = cpu_to_le32(tmp_val); | ||
154 | |||
155 | return pcibios_err_to_errno(ret); | ||
156 | } | ||
157 | |||
158 | static int vfio_user_config_write(struct pci_dev *pdev, int offset, | ||
159 | __le32 val, int count) | ||
160 | { | ||
161 | int ret = -EINVAL; | ||
162 | u32 tmp_val = le32_to_cpu(val); | ||
163 | |||
164 | switch (count) { | ||
165 | case 1: | ||
166 | ret = pci_user_write_config_byte(pdev, offset, tmp_val); | ||
167 | break; | ||
168 | case 2: | ||
169 | ret = pci_user_write_config_word(pdev, offset, tmp_val); | ||
170 | break; | ||
171 | case 4: | ||
172 | ret = pci_user_write_config_dword(pdev, offset, tmp_val); | ||
173 | break; | ||
174 | } | ||
175 | |||
176 | return pcibios_err_to_errno(ret); | ||
177 | } | ||
178 | |||
179 | static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, | ||
180 | int count, struct perm_bits *perm, | ||
181 | int offset, __le32 *val) | ||
182 | { | ||
183 | __le32 virt = 0; | ||
184 | |||
185 | memcpy(val, vdev->vconfig + pos, count); | ||
186 | |||
187 | memcpy(&virt, perm->virt + offset, count); | ||
188 | |||
189 | /* Any non-virtualized bits? */ | ||
190 | if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) { | ||
191 | struct pci_dev *pdev = vdev->pdev; | ||
192 | __le32 phys_val = 0; | ||
193 | int ret; | ||
194 | |||
195 | ret = vfio_user_config_read(pdev, pos, &phys_val, count); | ||
196 | if (ret) | ||
197 | return ret; | ||
198 | |||
199 | *val = (phys_val & ~virt) | (*val & virt); | ||
200 | } | ||
201 | |||
202 | return count; | ||
203 | } | ||
204 | |||
205 | static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, | ||
206 | int count, struct perm_bits *perm, | ||
207 | int offset, __le32 val) | ||
208 | { | ||
209 | __le32 virt = 0, write = 0; | ||
210 | |||
211 | memcpy(&write, perm->write + offset, count); | ||
212 | |||
213 | if (!write) | ||
214 | return count; /* drop, no writable bits */ | ||
215 | |||
216 | memcpy(&virt, perm->virt + offset, count); | ||
217 | |||
218 | /* Virtualized and writable bits go to vconfig */ | ||
219 | if (write & virt) { | ||
220 | __le32 virt_val = 0; | ||
221 | |||
222 | memcpy(&virt_val, vdev->vconfig + pos, count); | ||
223 | |||
224 | virt_val &= ~(write & virt); | ||
225 | virt_val |= (val & (write & virt)); | ||
226 | |||
227 | memcpy(vdev->vconfig + pos, &virt_val, count); | ||
228 | } | ||
229 | |||
230 | /* Non-virtualized and writable bits go to hardware */
231 | if (write & ~virt) { | ||
232 | struct pci_dev *pdev = vdev->pdev; | ||
233 | __le32 phys_val = 0; | ||
234 | int ret; | ||
235 | |||
236 | ret = vfio_user_config_read(pdev, pos, &phys_val, count); | ||
237 | if (ret) | ||
238 | return ret; | ||
239 | |||
240 | phys_val &= ~(write & ~virt); | ||
241 | phys_val |= (val & (write & ~virt)); | ||
242 | |||
243 | ret = vfio_user_config_write(pdev, pos, phys_val, count); | ||
244 | if (ret) | ||
245 | return ret; | ||
246 | } | ||
247 | |||
248 | return count; | ||
249 | } | ||
250 | |||
251 | /* Allow direct read from hardware, except for capability next pointer */ | ||
252 | static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, | ||
253 | int count, struct perm_bits *perm, | ||
254 | int offset, __le32 *val) | ||
255 | { | ||
256 | int ret; | ||
257 | |||
258 | ret = vfio_user_config_read(vdev->pdev, pos, val, count); | ||
259 | if (ret) | ||
260 | return pcibios_err_to_errno(ret); | ||
261 | |||
262 | if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ | ||
263 | if (offset < 4) | ||
264 | memcpy(val, vdev->vconfig + pos, count); | ||
265 | } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */ | ||
266 | if (offset == PCI_CAP_LIST_ID && count > 1) | ||
267 | memcpy(val, vdev->vconfig + pos, | ||
268 | min(PCI_CAP_FLAGS, count)); | ||
269 | else if (offset == PCI_CAP_LIST_NEXT) | ||
270 | memcpy(val, vdev->vconfig + pos, 1); | ||
271 | } | ||
272 | |||
273 | return count; | ||
274 | } | ||
275 | |||
276 | static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, | ||
277 | int count, struct perm_bits *perm, | ||
278 | int offset, __le32 val) | ||
279 | { | ||
280 | int ret; | ||
281 | |||
282 | ret = vfio_user_config_write(vdev->pdev, pos, val, count); | ||
283 | if (ret) | ||
284 | return ret; | ||
285 | |||
286 | return count; | ||
287 | } | ||
288 | |||
289 | /* Default all regions to read-only, no-virtualization */ | ||
290 | static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { | ||
291 | [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } | ||
292 | }; | ||
293 | static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { | ||
294 | [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } | ||
295 | }; | ||
296 | |||
297 | static void free_perm_bits(struct perm_bits *perm) | ||
298 | { | ||
299 | kfree(perm->virt); | ||
300 | kfree(perm->write); | ||
301 | perm->virt = NULL; | ||
302 | perm->write = NULL; | ||
303 | } | ||
304 | |||
305 | static int alloc_perm_bits(struct perm_bits *perm, int size) | ||
306 | { | ||
307 | /* | ||
308 | * Round up all permission bits to the next dword, this lets us | ||
309 | * ignore whether a read/write exceeds the defined capability | ||
310 | * structure. We can do this because: | ||
311 | * - Standard config space is already dword aligned | ||
312 | * - Capabilities are all dword aligned (bits 0:1 of next reserved)
313 | * - Express capabilities defined as dword aligned | ||
314 | */ | ||
315 | size = round_up(size, 4); | ||
316 | |||
317 | /* | ||
318 | * Zero state is | ||
319 | * - All Readable, None Writeable, None Virtualized | ||
320 | */ | ||
321 | perm->virt = kzalloc(size, GFP_KERNEL); | ||
322 | perm->write = kzalloc(size, GFP_KERNEL); | ||
323 | if (!perm->virt || !perm->write) { | ||
324 | free_perm_bits(perm); | ||
325 | return -ENOMEM; | ||
326 | } | ||
327 | |||
328 | perm->readfn = vfio_default_config_read; | ||
329 | perm->writefn = vfio_default_config_write; | ||
330 | |||
331 | return 0; | ||
332 | } | ||
333 | |||
334 | /* | ||
335 | * Helper functions for filling in permission tables | ||
336 | */ | ||
337 | static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write) | ||
338 | { | ||
339 | p->virt[off] = virt; | ||
340 | p->write[off] = write; | ||
341 | } | ||
342 | |||
343 | /* Handle endian-ness - pci and tables are little-endian */ | ||
344 | static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write) | ||
345 | { | ||
346 | *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt); | ||
347 | *(__le16 *)(&p->write[off]) = cpu_to_le16(write); | ||
348 | } | ||
349 | |||
350 | /* Handle endian-ness - pci and tables are little-endian */ | ||
351 | static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) | ||
352 | { | ||
353 | *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt); | ||
354 | *(__le32 *)(&p->write[off]) = cpu_to_le32(write); | ||
355 | } | ||
356 | |||
357 | /* | ||
358 | * Restore the *real* BARs after we detect a FLR or backdoor reset. | ||
359 | * (backdoor = some device specific technique that we didn't catch) | ||
360 | */ | ||
361 | static void vfio_bar_restore(struct vfio_pci_device *vdev) | ||
362 | { | ||
363 | struct pci_dev *pdev = vdev->pdev; | ||
364 | u32 *rbar = vdev->rbar; | ||
365 | int i; | ||
366 | |||
367 | if (pdev->is_virtfn) | ||
368 | return; | ||
369 | |||
370 | pr_info("%s: %s reset recovery - restoring bars\n", | ||
371 | __func__, dev_name(&pdev->dev)); | ||
372 | |||
373 | for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++) | ||
374 | pci_user_write_config_dword(pdev, i, *rbar); | ||
375 | |||
376 | pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar); | ||
377 | } | ||
378 | |||
379 | static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) | ||
380 | { | ||
381 | unsigned long flags = pci_resource_flags(pdev, bar); | ||
382 | u32 val; | ||
383 | |||
384 | if (flags & IORESOURCE_IO) | ||
385 | return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO); | ||
386 | |||
387 | val = PCI_BASE_ADDRESS_SPACE_MEMORY; | ||
388 | |||
389 | if (flags & IORESOURCE_PREFETCH) | ||
390 | val |= PCI_BASE_ADDRESS_MEM_PREFETCH; | ||
391 | |||
392 | if (flags & IORESOURCE_MEM_64) | ||
393 | val |= PCI_BASE_ADDRESS_MEM_TYPE_64; | ||
394 | |||
395 | return cpu_to_le32(val); | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Pretend we're hardware and tweak the values of the *virtual* PCI BARs | ||
400 | * to reflect the hardware capabilities. This implements BAR sizing. | ||
401 | */ | ||
402 | static void vfio_bar_fixup(struct vfio_pci_device *vdev) | ||
403 | { | ||
404 | struct pci_dev *pdev = vdev->pdev; | ||
405 | int i; | ||
406 | __le32 *bar; | ||
407 | u64 mask; | ||
408 | |||
409 | bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; | ||
410 | |||
411 | for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) { | ||
412 | if (!pci_resource_start(pdev, i)) { | ||
413 | *bar = 0; /* Unmapped by host = unimplemented to user */ | ||
414 | continue; | ||
415 | } | ||
416 | |||
417 | mask = ~(pci_resource_len(pdev, i) - 1); | ||
418 | |||
419 | *bar &= cpu_to_le32((u32)mask); | ||
420 | *bar |= vfio_generate_bar_flags(pdev, i); | ||
421 | |||
422 | if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { | ||
423 | bar++; | ||
424 | *bar &= cpu_to_le32((u32)(mask >> 32)); | ||
425 | i++; | ||
426 | } | ||
427 | } | ||
428 | |||
429 | bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; | ||
430 | |||
431 | /* | ||
432 | * NB. we expose the actual BAR size here, regardless of whether | ||
433 | * we can read it. When we report the REGION_INFO for the ROM | ||
434 | * we report what PCI tells us is the actual ROM size. | ||
435 | */ | ||
436 | if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { | ||
437 | mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); | ||
438 | mask |= PCI_ROM_ADDRESS_ENABLE; | ||
439 | *bar &= cpu_to_le32((u32)mask); | ||
440 | } else | ||
441 | *bar = 0; | ||
442 | |||
443 | vdev->bardirty = false; | ||
444 | } | ||
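For illustration only — a minimal sketch (not part of this patch) of the BAR sizing cycle that vfio_bar_fixup() supports, as a userspace driver might issue it. The config_fd descriptor and the bare offsets are placeholders; a real caller adds the VFIO config region offset:

        __u32 orig, probe, size;

        pread(config_fd, &orig, 4, PCI_BASE_ADDRESS_0);      /* save current value */
        probe = 0xffffffff;
        pwrite(config_fd, &probe, 4, PCI_BASE_ADDRESS_0);    /* write all 1s */
        pread(config_fd, &probe, 4, PCI_BASE_ADDRESS_0);     /* read back size mask */
        size = ~(probe & PCI_BASE_ADDRESS_MEM_MASK) + 1;     /* decode a 32-bit mem BAR */
        pwrite(config_fd, &orig, 4, PCI_BASE_ADDRESS_0);     /* restore */

Because the write lands in vconfig and the read-back is masked with ~(pci_resource_len() - 1), the guest sees correct sizing while the physical BAR is never touched.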
445 | |||
446 | static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, | ||
447 | int count, struct perm_bits *perm, | ||
448 | int offset, __le32 *val) | ||
449 | { | ||
450 | if (is_bar(offset)) /* pos == offset for basic config */ | ||
451 | vfio_bar_fixup(vdev); | ||
452 | |||
453 | count = vfio_default_config_read(vdev, pos, count, perm, offset, val); | ||
454 | |||
455 | /* Mask in virtual memory enable for SR-IOV devices */ | ||
456 | if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) { | ||
457 | u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); | ||
458 | u32 tmp_val = le32_to_cpu(*val); | ||
459 | |||
460 | tmp_val |= cmd & PCI_COMMAND_MEMORY; | ||
461 | *val = cpu_to_le32(tmp_val); | ||
462 | } | ||
463 | |||
464 | return count; | ||
465 | } | ||
466 | |||
467 | static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, | ||
468 | int count, struct perm_bits *perm, | ||
469 | int offset, __le32 val) | ||
470 | { | ||
471 | struct pci_dev *pdev = vdev->pdev; | ||
472 | __le16 *virt_cmd; | ||
473 | u16 new_cmd = 0; | ||
474 | int ret; | ||
475 | |||
476 | virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND]; | ||
477 | |||
478 | if (offset == PCI_COMMAND) { | ||
479 | bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io; | ||
480 | u16 phys_cmd; | ||
481 | |||
482 | ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd); | ||
483 | if (ret) | ||
484 | return ret; | ||
485 | |||
486 | new_cmd = le32_to_cpu(val); | ||
487 | |||
488 | phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY); | ||
489 | virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); | ||
490 | new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); | ||
491 | |||
492 | phys_io = !!(phys_cmd & PCI_COMMAND_IO); | ||
493 | virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO); | ||
494 | new_io = !!(new_cmd & PCI_COMMAND_IO); | ||
495 | |||
496 | /* | ||
497 | * If the user is writing mem/io enable (new_mem/io) and we | ||
498 | * think it's already enabled (virt_mem/io), but the hardware | ||
499 | * shows it disabled (phys_mem/io), then the device has | ||
500 | * undergone some kind of backdoor reset and needs to be | ||
501 | * restored before we allow it to enable the bars. | ||
502 | * SR-IOV devices will trigger this, but we catch them later | ||
503 | */ | ||
504 | if ((new_mem && virt_mem && !phys_mem) || | ||
505 | (new_io && virt_io && !phys_io)) | ||
506 | vfio_bar_restore(vdev); | ||
507 | } | ||
508 | |||
509 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | ||
510 | if (count < 0) | ||
511 | return count; | ||
512 | |||
513 | /* | ||
514 | * Save current memory/io enable bits in vconfig to allow for | ||
515 | * the test above next time. | ||
516 | */ | ||
517 | if (offset == PCI_COMMAND) { | ||
518 | u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO; | ||
519 | |||
520 | *virt_cmd &= cpu_to_le16(~mask); | ||
521 | *virt_cmd |= cpu_to_le16(new_cmd & mask); | ||
522 | } | ||
523 | |||
524 | /* Emulate INTx disable */ | ||
525 | if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) { | ||
526 | bool virt_intx_disable; | ||
527 | |||
528 | virt_intx_disable = !!(le16_to_cpu(*virt_cmd) & | ||
529 | PCI_COMMAND_INTX_DISABLE); | ||
530 | |||
531 | if (virt_intx_disable && !vdev->virq_disabled) { | ||
532 | vdev->virq_disabled = true; | ||
533 | vfio_pci_intx_mask(vdev); | ||
534 | } else if (!virt_intx_disable && vdev->virq_disabled) { | ||
535 | vdev->virq_disabled = false; | ||
536 | vfio_pci_intx_unmask(vdev); | ||
537 | } | ||
538 | } | ||
539 | |||
540 | if (is_bar(offset)) | ||
541 | vdev->bardirty = true; | ||
542 | |||
543 | return count; | ||
544 | } | ||
545 | |||
546 | /* Permissions for the Basic PCI Header */ | ||
547 | static int __init init_pci_cap_basic_perm(struct perm_bits *perm) | ||
548 | { | ||
549 | if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF)) | ||
550 | return -ENOMEM; | ||
551 | |||
552 | perm->readfn = vfio_basic_config_read; | ||
553 | perm->writefn = vfio_basic_config_write; | ||
554 | |||
555 | /* Virtualized for SR-IOV functions, which just have FFFF */ | ||
556 | p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE); | ||
557 | p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE); | ||
558 | |||
559 | /* | ||
560 | * Virtualize INTx disable, we use it internally for interrupt | ||
561 | * control and can emulate it for non-PCI 2.3 devices. | ||
562 | */ | ||
563 | p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE); | ||
564 | |||
565 | /* Virtualize capability list, we might want to skip/disable */ | ||
566 | p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE); | ||
567 | |||
568 | /* No harm to write */ | ||
569 | p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE); | ||
570 | p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE); | ||
571 | p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE); | ||
572 | |||
573 | /* Virtualize all bars, can't touch the real ones */ | ||
574 | p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE); | ||
575 | p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE); | ||
576 | p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE); | ||
577 | p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE); | ||
578 | p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE); | ||
579 | p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE); | ||
580 | p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE); | ||
581 | |||
582 | /* Allow us to adjust capability chain */ | ||
583 | p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE); | ||
584 | |||
585 | /* Sometimes used by sw, just virtualize */ | ||
586 | p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE); | ||
587 | return 0; | ||
588 | } | ||
589 | |||
590 | /* Permissions for the Power Management capability */ | ||
591 | static int __init init_pci_cap_pm_perm(struct perm_bits *perm) | ||
592 | { | ||
593 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM])) | ||
594 | return -ENOMEM; | ||
595 | |||
596 | /* | ||
597 | * We always virtualize the next field so we can remove | ||
598 | * capabilities from the chain if we want to. | ||
599 | */ | ||
600 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
601 | |||
602 | /* | ||
603 | * Power management is defined *per function*, | ||
604 | * so we let the user write this | ||
605 | */ | ||
606 | p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE); | ||
607 | return 0; | ||
608 | } | ||
609 | |||
610 | /* Permissions for PCI-X capability */ | ||
611 | static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) | ||
612 | { | ||
613 | /* Alloc 24, but only 8 are used in v0 */ | ||
614 | if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2)) | ||
615 | return -ENOMEM; | ||
616 | |||
617 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
618 | |||
619 | p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE); | ||
620 | p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE); | ||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | /* Permissions for PCI Express capability */ | ||
625 | static int __init init_pci_cap_exp_perm(struct perm_bits *perm) | ||
626 | { | ||
627 | /* Alloc larger of two possible sizes */ | ||
628 | if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2)) | ||
629 | return -ENOMEM; | ||
630 | |||
631 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
632 | |||
633 | /* | ||
634 | * Allow writes to device control fields (includes FLR!) | ||
635 | * but not to devctl_phantom which could confuse IOMMU | ||
636 | * or to the ARI bit in devctl2 which is set at probe time | ||
637 | */ | ||
638 | p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM); | ||
639 | p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); | ||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | /* Permissions for Advanced Function capability */ | ||
644 | static int __init init_pci_cap_af_perm(struct perm_bits *perm) | ||
645 | { | ||
646 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF])) | ||
647 | return -ENOMEM; | ||
648 | |||
649 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
650 | p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR); | ||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | /* Permissions for Advanced Error Reporting extended capability */ | ||
655 | static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm) | ||
656 | { | ||
657 | u32 mask; | ||
658 | |||
659 | if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR])) | ||
660 | return -ENOMEM; | ||
661 | |||
662 | /* | ||
663 | * Virtualize the first dword of all express capabilities | ||
664 | * because it includes the next pointer. This lets us later | ||
665 | * remove capabilities from the chain if we need to. | ||
666 | */ | ||
667 | p_setd(perm, 0, ALL_VIRT, NO_WRITE); | ||
668 | |||
669 | /* Writable bits mask */ | ||
670 | mask = PCI_ERR_UNC_TRAIN | /* Training */ | ||
671 | PCI_ERR_UNC_DLP | /* Data Link Protocol */ | ||
672 | PCI_ERR_UNC_SURPDN | /* Surprise Down */ | ||
673 | PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */ | ||
674 | PCI_ERR_UNC_FCP | /* Flow Control Protocol */ | ||
675 | PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */ | ||
676 | PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */ | ||
677 | PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */ | ||
678 | PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */ | ||
679 | PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */ | ||
680 | PCI_ERR_UNC_ECRC | /* ECRC Error Status */ | ||
681 | PCI_ERR_UNC_UNSUP | /* Unsupported Request */ | ||
682 | PCI_ERR_UNC_ACSV | /* ACS Violation */ | ||
683 | PCI_ERR_UNC_INTN | /* internal error */ | ||
684 | PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */ | ||
685 | PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */ | ||
686 | PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */ | ||
687 | p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask); | ||
688 | p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask); | ||
689 | p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask); | ||
690 | |||
691 | mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */ | ||
692 | PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */ | ||
693 | PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */ | ||
694 | PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */ | ||
695 | PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */ | ||
696 | PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */ | ||
697 | PCI_ERR_COR_INTERNAL | /* Corrected Internal */ | ||
698 | PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */ | ||
699 | p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask); | ||
700 | p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask); | ||
701 | |||
702 | mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */ | ||
703 | PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */ | ||
704 | p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask); | ||
705 | return 0; | ||
706 | } | ||
707 | |||
708 | /* Permissions for Power Budgeting extended capability */ | ||
709 | static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) | ||
710 | { | ||
711 | if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR])) | ||
712 | return -ENOMEM; | ||
713 | |||
714 | p_setd(perm, 0, ALL_VIRT, NO_WRITE); | ||
715 | |||
716 | /* Writing the data selector is OK, the info is still read-only */ | ||
717 | p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE); | ||
718 | return 0; | ||
719 | } | ||
720 | |||
721 | /* | ||
722 | * Initialize and free the shared permission tables | ||
723 | */ | ||
724 | void vfio_pci_uninit_perm_bits(void) | ||
725 | { | ||
726 | free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]); | ||
727 | |||
728 | free_perm_bits(&cap_perms[PCI_CAP_ID_PM]); | ||
729 | free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]); | ||
730 | free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]); | ||
731 | free_perm_bits(&cap_perms[PCI_CAP_ID_AF]); | ||
732 | |||
733 | free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]); | ||
734 | free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]); | ||
735 | } | ||
736 | |||
737 | int __init vfio_pci_init_perm_bits(void) | ||
738 | { | ||
739 | int ret; | ||
740 | |||
741 | /* Basic config space */ | ||
742 | ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]); | ||
743 | |||
744 | /* Capabilities */ | ||
745 | ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); | ||
746 | cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; | ||
747 | ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); | ||
748 | cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; | ||
749 | ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); | ||
750 | ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); | ||
751 | |||
752 | /* Extended capabilities */ | ||
753 | ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); | ||
754 | ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); | ||
755 | ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; | ||
756 | |||
757 | if (ret) | ||
758 | vfio_pci_uninit_perm_bits(); | ||
759 | |||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) | ||
764 | { | ||
765 | u8 cap; | ||
766 | int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : | ||
767 | PCI_STD_HEADER_SIZEOF; | ||
768 | base /= 4; | ||
769 | pos /= 4; | ||
770 | |||
771 | cap = vdev->pci_config_map[pos]; | ||
772 | |||
773 | if (cap == PCI_CAP_ID_BASIC) | ||
774 | return 0; | ||
775 | |||
776 | /* XXX Can we have two abutting capabilities of the same type? */ | ||
777 | while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) | ||
778 | pos--; | ||
779 | |||
780 | return pos * 4; | ||
781 | } | ||
782 | |||
783 | static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, | ||
784 | int count, struct perm_bits *perm, | ||
785 | int offset, __le32 *val) | ||
786 | { | ||
787 | /* Update max available queue size from msi_qmax */ | ||
788 | if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { | ||
789 | __le16 *flags; | ||
790 | int start; | ||
791 | |||
792 | start = vfio_find_cap_start(vdev, pos); | ||
793 | |||
794 | flags = (__le16 *)&vdev->vconfig[start]; | ||
795 | |||
796 | *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK); | ||
797 | *flags |= cpu_to_le16(vdev->msi_qmax << 1); | ||
798 | } | ||
799 | |||
800 | return vfio_default_config_read(vdev, pos, count, perm, offset, val); | ||
801 | } | ||
802 | |||
803 | static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, | ||
804 | int count, struct perm_bits *perm, | ||
805 | int offset, __le32 val) | ||
806 | { | ||
807 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | ||
808 | if (count < 0) | ||
809 | return count; | ||
810 | |||
811 | /* Fixup and write configured queue size and enable to hardware */ | ||
812 | if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { | ||
813 | __le16 *pflags; | ||
814 | u16 flags; | ||
815 | int start, ret; | ||
816 | |||
817 | start = vfio_find_cap_start(vdev, pos); | ||
818 | |||
819 | pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS]; | ||
820 | |||
821 | flags = le16_to_cpu(*pflags); | ||
822 | |||
823 | /* MSI is enabled via ioctl */ | ||
824 | if (!is_msi(vdev)) | ||
825 | flags &= ~PCI_MSI_FLAGS_ENABLE; | ||
826 | |||
827 | /* Check queue size */ | ||
828 | if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) { | ||
829 | flags &= ~PCI_MSI_FLAGS_QSIZE; | ||
830 | flags |= vdev->msi_qmax << 4; | ||
831 | } | ||
832 | |||
833 | /* Write back to virt and to hardware */ | ||
834 | *pflags = cpu_to_le16(flags); | ||
835 | ret = pci_user_write_config_word(vdev->pdev, | ||
836 | start + PCI_MSI_FLAGS, | ||
837 | flags); | ||
838 | if (ret) | ||
839 | return pcibios_err_to_errno(ret); | ||
840 | } | ||
841 | |||
842 | return count; | ||
843 | } | ||
844 | |||
845 | /* | ||
846 | * MSI determination is per-device, so this routine gets used beyond | ||
847 | * initialization time. Don't add __init | ||
848 | */ | ||
849 | static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags) | ||
850 | { | ||
851 | if (alloc_perm_bits(perm, len)) | ||
852 | return -ENOMEM; | ||
853 | |||
854 | perm->readfn = vfio_msi_config_read; | ||
855 | perm->writefn = vfio_msi_config_write; | ||
856 | |||
857 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
858 | |||
859 | /* | ||
860 | * The upper byte of the control register is reserved, | ||
861 | * just set up the lower byte. | ||
862 | */ | ||
863 | p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE); | ||
864 | p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE); | ||
865 | if (flags & PCI_MSI_FLAGS_64BIT) { | ||
866 | p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE); | ||
867 | p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE); | ||
868 | if (flags & PCI_MSI_FLAGS_MASKBIT) { | ||
869 | p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE); | ||
870 | p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE); | ||
871 | } | ||
872 | } else { | ||
873 | p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE); | ||
874 | if (flags & PCI_MSI_FLAGS_MASKBIT) { | ||
875 | p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE); | ||
876 | p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE); | ||
877 | } | ||
878 | } | ||
879 | return 0; | ||
880 | } | ||
881 | |||
882 | /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ | ||
883 | static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) | ||
884 | { | ||
885 | struct pci_dev *pdev = vdev->pdev; | ||
886 | int len, ret; | ||
887 | u16 flags; | ||
888 | |||
889 | ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags); | ||
890 | if (ret) | ||
891 | return pcibios_err_to_errno(ret); | ||
892 | |||
893 | len = 10; /* Minimum size */ | ||
894 | if (flags & PCI_MSI_FLAGS_64BIT) | ||
895 | len += 4; | ||
896 | if (flags & PCI_MSI_FLAGS_MASKBIT) | ||
897 | len += 10; | ||
898 | |||
899 | if (vdev->msi_perm) | ||
900 | return len; | ||
901 | |||
902 | vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL); | ||
903 | if (!vdev->msi_perm) | ||
904 | return -ENOMEM; | ||
905 | |||
906 | ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags); | ||
907 | if (ret) | ||
908 | return ret; | ||
909 | |||
910 | return len; | ||
911 | } | ||
912 | |||
913 | /* Determine extended capability length for VC (2 & 9) and MFVC */ | ||
914 | static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) | ||
915 | { | ||
916 | struct pci_dev *pdev = vdev->pdev; | ||
917 | u32 tmp; | ||
918 | int ret, evcc, phases, vc_arb; | ||
919 | int len = PCI_CAP_VC_BASE_SIZEOF; | ||
920 | |||
921 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp); | ||
922 | if (ret) | ||
923 | return pcibios_err_to_errno(ret); | ||
924 | |||
925 | evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */ | ||
926 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp); | ||
927 | if (ret) | ||
928 | return pcibios_err_to_errno(ret); | ||
929 | |||
930 | if (tmp & PCI_VC_REG2_128_PHASE) | ||
931 | phases = 128; | ||
932 | else if (tmp & PCI_VC_REG2_64_PHASE) | ||
933 | phases = 64; | ||
934 | else if (tmp & PCI_VC_REG2_32_PHASE) | ||
935 | phases = 32; | ||
936 | else | ||
937 | phases = 0; | ||
938 | |||
939 | vc_arb = phases * 4; | ||
940 | |||
941 | /* | ||
942 | * Port arbitration tables are root & switch only; | ||
943 | * function arbitration tables are function 0 only. | ||
944 | * In either case, we'll never let the user write them, so | ||
945 | * we don't care how big they are | ||
946 | */ | ||
947 | len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF; | ||
948 | if (vc_arb) { | ||
949 | len = round_up(len, 16); | ||
950 | len += vc_arb / 8; | ||
951 | } | ||
952 | return len; | ||
953 | } | ||
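As a worked example of the arithmetic above (values chosen for illustration, taking PCI_CAP_VC_BASE_SIZEOF as 0x10 and PCI_CAP_VC_PER_VC_SIZEOF as 0x0c): a port reporting evcc = 1 with a 32-phase VC arbitration table gives len = 0x10 + 2 * 0x0c = 0x28 and vc_arb = 32 * 4 = 128 bits, so len is rounded up to 0x30 and gains 128 / 8 = 16 bytes, for a total of 0x40.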
954 | |||
955 | static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) | ||
956 | { | ||
957 | struct pci_dev *pdev = vdev->pdev; | ||
958 | u16 word; | ||
959 | u8 byte; | ||
960 | int ret; | ||
961 | |||
962 | switch (cap) { | ||
963 | case PCI_CAP_ID_MSI: | ||
964 | return vfio_msi_cap_len(vdev, pos); | ||
965 | case PCI_CAP_ID_PCIX: | ||
966 | ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word); | ||
967 | if (ret) | ||
968 | return pcibios_err_to_errno(ret); | ||
969 | |||
970 | if (PCI_X_CMD_VERSION(word)) { | ||
971 | vdev->extended_caps = true; | ||
972 | return PCI_CAP_PCIX_SIZEOF_V2; | ||
973 | } else | ||
974 | return PCI_CAP_PCIX_SIZEOF_V0; | ||
975 | case PCI_CAP_ID_VNDR: | ||
976 | /* length follows next field */ | ||
977 | ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte); | ||
978 | if (ret) | ||
979 | return pcibios_err_to_errno(ret); | ||
980 | |||
981 | return byte; | ||
982 | case PCI_CAP_ID_EXP: | ||
983 | /* length based on version */ | ||
984 | ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word); | ||
985 | if (ret) | ||
986 | return pcibios_err_to_errno(ret); | ||
987 | |||
988 | if ((word & PCI_EXP_FLAGS_VERS) == 1) | ||
989 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; | ||
990 | else { | ||
991 | vdev->extended_caps = true; | ||
992 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; | ||
993 | } | ||
994 | case PCI_CAP_ID_HT: | ||
995 | ret = pci_read_config_byte(pdev, pos + 3, &byte); | ||
996 | if (ret) | ||
997 | return pcibios_err_to_errno(ret); | ||
998 | |||
999 | return (byte & HT_3BIT_CAP_MASK) ? | ||
1000 | HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG; | ||
1001 | case PCI_CAP_ID_SATA: | ||
1002 | ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte); | ||
1003 | if (ret) | ||
1004 | return pcibios_err_to_errno(ret); | ||
1005 | |||
1006 | byte &= PCI_SATA_REGS_MASK; | ||
1007 | if (byte == PCI_SATA_REGS_INLINE) | ||
1008 | return PCI_SATA_SIZEOF_LONG; | ||
1009 | else | ||
1010 | return PCI_SATA_SIZEOF_SHORT; | ||
1011 | default: | ||
1012 | pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n", | ||
1013 | dev_name(&pdev->dev), __func__, cap, pos); | ||
1014 | } | ||
1015 | |||
1016 | return 0; | ||
1017 | } | ||
1018 | |||
1019 | static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) | ||
1020 | { | ||
1021 | struct pci_dev *pdev = vdev->pdev; | ||
1022 | u8 byte; | ||
1023 | u32 dword; | ||
1024 | int ret; | ||
1025 | |||
1026 | switch (ecap) { | ||
1027 | case PCI_EXT_CAP_ID_VNDR: | ||
1028 | ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); | ||
1029 | if (ret) | ||
1030 | return pcibios_err_to_errno(ret); | ||
1031 | |||
1032 | return dword >> PCI_VSEC_HDR_LEN_SHIFT; | ||
1033 | case PCI_EXT_CAP_ID_VC: | ||
1034 | case PCI_EXT_CAP_ID_VC9: | ||
1035 | case PCI_EXT_CAP_ID_MFVC: | ||
1036 | return vfio_vc_cap_len(vdev, epos); | ||
1037 | case PCI_EXT_CAP_ID_ACS: | ||
1038 | ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte); | ||
1039 | if (ret) | ||
1040 | return pcibios_err_to_errno(ret); | ||
1041 | |||
1042 | if (byte & PCI_ACS_EC) { | ||
1043 | int bits; | ||
1044 | |||
1045 | ret = pci_read_config_byte(pdev, | ||
1046 | epos + PCI_ACS_EGRESS_BITS, | ||
1047 | &byte); | ||
1048 | if (ret) | ||
1049 | return pcibios_err_to_errno(ret); | ||
1050 | |||
1051 | bits = byte ? round_up(byte, 32) : 256; | ||
1052 | return 8 + (bits / 8); | ||
1053 | } | ||
1054 | return 8; | ||
1055 | |||
1056 | case PCI_EXT_CAP_ID_REBAR: | ||
1057 | ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte); | ||
1058 | if (ret) | ||
1059 | return pcibios_err_to_errno(ret); | ||
1060 | |||
1061 | byte &= PCI_REBAR_CTRL_NBAR_MASK; | ||
1062 | byte >>= PCI_REBAR_CTRL_NBAR_SHIFT; | ||
1063 | |||
1064 | return 4 + (byte * 8); | ||
1065 | case PCI_EXT_CAP_ID_DPA: | ||
1066 | ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte); | ||
1067 | if (ret) | ||
1068 | return pcibios_err_to_errno(ret); | ||
1069 | |||
1070 | byte &= PCI_DPA_CAP_SUBSTATE_MASK; | ||
1071 | byte = round_up(byte + 1, 4); | ||
1072 | return PCI_DPA_BASE_SIZEOF + byte; | ||
1073 | case PCI_EXT_CAP_ID_TPH: | ||
1074 | ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword); | ||
1075 | if (ret) | ||
1076 | return pcibios_err_to_errno(ret); | ||
1077 | |||
1078 | if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) { | ||
1079 | int sts; | ||
1080 | |||
1081 | sts = dword & PCI_TPH_CAP_ST_MASK; | ||
1082 | sts >>= PCI_TPH_CAP_ST_SHIFT; | ||
1083 | return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4); | ||
1084 | } | ||
1085 | return PCI_TPH_BASE_SIZEOF; | ||
1086 | default: | ||
1087 | pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n", | ||
1088 | dev_name(&pdev->dev), __func__, ecap, epos); | ||
1089 | } | ||
1090 | |||
1091 | return 0; | ||
1092 | } | ||
1093 | |||
1094 | static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, | ||
1095 | int offset, int size) | ||
1096 | { | ||
1097 | struct pci_dev *pdev = vdev->pdev; | ||
1098 | int ret = 0; | ||
1099 | |||
1100 | /* | ||
1101 | * We try to read physical config space in the largest chunks | ||
1102 | * we can, assuming that all of the fields support dword access. | ||
1103 | * pci_save_state() makes this same assumption and seems to do ok. | ||
1104 | */ | ||
1105 | while (size) { | ||
1106 | int filled; | ||
1107 | |||
1108 | if (size >= 4 && !(offset % 4)) { | ||
1109 | __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; | ||
1110 | u32 dword; | ||
1111 | |||
1112 | ret = pci_read_config_dword(pdev, offset, &dword); | ||
1113 | if (ret) | ||
1114 | return ret; | ||
1115 | *dwordp = cpu_to_le32(dword); | ||
1116 | filled = 4; | ||
1117 | } else if (size >= 2 && !(offset % 2)) { | ||
1118 | __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; | ||
1119 | u16 word; | ||
1120 | |||
1121 | ret = pci_read_config_word(pdev, offset, &word); | ||
1122 | if (ret) | ||
1123 | return ret; | ||
1124 | *wordp = cpu_to_le16(word); | ||
1125 | filled = 2; | ||
1126 | } else { | ||
1127 | u8 *byte = &vdev->vconfig[offset]; | ||
1128 | ret = pci_read_config_byte(pdev, offset, byte); | ||
1129 | if (ret) | ||
1130 | return ret; | ||
1131 | filled = 1; | ||
1132 | } | ||
1133 | |||
1134 | offset += filled; | ||
1135 | size -= filled; | ||
1136 | } | ||
1137 | |||
1138 | return ret; | ||
1139 | } | ||
1140 | |||
1141 | static int vfio_cap_init(struct vfio_pci_device *vdev) | ||
1142 | { | ||
1143 | struct pci_dev *pdev = vdev->pdev; | ||
1144 | u8 *map = vdev->pci_config_map; | ||
1145 | u16 status; | ||
1146 | u8 pos, *prev, cap; | ||
1147 | int loops, ret, caps = 0; | ||
1148 | |||
1149 | /* Any capabilities? */ | ||
1150 | ret = pci_read_config_word(pdev, PCI_STATUS, &status); | ||
1151 | if (ret) | ||
1152 | return ret; | ||
1153 | |||
1154 | if (!(status & PCI_STATUS_CAP_LIST)) | ||
1155 | return 0; /* Done */ | ||
1156 | |||
1157 | ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); | ||
1158 | if (ret) | ||
1159 | return ret; | ||
1160 | |||
1161 | /* Mark the previous position in case we want to skip a capability */ | ||
1162 | prev = &vdev->vconfig[PCI_CAPABILITY_LIST]; | ||
1163 | |||
1164 | /* We can bound our loop, capabilities are dword aligned */ | ||
1165 | loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF; | ||
1166 | while (pos && loops--) { | ||
1167 | u8 next; | ||
1168 | int i, len = 0; | ||
1169 | |||
1170 | ret = pci_read_config_byte(pdev, pos, &cap); | ||
1171 | if (ret) | ||
1172 | return ret; | ||
1173 | |||
1174 | ret = pci_read_config_byte(pdev, | ||
1175 | pos + PCI_CAP_LIST_NEXT, &next); | ||
1176 | if (ret) | ||
1177 | return ret; | ||
1178 | |||
1179 | if (cap <= PCI_CAP_ID_MAX) { | ||
1180 | len = pci_cap_length[cap]; | ||
1181 | if (len == 0xFF) { /* Variable length */ | ||
1182 | len = vfio_cap_len(vdev, cap, pos); | ||
1183 | if (len < 0) | ||
1184 | return len; | ||
1185 | } | ||
1186 | } | ||
1187 | |||
1188 | if (!len) { | ||
1189 | pr_info("%s: %s hiding cap 0x%x\n", | ||
1190 | __func__, dev_name(&pdev->dev), cap); | ||
1191 | *prev = next; | ||
1192 | pos = next; | ||
1193 | continue; | ||
1194 | } | ||
1195 | |||
1196 | /* Sanity check, do we overlap other capabilities? */ | ||
1197 | for (i = 0; i < len; i += 4) { | ||
1198 | if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) | ||
1199 | continue; | ||
1200 | |||
1201 | pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", | ||
1202 | __func__, dev_name(&pdev->dev), | ||
1203 | pos + i, map[(pos + i) / 4], cap); | ||
1204 | } | ||
1205 | |||
1206 | memset(map + (pos / 4), cap, len / 4); | ||
1207 | ret = vfio_fill_vconfig_bytes(vdev, pos, len); | ||
1208 | if (ret) | ||
1209 | return ret; | ||
1210 | |||
1211 | prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT]; | ||
1212 | pos = next; | ||
1213 | caps++; | ||
1214 | } | ||
1215 | |||
1216 | /* If we didn't fill any capabilities, clear the status flag */ | ||
1217 | if (!caps) { | ||
1218 | __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS]; | ||
1219 | *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST); | ||
1220 | } | ||
1221 | |||
1222 | return 0; | ||
1223 | } | ||
1224 | |||
1225 | static int vfio_ecap_init(struct vfio_pci_device *vdev) | ||
1226 | { | ||
1227 | struct pci_dev *pdev = vdev->pdev; | ||
1228 | u8 *map = vdev->pci_config_map; | ||
1229 | u16 epos; | ||
1230 | __le32 *prev = NULL; | ||
1231 | int loops, ret, ecaps = 0; | ||
1232 | |||
1233 | if (!vdev->extended_caps) | ||
1234 | return 0; | ||
1235 | |||
1236 | epos = PCI_CFG_SPACE_SIZE; | ||
1237 | |||
1238 | loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; | ||
1239 | |||
1240 | while (loops-- && epos >= PCI_CFG_SPACE_SIZE) { | ||
1241 | u32 header; | ||
1242 | u16 ecap; | ||
1243 | int i, len = 0; | ||
1244 | bool hidden = false; | ||
1245 | |||
1246 | ret = pci_read_config_dword(pdev, epos, &header); | ||
1247 | if (ret) | ||
1248 | return ret; | ||
1249 | |||
1250 | ecap = PCI_EXT_CAP_ID(header); | ||
1251 | |||
1252 | if (ecap <= PCI_EXT_CAP_ID_MAX) { | ||
1253 | len = pci_ext_cap_length[ecap]; | ||
1254 | if (len == 0xFF) { | ||
1255 | len = vfio_ext_cap_len(vdev, ecap, epos); | ||
1256 | if (len < 0) | ||
1257 | return len; | ||
1258 | } | ||
1259 | } | ||
1260 | |||
1261 | if (!len) { | ||
1262 | pr_info("%s: %s hiding ecap 0x%x@0x%x\n", | ||
1263 | __func__, dev_name(&pdev->dev), ecap, epos); | ||
1264 | |||
1265 | /* If not the first in the chain, we can skip over it */ | ||
1266 | if (prev) { | ||
1267 | u32 val = epos = PCI_EXT_CAP_NEXT(header); | ||
1268 | *prev &= cpu_to_le32(~(0xffcU << 20)); | ||
1269 | *prev |= cpu_to_le32(val << 20); | ||
1270 | continue; | ||
1271 | } | ||
1272 | |||
1273 | /* | ||
1274 | * Otherwise, fill in a placeholder, the direct | ||
1275 | * readfn will virtualize this automatically | ||
1276 | */ | ||
1277 | len = PCI_CAP_SIZEOF; | ||
1278 | hidden = true; | ||
1279 | } | ||
1280 | |||
1281 | for (i = 0; i < len; i += 4) { | ||
1282 | if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) | ||
1283 | continue; | ||
1284 | |||
1285 | pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", | ||
1286 | __func__, dev_name(&pdev->dev), | ||
1287 | epos + i, map[(epos + i) / 4], ecap); | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * Even though the ecap ID is 2 bytes, we're currently a long way | ||
1292 | * from exceeding 1-byte capability IDs. If we ever make it | ||
1293 | * up to 0xFF we'll need to grow this to a two-byte-per-dword map. | ||
1294 | */ | ||
1295 | BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); | ||
1296 | |||
1297 | memset(map + (epos / 4), ecap, len / 4); | ||
1298 | ret = vfio_fill_vconfig_bytes(vdev, epos, len); | ||
1299 | if (ret) | ||
1300 | return ret; | ||
1301 | |||
1302 | /* | ||
1303 | * If we're just using this capability to anchor the list, | ||
1304 | * hide the real ID. Only count real ecaps. XXX PCI spec | ||
1305 | * indicates to use cap id = 0, version = 0, next = 0 if | ||
1306 | * ecaps are absent; hope users check all the way to the next pointer. | ||
1307 | */ | ||
1308 | if (hidden) | ||
1309 | *(__le32 *)&vdev->vconfig[epos] &= | ||
1310 | cpu_to_le32((0xffcU << 20)); | ||
1311 | else | ||
1312 | ecaps++; | ||
1313 | |||
1314 | prev = (__le32 *)&vdev->vconfig[epos]; | ||
1315 | epos = PCI_EXT_CAP_NEXT(header); | ||
1316 | } | ||
1317 | |||
1318 | if (!ecaps) | ||
1319 | *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; | ||
1320 | |||
1321 | return 0; | ||
1322 | } | ||
1323 | |||
1324 | /* | ||
1325 | * For each device we allocate a pci_config_map that indicates the | ||
1326 | * capability occupying each dword and thus the struct perm_bits we | ||
1327 | * use for read and write. We also allocate a virtualized config | ||
1328 | * space which tracks reads and writes to bits that we emulate for | ||
1329 | * the user. Initial values filled from device. | ||
1330 | * | ||
1331 | * Using shared struct perm_bits between all vfio-pci devices saves | ||
1332 | * us from allocating cfg_size buffers for virt and write for every | ||
1333 | * device. We could remove vconfig and allocate individual buffers | ||
1334 | * for each area requiring emulated bits, but the array of pointers | ||
1335 | * would be comparable in size (at least for standard config space). | ||
1336 | */ | ||
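A condensed view of how these structures are consulted on every access — this mirrors vfio_config_do_rw() below and is illustrative only:

        u8 cap_id = vdev->pci_config_map[pos / 4];          /* one byte per dword */
        struct perm_bits *perm = (pos < PCI_CFG_SPACE_SIZE) ?
                                 &cap_perms[cap_id] : &ecap_perms[cap_id];
        /* perm->virt selects emulated bits from vconfig, perm->write gates writes */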
1337 | int vfio_config_init(struct vfio_pci_device *vdev) | ||
1338 | { | ||
1339 | struct pci_dev *pdev = vdev->pdev; | ||
1340 | u8 *map, *vconfig; | ||
1341 | int ret; | ||
1342 | |||
1343 | /* | ||
1344 | * Config space, caps and ecaps are all dword aligned, so we can | ||
1345 | * use one byte per dword to record the type. | ||
1346 | */ | ||
1347 | map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); | ||
1348 | if (!map) | ||
1349 | return -ENOMEM; | ||
1350 | |||
1351 | vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL); | ||
1352 | if (!vconfig) { | ||
1353 | kfree(map); | ||
1354 | return -ENOMEM; | ||
1355 | } | ||
1356 | |||
1357 | vdev->pci_config_map = map; | ||
1358 | vdev->vconfig = vconfig; | ||
1359 | |||
1360 | memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); | ||
1361 | memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, | ||
1362 | (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); | ||
1363 | |||
1364 | ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); | ||
1365 | if (ret) | ||
1366 | goto out; | ||
1367 | |||
1368 | vdev->bardirty = true; | ||
1369 | |||
1370 | /* | ||
1371 | * XXX can we just pci_load_saved_state/pci_restore_state? | ||
1372 | * may need to rebuild vconfig after that | ||
1373 | */ | ||
1374 | |||
1375 | /* For restore after reset */ | ||
1376 | vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]); | ||
1377 | vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]); | ||
1378 | vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]); | ||
1379 | vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]); | ||
1380 | vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]); | ||
1381 | vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]); | ||
1382 | vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]); | ||
1383 | |||
1384 | if (pdev->is_virtfn) { | ||
1385 | *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor); | ||
1386 | *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device); | ||
1387 | } | ||
1388 | |||
1389 | ret = vfio_cap_init(vdev); | ||
1390 | if (ret) | ||
1391 | goto out; | ||
1392 | |||
1393 | ret = vfio_ecap_init(vdev); | ||
1394 | if (ret) | ||
1395 | goto out; | ||
1396 | |||
1397 | return 0; | ||
1398 | |||
1399 | out: | ||
1400 | kfree(map); | ||
1401 | vdev->pci_config_map = NULL; | ||
1402 | kfree(vconfig); | ||
1403 | vdev->vconfig = NULL; | ||
1404 | return pcibios_err_to_errno(ret); | ||
1405 | } | ||
1406 | |||
1407 | void vfio_config_free(struct vfio_pci_device *vdev) | ||
1408 | { | ||
1409 | kfree(vdev->vconfig); | ||
1410 | vdev->vconfig = NULL; | ||
1411 | kfree(vdev->pci_config_map); | ||
1412 | vdev->pci_config_map = NULL; | ||
1413 | kfree(vdev->msi_perm); | ||
1414 | vdev->msi_perm = NULL; | ||
1415 | } | ||
1416 | |||
1417 | static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, | ||
1418 | size_t count, loff_t *ppos, bool iswrite) | ||
1419 | { | ||
1420 | struct pci_dev *pdev = vdev->pdev; | ||
1421 | struct perm_bits *perm; | ||
1422 | __le32 val = 0; | ||
1423 | int cap_start = 0, offset; | ||
1424 | u8 cap_id; | ||
1425 | ssize_t ret = count; | ||
1426 | |||
1427 | if (*ppos < 0 || *ppos + count > pdev->cfg_size) | ||
1428 | return -EFAULT; | ||
1429 | |||
1430 | /* | ||
1431 | * gcc can't seem to figure out that this is a static function only | ||
1432 | * called with a count of 1/2/4, and hits copy_from_user_overflow without this. | ||
1433 | */ | ||
1434 | if (count > sizeof(val)) | ||
1435 | return -EINVAL; | ||
1436 | |||
1437 | cap_id = vdev->pci_config_map[*ppos / 4]; | ||
1438 | |||
1439 | if (cap_id == PCI_CAP_ID_INVALID) { | ||
1440 | if (iswrite) | ||
1441 | return ret; /* drop */ | ||
1442 | |||
1443 | /* | ||
1444 | * Per PCI spec 3.0, section 6.1, reads from reserved and | ||
1445 | * unimplemented registers return 0 | ||
1446 | */ | ||
1447 | if (copy_to_user(buf, &val, count)) | ||
1448 | return -EFAULT; | ||
1449 | |||
1450 | return ret; | ||
1451 | } | ||
1452 | |||
1453 | /* | ||
1454 | * All capabilities are minimum 4 bytes and aligned on dword | ||
1455 | * boundaries. Since we don't support unaligned accesses, we're | ||
1456 | * only ever accessing a single capability. | ||
1457 | */ | ||
1458 | if (*ppos >= PCI_CFG_SPACE_SIZE) { | ||
1459 | WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); | ||
1460 | |||
1461 | perm = &ecap_perms[cap_id]; | ||
1462 | cap_start = vfio_find_cap_start(vdev, *ppos); | ||
1463 | |||
1464 | } else { | ||
1465 | WARN_ON(cap_id > PCI_CAP_ID_MAX); | ||
1466 | |||
1467 | perm = &cap_perms[cap_id]; | ||
1468 | |||
1469 | if (cap_id == PCI_CAP_ID_MSI) | ||
1470 | perm = vdev->msi_perm; | ||
1471 | |||
1472 | if (cap_id > PCI_CAP_ID_BASIC) | ||
1473 | cap_start = vfio_find_cap_start(vdev, *ppos); | ||
1474 | } | ||
1475 | |||
1476 | WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); | ||
1477 | WARN_ON(cap_start > *ppos); | ||
1478 | |||
1479 | offset = *ppos - cap_start; | ||
1480 | |||
1481 | if (iswrite) { | ||
1482 | if (!perm->writefn) | ||
1483 | return ret; | ||
1484 | |||
1485 | if (copy_from_user(&val, buf, count)) | ||
1486 | return -EFAULT; | ||
1487 | |||
1488 | ret = perm->writefn(vdev, *ppos, count, perm, offset, val); | ||
1489 | } else { | ||
1490 | if (perm->readfn) { | ||
1491 | ret = perm->readfn(vdev, *ppos, count, | ||
1492 | perm, offset, &val); | ||
1493 | if (ret < 0) | ||
1494 | return ret; | ||
1495 | } | ||
1496 | |||
1497 | if (copy_to_user(buf, &val, count)) | ||
1498 | return -EFAULT; | ||
1499 | } | ||
1500 | |||
1501 | return ret; | ||
1502 | } | ||
1503 | |||
1504 | ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, | ||
1505 | char __user *buf, size_t count, | ||
1506 | loff_t *ppos, bool iswrite) | ||
1507 | { | ||
1508 | size_t done = 0; | ||
1509 | int ret = 0; | ||
1510 | loff_t pos = *ppos; | ||
1511 | |||
1512 | pos &= VFIO_PCI_OFFSET_MASK; | ||
1513 | |||
1514 | /* | ||
1515 | * We want both to keep the access size the caller uses and to | ||
1516 | * support reading large chunks of config space in a single call. | ||
1517 | * PCI doesn't support unaligned accesses, so we can safely break | ||
1518 | * those apart. | ||
1519 | */ | ||
1520 | while (count) { | ||
1521 | if (count >= 4 && !(pos % 4)) | ||
1522 | ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite); | ||
1523 | else if (count >= 2 && !(pos % 2)) | ||
1524 | ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite); | ||
1525 | else | ||
1526 | ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite); | ||
1527 | |||
1528 | if (ret < 0) | ||
1529 | return ret; | ||
1530 | |||
1531 | count -= ret; | ||
1532 | done += ret; | ||
1533 | buf += ret; | ||
1534 | pos += ret; | ||
1535 | } | ||
1536 | |||
1537 | *ppos += done; | ||
1538 | |||
1539 | return done; | ||
1540 | } | ||
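For example, a 6-byte read starting at config offset 0x3e is serviced as a 2-byte access at 0x3e followed by a 4-byte access at 0x40, so a request that crosses from the standard header into the capability area still reaches the per-capability handlers as naturally aligned 1/2/4-byte accesses.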
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c new file mode 100644 index 000000000000..211a4920b88a --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_intrs.c | |||
@@ -0,0 +1,740 @@ | |||
1 | /* | ||
2 | * VFIO PCI interrupt handling | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | */ | ||
15 | |||
16 | #include <linux/device.h> | ||
17 | #include <linux/interrupt.h> | ||
18 | #include <linux/eventfd.h> | ||
19 | #include <linux/pci.h> | ||
20 | #include <linux/file.h> | ||
21 | #include <linux/poll.h> | ||
22 | #include <linux/vfio.h> | ||
23 | #include <linux/wait.h> | ||
24 | #include <linux/workqueue.h> | ||
25 | |||
26 | #include "vfio_pci_private.h" | ||
27 | |||
28 | /* | ||
29 | * IRQfd - generic | ||
30 | */ | ||
31 | struct virqfd { | ||
32 | struct vfio_pci_device *vdev; | ||
33 | struct eventfd_ctx *eventfd; | ||
34 | int (*handler)(struct vfio_pci_device *, void *); | ||
35 | void (*thread)(struct vfio_pci_device *, void *); | ||
36 | void *data; | ||
37 | struct work_struct inject; | ||
38 | wait_queue_t wait; | ||
39 | poll_table pt; | ||
40 | struct work_struct shutdown; | ||
41 | struct virqfd **pvirqfd; | ||
42 | }; | ||
43 | |||
44 | static struct workqueue_struct *vfio_irqfd_cleanup_wq; | ||
45 | |||
46 | int __init vfio_pci_virqfd_init(void) | ||
47 | { | ||
48 | vfio_irqfd_cleanup_wq = | ||
49 | create_singlethread_workqueue("vfio-irqfd-cleanup"); | ||
50 | if (!vfio_irqfd_cleanup_wq) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | void vfio_pci_virqfd_exit(void) | ||
57 | { | ||
58 | destroy_workqueue(vfio_irqfd_cleanup_wq); | ||
59 | } | ||
60 | |||
61 | static void virqfd_deactivate(struct virqfd *virqfd) | ||
62 | { | ||
63 | queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); | ||
64 | } | ||
65 | |||
66 | static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
67 | { | ||
68 | struct virqfd *virqfd = container_of(wait, struct virqfd, wait); | ||
69 | unsigned long flags = (unsigned long)key; | ||
70 | |||
71 | if (flags & POLLIN) { | ||
72 | /* An event has been signaled, call function */ | ||
73 | if ((!virqfd->handler || | ||
74 | virqfd->handler(virqfd->vdev, virqfd->data)) && | ||
75 | virqfd->thread) | ||
76 | schedule_work(&virqfd->inject); | ||
77 | } | ||
78 | |||
79 | if (flags & POLLHUP) | ||
80 | /* The eventfd is closing, detach from VFIO */ | ||
81 | virqfd_deactivate(virqfd); | ||
82 | |||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static void virqfd_ptable_queue_proc(struct file *file, | ||
87 | wait_queue_head_t *wqh, poll_table *pt) | ||
88 | { | ||
89 | struct virqfd *virqfd = container_of(pt, struct virqfd, pt); | ||
90 | add_wait_queue(wqh, &virqfd->wait); | ||
91 | } | ||
92 | |||
93 | static void virqfd_shutdown(struct work_struct *work) | ||
94 | { | ||
95 | struct virqfd *virqfd = container_of(work, struct virqfd, shutdown); | ||
96 | struct virqfd **pvirqfd = virqfd->pvirqfd; | ||
97 | u64 cnt; | ||
98 | |||
99 | eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt); | ||
100 | flush_work(&virqfd->inject); | ||
101 | eventfd_ctx_put(virqfd->eventfd); | ||
102 | |||
103 | kfree(virqfd); | ||
104 | *pvirqfd = NULL; | ||
105 | } | ||
106 | |||
107 | static void virqfd_inject(struct work_struct *work) | ||
108 | { | ||
109 | struct virqfd *virqfd = container_of(work, struct virqfd, inject); | ||
110 | if (virqfd->thread) | ||
111 | virqfd->thread(virqfd->vdev, virqfd->data); | ||
112 | } | ||
113 | |||
114 | static int virqfd_enable(struct vfio_pci_device *vdev, | ||
115 | int (*handler)(struct vfio_pci_device *, void *), | ||
116 | void (*thread)(struct vfio_pci_device *, void *), | ||
117 | void *data, struct virqfd **pvirqfd, int fd) | ||
118 | { | ||
119 | struct file *file = NULL; | ||
120 | struct eventfd_ctx *ctx = NULL; | ||
121 | struct virqfd *virqfd; | ||
122 | int ret = 0; | ||
123 | unsigned int events; | ||
124 | |||
125 | if (*pvirqfd) | ||
126 | return -EBUSY; | ||
127 | |||
128 | virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); | ||
129 | if (!virqfd) | ||
130 | return -ENOMEM; | ||
131 | |||
132 | virqfd->pvirqfd = pvirqfd; | ||
133 | *pvirqfd = virqfd; | ||
134 | virqfd->vdev = vdev; | ||
135 | virqfd->handler = handler; | ||
136 | virqfd->thread = thread; | ||
137 | virqfd->data = data; | ||
138 | |||
139 | INIT_WORK(&virqfd->shutdown, virqfd_shutdown); | ||
140 | INIT_WORK(&virqfd->inject, virqfd_inject); | ||
141 | |||
142 | file = eventfd_fget(fd); | ||
143 | if (IS_ERR(file)) { | ||
144 | ret = PTR_ERR(file); | ||
145 | goto fail; | ||
146 | } | ||
147 | |||
148 | ctx = eventfd_ctx_fileget(file); | ||
149 | if (IS_ERR(ctx)) { | ||
150 | ret = PTR_ERR(ctx); | ||
151 | goto fail; | ||
152 | } | ||
153 | |||
154 | virqfd->eventfd = ctx; | ||
155 | |||
156 | /* | ||
157 | * Install our own custom wake-up handling so we are notified via | ||
158 | * a callback whenever someone signals the underlying eventfd. | ||
159 | */ | ||
160 | init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); | ||
161 | init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); | ||
162 | |||
163 | events = file->f_op->poll(file, &virqfd->pt); | ||
164 | |||
165 | /* | ||
166 | * Check if there was an event already pending on the eventfd | ||
167 | * before we registered and trigger it as if we didn't miss it. | ||
168 | */ | ||
169 | if (events & POLLIN) { | ||
170 | if ((!handler || handler(vdev, data)) && thread) | ||
171 | schedule_work(&virqfd->inject); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Do not drop the file until the irqfd is fully initialized, | ||
176 | * otherwise we might race against the POLLHUP. | ||
177 | */ | ||
178 | fput(file); | ||
179 | |||
180 | return 0; | ||
181 | |||
182 | fail: | ||
183 | if (ctx && !IS_ERR(ctx)) | ||
184 | eventfd_ctx_put(ctx); | ||
185 | |||
186 | if (file && !IS_ERR(file)) | ||
187 | fput(file); | ||
188 | |||
189 | kfree(virqfd); | ||
190 | *pvirqfd = NULL; | ||
191 | |||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | static void virqfd_disable(struct virqfd *virqfd) | ||
196 | { | ||
197 | if (!virqfd) | ||
198 | return; | ||
199 | |||
200 | virqfd_deactivate(virqfd); | ||
201 | |||
202 | /* Block until we know all outstanding shutdown jobs have completed. */ | ||
203 | flush_workqueue(vfio_irqfd_cleanup_wq); | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * INTx | ||
208 | */ | ||
209 | static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused) | ||
210 | { | ||
211 | if (likely(is_intx(vdev) && !vdev->virq_disabled)) | ||
212 | eventfd_signal(vdev->ctx[0].trigger, 1); | ||
213 | } | ||
214 | |||
215 | void vfio_pci_intx_mask(struct vfio_pci_device *vdev) | ||
216 | { | ||
217 | struct pci_dev *pdev = vdev->pdev; | ||
218 | unsigned long flags; | ||
219 | |||
220 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
221 | |||
222 | /* | ||
223 | * Masking can come from interrupt, ioctl, or config space | ||
224 | * via INTx disable. The latter means this can get called | ||
225 | * even when not using intx delivery. In this case, just | ||
226 | * try to have the physical bit follow the virtual bit. | ||
227 | */ | ||
228 | if (unlikely(!is_intx(vdev))) { | ||
229 | if (vdev->pci_2_3) | ||
230 | pci_intx(pdev, 0); | ||
231 | } else if (!vdev->ctx[0].masked) { | ||
232 | /* | ||
233 | * Can't use check_and_mask here because we always want to | ||
234 | * mask, not just when something is pending. | ||
235 | */ | ||
236 | if (vdev->pci_2_3) | ||
237 | pci_intx(pdev, 0); | ||
238 | else | ||
239 | disable_irq_nosync(pdev->irq); | ||
240 | |||
241 | vdev->ctx[0].masked = true; | ||
242 | } | ||
243 | |||
244 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * If this is triggered by an eventfd, we can't call eventfd_signal | ||
249 | * or else we'll deadlock on the eventfd wait queue. Return >0 when | ||
250 | * a signal is necessary, which can then be handled via a work queue | ||
251 | * or directly depending on the caller. | ||
252 | */ | ||
253 | int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) | ||
254 | { | ||
255 | struct pci_dev *pdev = vdev->pdev; | ||
256 | unsigned long flags; | ||
257 | int ret = 0; | ||
258 | |||
259 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
260 | |||
261 | /* | ||
262 | * Unmasking comes from ioctl or config, so again, have the | ||
263 | * physical bit follow the virtual even when not using INTx. | ||
264 | */ | ||
265 | if (unlikely(!is_intx(vdev))) { | ||
266 | if (vdev->pci_2_3) | ||
267 | pci_intx(pdev, 1); | ||
268 | } else if (vdev->ctx[0].masked && !vdev->virq_disabled) { | ||
269 | /* | ||
270 | * A pending interrupt here would immediately trigger, | ||
271 | * but we can avoid that overhead by just re-sending | ||
272 | * the interrupt to the user. | ||
273 | */ | ||
274 | if (vdev->pci_2_3) { | ||
275 | if (!pci_check_and_unmask_intx(pdev)) | ||
276 | ret = 1; | ||
277 | } else | ||
278 | enable_irq(pdev->irq); | ||
279 | |||
280 | vdev->ctx[0].masked = (ret > 0); | ||
281 | } | ||
282 | |||
283 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
284 | |||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) | ||
289 | { | ||
290 | if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) | ||
291 | vfio_send_intx_eventfd(vdev, NULL); | ||
292 | } | ||
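To make the handler/thread split concrete, the unmask eventfd is expected to be wired through virqfd_enable() roughly as below; the actual ioctl plumbing appears later in this file, so treat this as a sketch:

        virqfd_enable(vdev, vfio_pci_intx_unmask_handler,   /* runs in atomic context */
                      vfio_send_intx_eventfd, NULL,         /* deferred via work queue */
                      &vdev->ctx[0].unmask, fd);

When the handler returns >0, virqfd_wakeup() schedules the inject work, which invokes vfio_send_intx_eventfd() from process context where eventfd_signal() is safe.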
293 | |||
294 | static irqreturn_t vfio_intx_handler(int irq, void *dev_id) | ||
295 | { | ||
296 | struct vfio_pci_device *vdev = dev_id; | ||
297 | unsigned long flags; | ||
298 | int ret = IRQ_NONE; | ||
299 | |||
300 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
301 | |||
302 | if (!vdev->pci_2_3) { | ||
303 | disable_irq_nosync(vdev->pdev->irq); | ||
304 | vdev->ctx[0].masked = true; | ||
305 | ret = IRQ_HANDLED; | ||
306 | } else if (!vdev->ctx[0].masked && /* may be shared */ | ||
307 | pci_check_and_mask_intx(vdev->pdev)) { | ||
308 | vdev->ctx[0].masked = true; | ||
309 | ret = IRQ_HANDLED; | ||
310 | } | ||
311 | |||
312 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
313 | |||
314 | if (ret == IRQ_HANDLED) | ||
315 | vfio_send_intx_eventfd(vdev, NULL); | ||
316 | |||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static int vfio_intx_enable(struct vfio_pci_device *vdev) | ||
321 | { | ||
322 | if (!is_irq_none(vdev)) | ||
323 | return -EINVAL; | ||
324 | |||
325 | if (!vdev->pdev->irq) | ||
326 | return -ENODEV; | ||
327 | |||
328 | vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); | ||
329 | if (!vdev->ctx) | ||
330 | return -ENOMEM; | ||
331 | |||
332 | vdev->num_ctx = 1; | ||
333 | vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; | ||
334 | |||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) | ||
339 | { | ||
340 | struct pci_dev *pdev = vdev->pdev; | ||
341 | unsigned long irqflags = IRQF_SHARED; | ||
342 | struct eventfd_ctx *trigger; | ||
343 | unsigned long flags; | ||
344 | int ret; | ||
345 | |||
346 | if (vdev->ctx[0].trigger) { | ||
347 | free_irq(pdev->irq, vdev); | ||
348 | kfree(vdev->ctx[0].name); | ||
349 | eventfd_ctx_put(vdev->ctx[0].trigger); | ||
350 | vdev->ctx[0].trigger = NULL; | ||
351 | } | ||
352 | |||
353 | if (fd < 0) /* Disable only */ | ||
354 | return 0; | ||
355 | |||
356 | vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)", | ||
357 | pci_name(pdev)); | ||
358 | if (!vdev->ctx[0].name) | ||
359 | return -ENOMEM; | ||
360 | |||
361 | trigger = eventfd_ctx_fdget(fd); | ||
362 | if (IS_ERR(trigger)) { | ||
363 | kfree(vdev->ctx[0].name); | ||
364 | return PTR_ERR(trigger); | ||
365 | } | ||
366 | |||
367 | if (!vdev->pci_2_3) | ||
368 | irqflags = 0; | ||
369 | |||
370 | ret = request_irq(pdev->irq, vfio_intx_handler, | ||
371 | irqflags, vdev->ctx[0].name, vdev); | ||
372 | if (ret) { | ||
373 | kfree(vdev->ctx[0].name); | ||
374 | eventfd_ctx_put(trigger); | ||
375 | return ret; | ||
376 | } | ||
377 | |||
378 | vdev->ctx[0].trigger = trigger; | ||
379 | |||
380 | /* | ||
381 | * INTx disable will stick across the new irq setup, | ||
382 | * disable_irq won't. | ||
383 | */ | ||
384 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
385 | if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled)) | ||
386 | disable_irq_nosync(pdev->irq); | ||
387 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
388 | |||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | static void vfio_intx_disable(struct vfio_pci_device *vdev) | ||
393 | { | ||
394 | vfio_intx_set_signal(vdev, -1); | ||
395 | virqfd_disable(vdev->ctx[0].unmask); | ||
396 | virqfd_disable(vdev->ctx[0].mask); | ||
397 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
398 | vdev->num_ctx = 0; | ||
399 | kfree(vdev->ctx); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * MSI/MSI-X | ||
404 | */ | ||
405 | static irqreturn_t vfio_msihandler(int irq, void *arg) | ||
406 | { | ||
407 | struct eventfd_ctx *trigger = arg; | ||
408 | |||
409 | eventfd_signal(trigger, 1); | ||
410 | return IRQ_HANDLED; | ||
411 | } | ||
412 | |||
413 | static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) | ||
414 | { | ||
415 | struct pci_dev *pdev = vdev->pdev; | ||
416 | int ret; | ||
417 | |||
418 | if (!is_irq_none(vdev)) | ||
419 | return -EINVAL; | ||
420 | |||
421 | vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); | ||
422 | if (!vdev->ctx) | ||
423 | return -ENOMEM; | ||
424 | |||
425 | if (msix) { | ||
426 | int i; | ||
427 | |||
428 | vdev->msix = kzalloc(nvec * sizeof(struct msix_entry), | ||
429 | GFP_KERNEL); | ||
430 | if (!vdev->msix) { | ||
431 | kfree(vdev->ctx); | ||
432 | return -ENOMEM; | ||
433 | } | ||
434 | |||
435 | for (i = 0; i < nvec; i++) | ||
436 | vdev->msix[i].entry = i; | ||
437 | |||
438 | ret = pci_enable_msix(pdev, vdev->msix, nvec); | ||
439 | if (ret) { | ||
440 | kfree(vdev->msix); | ||
441 | kfree(vdev->ctx); | ||
442 | return ret; | ||
443 | } | ||
444 | } else { | ||
445 | ret = pci_enable_msi_block(pdev, nvec); | ||
446 | if (ret) { | ||
447 | kfree(vdev->ctx); | ||
448 | return ret; | ||
449 | } | ||
450 | } | ||
451 | |||
452 | vdev->num_ctx = nvec; | ||
453 | vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX : | ||
454 | VFIO_PCI_MSI_IRQ_INDEX; | ||
455 | |||
456 | if (!msix) { | ||
457 | /* | ||
458 | * Compute the virtual hardware field for max msi vectors - | ||
459 | * it is the log base 2 of the number of vectors. | ||
460 | */ | ||
461 | vdev->msi_qmax = fls(nvec * 2 - 1) - 1; | ||
462 | } | ||
463 | |||
464 | return 0; | ||
465 | } | ||
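The msi_qmax computation above is just a rounded-up log2: the PCI MSI capability encodes the supported vector count as a power-of-two exponent, so nvec is rounded up to the next power of two. A standalone sketch of the same expression, with fls() open-coded since it is a kernel helper:

    #include <stdio.h>

    /* Smallest exponent e such that 2^e >= nvec, mirroring fls(nvec * 2 - 1) - 1 */
    static int msi_qmax(int nvec)
    {
            int fls = 0, v = nvec * 2 - 1;

            while (v) {            /* open-coded fls(): position of highest set bit */
                    fls++;
                    v >>= 1;
            }
            return fls - 1;
    }

    int main(void)
    {
            /* nvec=1 -> 0 (1 vector), nvec=3 -> 2 (rounds up to 4), nvec=32 -> 5 */
            printf("%d %d %d\n", msi_qmax(1), msi_qmax(3), msi_qmax(32));
            return 0;
    }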
466 | |||
467 | static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, | ||
468 | int vector, int fd, bool msix) | ||
469 | { | ||
470 | struct pci_dev *pdev = vdev->pdev; | ||
471 | int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; | ||
472 | char *name = msix ? "vfio-msix" : "vfio-msi"; | ||
473 | struct eventfd_ctx *trigger; | ||
474 | int ret; | ||
475 | |||
476 | if (vector >= vdev->num_ctx) | ||
477 | return -EINVAL; | ||
478 | |||
479 | if (vdev->ctx[vector].trigger) { | ||
480 | free_irq(irq, vdev->ctx[vector].trigger); | ||
481 | kfree(vdev->ctx[vector].name); | ||
482 | eventfd_ctx_put(vdev->ctx[vector].trigger); | ||
483 | vdev->ctx[vector].trigger = NULL; | ||
484 | } | ||
485 | |||
486 | if (fd < 0) | ||
487 | return 0; | ||
488 | |||
489 | vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)", | ||
490 | name, vector, pci_name(pdev)); | ||
491 | if (!vdev->ctx[vector].name) | ||
492 | return -ENOMEM; | ||
493 | |||
494 | trigger = eventfd_ctx_fdget(fd); | ||
495 | if (IS_ERR(trigger)) { | ||
496 | kfree(vdev->ctx[vector].name); | ||
497 | return PTR_ERR(trigger); | ||
498 | } | ||
499 | |||
500 | ret = request_irq(irq, vfio_msihandler, 0, | ||
501 | vdev->ctx[vector].name, trigger); | ||
502 | if (ret) { | ||
503 | kfree(vdev->ctx[vector].name); | ||
504 | eventfd_ctx_put(trigger); | ||
505 | return ret; | ||
506 | } | ||
507 | |||
508 | vdev->ctx[vector].trigger = trigger; | ||
509 | |||
510 | return 0; | ||
511 | } | ||
512 | |||
513 | static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, | ||
514 | unsigned count, int32_t *fds, bool msix) | ||
515 | { | ||
516 | int i, j, ret = 0; | ||
517 | |||
518 | if (start + count > vdev->num_ctx) | ||
519 | return -EINVAL; | ||
520 | |||
521 | for (i = 0, j = start; i < count && !ret; i++, j++) { | ||
522 | int fd = fds ? fds[i] : -1; | ||
523 | ret = vfio_msi_set_vector_signal(vdev, j, fd, msix); | ||
524 | } | ||
525 | |||
526 | if (ret) { | ||
527 | for (--j; j >= start; j--) | ||
528 | vfio_msi_set_vector_signal(vdev, j, -1, msix); | ||
529 | } | ||
530 | |||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) | ||
535 | { | ||
536 | struct pci_dev *pdev = vdev->pdev; | ||
537 | int i; | ||
538 | |||
539 | vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix); | ||
540 | |||
541 | for (i = 0; i < vdev->num_ctx; i++) { | ||
542 | virqfd_disable(vdev->ctx[i].unmask); | ||
543 | virqfd_disable(vdev->ctx[i].mask); | ||
544 | } | ||
545 | |||
546 | if (msix) { | ||
547 | pci_disable_msix(vdev->pdev); | ||
548 | kfree(vdev->msix); | ||
549 | } else | ||
550 | pci_disable_msi(pdev); | ||
551 | |||
552 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
553 | vdev->num_ctx = 0; | ||
554 | kfree(vdev->ctx); | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | * IOCTL support | ||
559 | */ | ||
560 | static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, | ||
561 | unsigned index, unsigned start, | ||
562 | unsigned count, uint32_t flags, void *data) | ||
563 | { | ||
564 | if (!is_intx(vdev) || start != 0 || count != 1) | ||
565 | return -EINVAL; | ||
566 | |||
567 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
568 | vfio_pci_intx_unmask(vdev); | ||
569 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
570 | uint8_t unmask = *(uint8_t *)data; | ||
571 | if (unmask) | ||
572 | vfio_pci_intx_unmask(vdev); | ||
573 | } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
574 | int32_t fd = *(int32_t *)data; | ||
575 | if (fd >= 0) | ||
576 | return virqfd_enable(vdev, vfio_pci_intx_unmask_handler, | ||
577 | vfio_send_intx_eventfd, NULL, | ||
578 | &vdev->ctx[0].unmask, fd); | ||
579 | |||
580 | virqfd_disable(vdev->ctx[0].unmask); | ||
581 | } | ||
582 | |||
583 | return 0; | ||
584 | } | ||
585 | |||
586 | static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, | ||
587 | unsigned index, unsigned start, | ||
588 | unsigned count, uint32_t flags, void *data) | ||
589 | { | ||
590 | if (!is_intx(vdev) || start != 0 || count != 1) | ||
591 | return -EINVAL; | ||
592 | |||
593 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
594 | vfio_pci_intx_mask(vdev); | ||
595 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
596 | uint8_t mask = *(uint8_t *)data; | ||
597 | if (mask) | ||
598 | vfio_pci_intx_mask(vdev); | ||
599 | } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
600 | return -ENOTTY; /* XXX implement me */ | ||
601 | } | ||
602 | |||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, | ||
607 | unsigned index, unsigned start, | ||
608 | unsigned count, uint32_t flags, void *data) | ||
609 | { | ||
610 | if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
611 | vfio_intx_disable(vdev); | ||
612 | return 0; | ||
613 | } | ||
614 | |||
615 | if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1) | ||
616 | return -EINVAL; | ||
617 | |||
618 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
619 | int32_t fd = *(int32_t *)data; | ||
620 | int ret; | ||
621 | |||
622 | if (is_intx(vdev)) | ||
623 | return vfio_intx_set_signal(vdev, fd); | ||
624 | |||
625 | ret = vfio_intx_enable(vdev); | ||
626 | if (ret) | ||
627 | return ret; | ||
628 | |||
629 | ret = vfio_intx_set_signal(vdev, fd); | ||
630 | if (ret) | ||
631 | vfio_intx_disable(vdev); | ||
632 | |||
633 | return ret; | ||
634 | } | ||
635 | |||
636 | if (!is_intx(vdev)) | ||
637 | return -EINVAL; | ||
638 | |||
639 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
640 | vfio_send_intx_eventfd(vdev, NULL); | ||
641 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
642 | uint8_t trigger = *(uint8_t *)data; | ||
643 | if (trigger) | ||
644 | vfio_send_intx_eventfd(vdev, NULL); | ||
645 | } | ||
646 | return 0; | ||
647 | } | ||
648 | |||
649 | static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, | ||
650 | unsigned index, unsigned start, | ||
651 | unsigned count, uint32_t flags, void *data) | ||
652 | { | ||
653 | int i; | ||
654 | bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false; | ||
655 | |||
656 | if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
657 | vfio_msi_disable(vdev, msix); | ||
658 | return 0; | ||
659 | } | ||
660 | |||
661 | if (!(irq_is(vdev, index) || is_irq_none(vdev))) | ||
662 | return -EINVAL; | ||
663 | |||
664 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
665 | int32_t *fds = data; | ||
666 | int ret; | ||
667 | |||
668 | if (vdev->irq_type == index) | ||
669 | return vfio_msi_set_block(vdev, start, count, | ||
670 | fds, msix); | ||
671 | |||
672 | ret = vfio_msi_enable(vdev, start + count, msix); | ||
673 | if (ret) | ||
674 | return ret; | ||
675 | |||
676 | ret = vfio_msi_set_block(vdev, start, count, fds, msix); | ||
677 | if (ret) | ||
678 | vfio_msi_disable(vdev, msix); | ||
679 | |||
680 | return ret; | ||
681 | } | ||
682 | |||
683 | if (!irq_is(vdev, index) || start + count > vdev->num_ctx) | ||
684 | return -EINVAL; | ||
685 | |||
686 | for (i = start; i < start + count; i++) { | ||
687 | if (!vdev->ctx[i].trigger) | ||
688 | continue; | ||
689 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
690 | eventfd_signal(vdev->ctx[i].trigger, 1); | ||
691 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
692 | uint8_t *bools = data; | ||
693 | if (bools[i - start]) | ||
694 | eventfd_signal(vdev->ctx[i].trigger, 1); | ||
695 | } | ||
696 | } | ||
697 | return 0; | ||
698 | } | ||
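For comparison, this is roughly what the userspace side of the eventfd path above looks like for MSI-X, assuming a device fd from VFIO_GROUP_GET_DEVICE_FD and one eventfd per vector. Struct layout and constants follow the uapi header added by this series; names and error handling are illustrative:

    #include <linux/vfio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    static int enable_msix(int device_fd, const int32_t *eventfds, int nvec)
    {
            size_t sz = sizeof(struct vfio_irq_set) + nvec * sizeof(int32_t);
            struct vfio_irq_set *set = calloc(1, sz);
            int i, ret;

            if (!set)
                    return -1;

            set->argsz = sz;
            set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
            set->index = VFIO_PCI_MSIX_IRQ_INDEX;
            set->start = 0;
            set->count = nvec;              /* enables MSI-X and wires every vector */
            for (i = 0; i < nvec; i++)
                    ((int32_t *)set->data)[i] = eventfds[i];

            ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
            free(set);
            return ret;
    }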
699 | |||
700 | int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, | ||
701 | unsigned index, unsigned start, unsigned count, | ||
702 | void *data) | ||
703 | { | ||
704 | int (*func)(struct vfio_pci_device *vdev, unsigned index, | ||
705 | unsigned start, unsigned count, uint32_t flags, | ||
706 | void *data) = NULL; | ||
707 | |||
708 | switch (index) { | ||
709 | case VFIO_PCI_INTX_IRQ_INDEX: | ||
710 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
711 | case VFIO_IRQ_SET_ACTION_MASK: | ||
712 | func = vfio_pci_set_intx_mask; | ||
713 | break; | ||
714 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
715 | func = vfio_pci_set_intx_unmask; | ||
716 | break; | ||
717 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
718 | func = vfio_pci_set_intx_trigger; | ||
719 | break; | ||
720 | } | ||
721 | break; | ||
722 | case VFIO_PCI_MSI_IRQ_INDEX: | ||
723 | case VFIO_PCI_MSIX_IRQ_INDEX: | ||
724 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
725 | case VFIO_IRQ_SET_ACTION_MASK: | ||
726 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
727 | /* XXX Need masking support exported */ | ||
728 | break; | ||
729 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
730 | func = vfio_pci_set_msi_trigger; | ||
731 | break; | ||
732 | } | ||
733 | break; | ||
734 | } | ||
735 | |||
736 | if (!func) | ||
737 | return -ENOTTY; | ||
738 | |||
739 | return func(vdev, index, start, count, flags, data); | ||
740 | } | ||
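The dispatcher above routes (index, action) pairs from VFIO_DEVICE_SET_IRQS to the handlers earlier in this file. A small userspace sketch of the DATA_NONE/ACTION_UNMASK case, which is how a driver re-enables level-triggered INTx after servicing the device; no data payload is needed for this variant:

    #include <linux/vfio.h>
    #include <sys/ioctl.h>

    static int unmask_intx(int device_fd)
    {
            struct vfio_irq_set unmask = {
                    .argsz = sizeof(unmask),
                    .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
                    .index = VFIO_PCI_INTX_IRQ_INDEX,
                    .start = 0,
                    .count = 1,
            };

            /* Routed to vfio_pci_set_intx_unmask(), which re-enables the line */
            return ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &unmask);
    }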
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h new file mode 100644 index 000000000000..611827cba8cd --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_private.h | |||
@@ -0,0 +1,91 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
3 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * Derived from original vfio: | ||
10 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
11 | * Author: Tom Lyon, pugs@cisco.com | ||
12 | */ | ||
13 | |||
14 | #include <linux/mutex.h> | ||
15 | #include <linux/pci.h> | ||
16 | |||
17 | #ifndef VFIO_PCI_PRIVATE_H | ||
18 | #define VFIO_PCI_PRIVATE_H | ||
19 | |||
20 | #define VFIO_PCI_OFFSET_SHIFT 40 | ||
21 | |||
22 | #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) | ||
23 | #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) | ||
24 | #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) | ||
25 | |||
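These macros split the 64-bit device file offset into a region index in the high bits and a 40-bit offset within that region; vfio_pci_rdwr.c (further down) uses them to recover which BAR, ROM or config space a read or write targets. A quick, self-contained illustration of the arithmetic, using region index 2 (BAR2) as an example:

    #include <stdint.h>
    #include <stdio.h>

    #define OFFSET_SHIFT 40   /* mirrors VFIO_PCI_OFFSET_SHIFT above */

    int main(void)
    {
            uint64_t pos = ((uint64_t)2 << OFFSET_SHIFT) + 0x10;  /* BAR2 + 0x10 */

            /* prints "index 2, offset 0x10" */
            printf("index %llu, offset 0x%llx\n",
                   (unsigned long long)(pos >> OFFSET_SHIFT),
                   (unsigned long long)(pos & (((uint64_t)1 << OFFSET_SHIFT) - 1)));
            return 0;
    }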
26 | struct vfio_pci_irq_ctx { | ||
27 | struct eventfd_ctx *trigger; | ||
28 | struct virqfd *unmask; | ||
29 | struct virqfd *mask; | ||
30 | char *name; | ||
31 | bool masked; | ||
32 | }; | ||
33 | |||
34 | struct vfio_pci_device { | ||
35 | struct pci_dev *pdev; | ||
36 | void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; | ||
37 | u8 *pci_config_map; | ||
38 | u8 *vconfig; | ||
39 | struct perm_bits *msi_perm; | ||
40 | spinlock_t irqlock; | ||
41 | struct mutex igate; | ||
42 | struct msix_entry *msix; | ||
43 | struct vfio_pci_irq_ctx *ctx; | ||
44 | int num_ctx; | ||
45 | int irq_type; | ||
46 | u8 msi_qmax; | ||
47 | u8 msix_bar; | ||
48 | u16 msix_size; | ||
49 | u32 msix_offset; | ||
50 | u32 rbar[7]; | ||
51 | bool pci_2_3; | ||
52 | bool virq_disabled; | ||
53 | bool reset_works; | ||
54 | bool extended_caps; | ||
55 | bool bardirty; | ||
56 | struct pci_saved_state *pci_saved_state; | ||
57 | atomic_t refcnt; | ||
58 | }; | ||
59 | |||
60 | #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) | ||
61 | #define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) | ||
62 | #define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) | ||
63 | #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) | ||
64 | #define irq_is(vdev, type) (vdev->irq_type == type) | ||
65 | |||
66 | extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); | ||
67 | extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); | ||
68 | |||
69 | extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, | ||
70 | uint32_t flags, unsigned index, | ||
71 | unsigned start, unsigned count, void *data); | ||
72 | |||
73 | extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, | ||
74 | char __user *buf, size_t count, | ||
75 | loff_t *ppos, bool iswrite); | ||
76 | extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, | ||
77 | char __user *buf, size_t count, | ||
78 | loff_t *ppos, bool iswrite); | ||
79 | extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, | ||
80 | char __user *buf, size_t count, | ||
81 | loff_t *ppos, bool iswrite); | ||
82 | |||
83 | extern int vfio_pci_init_perm_bits(void); | ||
84 | extern void vfio_pci_uninit_perm_bits(void); | ||
85 | |||
86 | extern int vfio_pci_virqfd_init(void); | ||
87 | extern void vfio_pci_virqfd_exit(void); | ||
88 | |||
89 | extern int vfio_config_init(struct vfio_pci_device *vdev); | ||
90 | extern void vfio_config_free(struct vfio_pci_device *vdev); | ||
91 | #endif /* VFIO_PCI_PRIVATE_H */ | ||
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c new file mode 100644 index 000000000000..4362d9e7baa3 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_rdwr.c | |||
@@ -0,0 +1,269 @@ | |||
1 | /* | ||
2 | * VFIO PCI I/O Port & MMIO access | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | */ | ||
15 | |||
16 | #include <linux/fs.h> | ||
17 | #include <linux/pci.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/io.h> | ||
20 | |||
21 | #include "vfio_pci_private.h" | ||
22 | |||
23 | /* I/O Port BAR access */ | ||
24 | ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf, | ||
25 | size_t count, loff_t *ppos, bool iswrite) | ||
26 | { | ||
27 | struct pci_dev *pdev = vdev->pdev; | ||
28 | loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; | ||
29 | int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
30 | void __iomem *io; | ||
31 | size_t done = 0; | ||
32 | |||
33 | if (!pci_resource_start(pdev, bar)) | ||
34 | return -EINVAL; | ||
35 | |||
36 | if (pos + count > pci_resource_len(pdev, bar)) | ||
37 | return -EINVAL; | ||
38 | |||
39 | if (!vdev->barmap[bar]) { | ||
40 | int ret; | ||
41 | |||
42 | ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); | ||
43 | if (ret) | ||
44 | return ret; | ||
45 | |||
46 | vdev->barmap[bar] = pci_iomap(pdev, bar, 0); | ||
47 | |||
48 | if (!vdev->barmap[bar]) { | ||
49 | pci_release_selected_regions(pdev, 1 << bar); | ||
50 | return -EINVAL; | ||
51 | } | ||
52 | } | ||
53 | |||
54 | io = vdev->barmap[bar]; | ||
55 | |||
56 | while (count) { | ||
57 | int filled; | ||
58 | |||
59 | if (count >= 3 && !(pos % 4)) { | ||
60 | __le32 val; | ||
61 | |||
62 | if (iswrite) { | ||
63 | if (copy_from_user(&val, buf, 4)) | ||
64 | return -EFAULT; | ||
65 | |||
66 | iowrite32(le32_to_cpu(val), io + pos); | ||
67 | } else { | ||
68 | val = cpu_to_le32(ioread32(io + pos)); | ||
69 | |||
70 | if (copy_to_user(buf, &val, 4)) | ||
71 | return -EFAULT; | ||
72 | } | ||
73 | |||
74 | filled = 4; | ||
75 | |||
76 | } else if ((pos % 2) == 0 && count >= 2) { | ||
77 | __le16 val; | ||
78 | |||
79 | if (iswrite) { | ||
80 | if (copy_from_user(&val, buf, 2)) | ||
81 | return -EFAULT; | ||
82 | |||
83 | iowrite16(le16_to_cpu(val), io + pos); | ||
84 | } else { | ||
85 | val = cpu_to_le16(ioread16(io + pos)); | ||
86 | |||
87 | if (copy_to_user(buf, &val, 2)) | ||
88 | return -EFAULT; | ||
89 | } | ||
90 | |||
91 | filled = 2; | ||
92 | } else { | ||
93 | u8 val; | ||
94 | |||
95 | if (iswrite) { | ||
96 | if (copy_from_user(&val, buf, 1)) | ||
97 | return -EFAULT; | ||
98 | |||
99 | iowrite8(val, io + pos); | ||
100 | } else { | ||
101 | val = ioread8(io + pos); | ||
102 | |||
103 | if (copy_to_user(buf, &val, 1)) | ||
104 | return -EFAULT; | ||
105 | } | ||
106 | |||
107 | filled = 1; | ||
108 | } | ||
109 | |||
110 | count -= filled; | ||
111 | done += filled; | ||
112 | buf += filled; | ||
113 | pos += filled; | ||
114 | } | ||
115 | |||
116 | *ppos += done; | ||
117 | |||
118 | return done; | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * MMIO BAR access | ||
123 | * We handle two excluded ranges here as well: if the user tries to read | ||
124 | * the ROM beyond what PCI tells us is available or the MSI-X table region, | ||
125 | * we return 0xFF and writes are dropped. | ||
126 | */ | ||
127 | ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf, | ||
128 | size_t count, loff_t *ppos, bool iswrite) | ||
129 | { | ||
130 | struct pci_dev *pdev = vdev->pdev; | ||
131 | loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; | ||
132 | int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
133 | void __iomem *io; | ||
134 | resource_size_t end; | ||
135 | size_t done = 0; | ||
136 | size_t x_start = 0, x_end = 0; /* excluded range */ | ||
137 | |||
138 | if (!pci_resource_start(pdev, bar)) | ||
139 | return -EINVAL; | ||
140 | |||
141 | end = pci_resource_len(pdev, bar); | ||
142 | |||
143 | if (pos > end) | ||
144 | return -EINVAL; | ||
145 | |||
146 | if (pos == end) | ||
147 | return 0; | ||
148 | |||
149 | if (pos + count > end) | ||
150 | count = end - pos; | ||
151 | |||
152 | if (bar == PCI_ROM_RESOURCE) { | ||
153 | io = pci_map_rom(pdev, &x_start); | ||
154 | x_end = end; | ||
155 | } else { | ||
156 | if (!vdev->barmap[bar]) { | ||
157 | int ret; | ||
158 | |||
159 | ret = pci_request_selected_regions(pdev, 1 << bar, | ||
160 | "vfio"); | ||
161 | if (ret) | ||
162 | return ret; | ||
163 | |||
164 | vdev->barmap[bar] = pci_iomap(pdev, bar, 0); | ||
165 | |||
166 | if (!vdev->barmap[bar]) { | ||
167 | pci_release_selected_regions(pdev, 1 << bar); | ||
168 | return -EINVAL; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | io = vdev->barmap[bar]; | ||
173 | |||
174 | if (bar == vdev->msix_bar) { | ||
175 | x_start = vdev->msix_offset; | ||
176 | x_end = vdev->msix_offset + vdev->msix_size; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | if (!io) | ||
181 | return -EINVAL; | ||
182 | |||
183 | while (count) { | ||
184 | size_t fillable, filled; | ||
185 | |||
186 | if (pos < x_start) | ||
187 | fillable = x_start - pos; | ||
188 | else if (pos >= x_end) | ||
189 | fillable = end - pos; | ||
190 | else | ||
191 | fillable = 0; | ||
192 | |||
193 | if (fillable >= 4 && !(pos % 4) && (count >= 4)) { | ||
194 | __le32 val; | ||
195 | |||
196 | if (iswrite) { | ||
197 | if (copy_from_user(&val, buf, 4)) | ||
198 | goto out; | ||
199 | |||
200 | iowrite32(le32_to_cpu(val), io + pos); | ||
201 | } else { | ||
202 | val = cpu_to_le32(ioread32(io + pos)); | ||
203 | |||
204 | if (copy_to_user(buf, &val, 4)) | ||
205 | goto out; | ||
206 | } | ||
207 | |||
208 | filled = 4; | ||
209 | } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) { | ||
210 | __le16 val; | ||
211 | |||
212 | if (iswrite) { | ||
213 | if (copy_from_user(&val, buf, 2)) | ||
214 | goto out; | ||
215 | |||
216 | iowrite16(le16_to_cpu(val), io + pos); | ||
217 | } else { | ||
218 | val = cpu_to_le16(ioread16(io + pos)); | ||
219 | |||
220 | if (copy_to_user(buf, &val, 2)) | ||
221 | goto out; | ||
222 | } | ||
223 | |||
224 | filled = 2; | ||
225 | } else if (fillable) { | ||
226 | u8 val; | ||
227 | |||
228 | if (iswrite) { | ||
229 | if (copy_from_user(&val, buf, 1)) | ||
230 | goto out; | ||
231 | |||
232 | iowrite8(val, io + pos); | ||
233 | } else { | ||
234 | val = ioread8(io + pos); | ||
235 | |||
236 | if (copy_to_user(buf, &val, 1)) | ||
237 | goto out; | ||
238 | } | ||
239 | |||
240 | filled = 1; | ||
241 | } else { | ||
242 | /* Drop writes, fill reads with FF */ | ||
243 | if (!iswrite) { | ||
244 | char val = 0xFF; | ||
245 | size_t i; | ||
246 | |||
247 | for (i = 0; i < x_end - pos; i++) { | ||
248 | if (put_user(val, buf + i)) | ||
249 | goto out; | ||
250 | } | ||
251 | } | ||
252 | |||
253 | filled = x_end - pos; | ||
254 | } | ||
255 | |||
256 | count -= filled; | ||
257 | done += filled; | ||
258 | buf += filled; | ||
259 | pos += filled; | ||
260 | } | ||
261 | |||
262 | *ppos += done; | ||
263 | |||
264 | out: | ||
265 | if (bar == PCI_ROM_RESOURCE) | ||
266 | pci_unmap_rom(pdev, io); | ||
267 | |||
268 | return count ? -EFAULT : done; | ||
269 | } | ||
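Userspace never sees the BAR mapping details above; it learns each region's size and file offset from VFIO_DEVICE_GET_REGION_INFO and then uses plain pread()/pwrite() on the device fd. A hedged sketch, with field names taken from the uapi header added in this series:

    #include <linux/vfio.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    static int read_bar0_dword(int device_fd, uint64_t off, uint32_t *val)
    {
            struct vfio_region_info reg = {
                    .argsz = sizeof(reg),
                    .index = VFIO_PCI_BAR0_REGION_INDEX,
            };

            if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg))
                    return -1;

            /* A 4-byte aligned read is forwarded as a single ioread32() above;
             * reads inside the MSI-X table or past the ROM come back as 0xff. */
            if (pread(device_fd, val, sizeof(*val), reg.offset + off) != sizeof(*val))
                    return -1;
            return 0;
    }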
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c new file mode 100644 index 000000000000..9591e2b509d7 --- /dev/null +++ b/drivers/vfio/vfio.c | |||
@@ -0,0 +1,1420 @@ | |||
1 | /* | ||
2 | * VFIO core | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | */ | ||
15 | |||
16 | #include <linux/cdev.h> | ||
17 | #include <linux/compat.h> | ||
18 | #include <linux/device.h> | ||
19 | #include <linux/file.h> | ||
20 | #include <linux/anon_inodes.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/idr.h> | ||
23 | #include <linux/iommu.h> | ||
24 | #include <linux/list.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/mutex.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/string.h> | ||
30 | #include <linux/uaccess.h> | ||
31 | #include <linux/vfio.h> | ||
32 | #include <linux/wait.h> | ||
33 | |||
34 | #define DRIVER_VERSION "0.3" | ||
35 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
36 | #define DRIVER_DESC "VFIO - User Level meta-driver" | ||
37 | |||
38 | static struct vfio { | ||
39 | struct class *class; | ||
40 | struct list_head iommu_drivers_list; | ||
41 | struct mutex iommu_drivers_lock; | ||
42 | struct list_head group_list; | ||
43 | struct idr group_idr; | ||
44 | struct mutex group_lock; | ||
45 | struct cdev group_cdev; | ||
46 | struct device *dev; | ||
47 | dev_t devt; | ||
48 | struct cdev cdev; | ||
49 | wait_queue_head_t release_q; | ||
50 | } vfio; | ||
51 | |||
52 | struct vfio_iommu_driver { | ||
53 | const struct vfio_iommu_driver_ops *ops; | ||
54 | struct list_head vfio_next; | ||
55 | }; | ||
56 | |||
57 | struct vfio_container { | ||
58 | struct kref kref; | ||
59 | struct list_head group_list; | ||
60 | struct mutex group_lock; | ||
61 | struct vfio_iommu_driver *iommu_driver; | ||
62 | void *iommu_data; | ||
63 | }; | ||
64 | |||
65 | struct vfio_group { | ||
66 | struct kref kref; | ||
67 | int minor; | ||
68 | atomic_t container_users; | ||
69 | struct iommu_group *iommu_group; | ||
70 | struct vfio_container *container; | ||
71 | struct list_head device_list; | ||
72 | struct mutex device_lock; | ||
73 | struct device *dev; | ||
74 | struct notifier_block nb; | ||
75 | struct list_head vfio_next; | ||
76 | struct list_head container_next; | ||
77 | }; | ||
78 | |||
79 | struct vfio_device { | ||
80 | struct kref kref; | ||
81 | struct device *dev; | ||
82 | const struct vfio_device_ops *ops; | ||
83 | struct vfio_group *group; | ||
84 | struct list_head group_next; | ||
85 | void *device_data; | ||
86 | }; | ||
87 | |||
88 | /** | ||
89 | * IOMMU driver registration | ||
90 | */ | ||
91 | int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) | ||
92 | { | ||
93 | struct vfio_iommu_driver *driver, *tmp; | ||
94 | |||
95 | driver = kzalloc(sizeof(*driver), GFP_KERNEL); | ||
96 | if (!driver) | ||
97 | return -ENOMEM; | ||
98 | |||
99 | driver->ops = ops; | ||
100 | |||
101 | mutex_lock(&vfio.iommu_drivers_lock); | ||
102 | |||
103 | /* Check for duplicates */ | ||
104 | list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { | ||
105 | if (tmp->ops == ops) { | ||
106 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
107 | kfree(driver); | ||
108 | return -EINVAL; | ||
109 | } | ||
110 | } | ||
111 | |||
112 | list_add(&driver->vfio_next, &vfio.iommu_drivers_list); | ||
113 | |||
114 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
115 | |||
116 | return 0; | ||
117 | } | ||
118 | EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); | ||
119 | |||
120 | void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) | ||
121 | { | ||
122 | struct vfio_iommu_driver *driver; | ||
123 | |||
124 | mutex_lock(&vfio.iommu_drivers_lock); | ||
125 | list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { | ||
126 | if (driver->ops == ops) { | ||
127 | list_del(&driver->vfio_next); | ||
128 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
129 | kfree(driver); | ||
130 | return; | ||
131 | } | ||
132 | } | ||
133 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
134 | } | ||
135 | EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); | ||
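Only the type1 backend is added by this series, but the registration interface is generic. A skeleton of what a backend might register; the callbacks shown are the ones this file actually invokes, and the field names are assumed from the vfio_iommu_driver_ops definition in include/linux/vfio.h of this patch. The backend name and extension magic are hypothetical:

    #include <linux/errno.h>
    #include <linux/iommu.h>
    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/vfio.h>

    #define MY_IOMMU_MAGIC 0xbeef   /* hypothetical VFIO_SET_IOMMU extension value */

    static void *my_open(unsigned long arg)  { return kzalloc(1, GFP_KERNEL); }
    static void my_release(void *iommu_data) { kfree(iommu_data); }
    static long my_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg)
    {
            if (cmd == VFIO_CHECK_EXTENSION)
                    return arg == MY_IOMMU_MAGIC;    /* 1 = supported, 0 = not */
            return -ENOTTY;
    }
    static int my_attach_group(void *iommu_data, struct iommu_group *group)  { return 0; }
    static void my_detach_group(void *iommu_data, struct iommu_group *group) { }

    static const struct vfio_iommu_driver_ops my_ops = {
            .name         = "vfio-noop",             /* hypothetical backend */
            .owner        = THIS_MODULE,
            .open         = my_open,
            .release      = my_release,
            .ioctl        = my_ioctl,
            .attach_group = my_attach_group,
            .detach_group = my_detach_group,
    };

    static int __init my_init(void)  { return vfio_register_iommu_driver(&my_ops); }
    static void __exit my_exit(void) { vfio_unregister_iommu_driver(&my_ops); }
    module_init(my_init);
    module_exit(my_exit);
    MODULE_LICENSE("GPL");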
136 | |||
137 | /** | ||
138 | * Group minor allocation/free - both called with vfio.group_lock held | ||
139 | */ | ||
140 | static int vfio_alloc_group_minor(struct vfio_group *group) | ||
141 | { | ||
142 | int ret, minor; | ||
143 | |||
144 | again: | ||
145 | if (unlikely(idr_pre_get(&vfio.group_idr, GFP_KERNEL) == 0)) | ||
146 | return -ENOMEM; | ||
147 | |||
148 | /* index 0 is used by /dev/vfio/vfio */ | ||
149 | ret = idr_get_new_above(&vfio.group_idr, group, 1, &minor); | ||
150 | if (ret == -EAGAIN) | ||
151 | goto again; | ||
152 | if (ret || minor > MINORMASK) { | ||
153 | if (minor > MINORMASK) | ||
154 | idr_remove(&vfio.group_idr, minor); | ||
155 | return -ENOSPC; | ||
156 | } | ||
157 | |||
158 | return minor; | ||
159 | } | ||
160 | |||
161 | static void vfio_free_group_minor(int minor) | ||
162 | { | ||
163 | idr_remove(&vfio.group_idr, minor); | ||
164 | } | ||
165 | |||
166 | static int vfio_iommu_group_notifier(struct notifier_block *nb, | ||
167 | unsigned long action, void *data); | ||
168 | static void vfio_group_get(struct vfio_group *group); | ||
169 | |||
170 | /** | ||
171 | * Container objects - containers are created when /dev/vfio/vfio is | ||
172 | * opened, but their lifecycle extends until the last user is done, so | ||
173 | * it's freed via kref. Must support container/group/device being | ||
174 | * closed in any order. | ||
175 | */ | ||
176 | static void vfio_container_get(struct vfio_container *container) | ||
177 | { | ||
178 | kref_get(&container->kref); | ||
179 | } | ||
180 | |||
181 | static void vfio_container_release(struct kref *kref) | ||
182 | { | ||
183 | struct vfio_container *container; | ||
184 | container = container_of(kref, struct vfio_container, kref); | ||
185 | |||
186 | kfree(container); | ||
187 | } | ||
188 | |||
189 | static void vfio_container_put(struct vfio_container *container) | ||
190 | { | ||
191 | kref_put(&container->kref, vfio_container_release); | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * Group objects - create, release, get, put, search | ||
196 | */ | ||
197 | static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) | ||
198 | { | ||
199 | struct vfio_group *group, *tmp; | ||
200 | struct device *dev; | ||
201 | int ret, minor; | ||
202 | |||
203 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
204 | if (!group) | ||
205 | return ERR_PTR(-ENOMEM); | ||
206 | |||
207 | kref_init(&group->kref); | ||
208 | INIT_LIST_HEAD(&group->device_list); | ||
209 | mutex_init(&group->device_lock); | ||
210 | atomic_set(&group->container_users, 0); | ||
211 | group->iommu_group = iommu_group; | ||
212 | |||
213 | group->nb.notifier_call = vfio_iommu_group_notifier; | ||
214 | |||
215 | /* | ||
216 | * blocking notifiers acquire a rwsem around registering and hold | ||
217 | * it around callback. Therefore, need to register outside of | ||
218 | * vfio.group_lock to avoid A-B/B-A contention. Our callback won't | ||
219 | * do anything unless it can find the group in vfio.group_list, so | ||
220 | * no harm in registering early. | ||
221 | */ | ||
222 | ret = iommu_group_register_notifier(iommu_group, &group->nb); | ||
223 | if (ret) { | ||
224 | kfree(group); | ||
225 | return ERR_PTR(ret); | ||
226 | } | ||
227 | |||
228 | mutex_lock(&vfio.group_lock); | ||
229 | |||
230 | minor = vfio_alloc_group_minor(group); | ||
231 | if (minor < 0) { | ||
232 | mutex_unlock(&vfio.group_lock); | ||
233 | kfree(group); | ||
234 | return ERR_PTR(minor); | ||
235 | } | ||
236 | |||
237 | /* Did we race creating this group? */ | ||
238 | list_for_each_entry(tmp, &vfio.group_list, vfio_next) { | ||
239 | if (tmp->iommu_group == iommu_group) { | ||
240 | vfio_group_get(tmp); | ||
241 | vfio_free_group_minor(minor); | ||
242 | mutex_unlock(&vfio.group_lock); | ||
243 | kfree(group); | ||
244 | return tmp; | ||
245 | } | ||
246 | } | ||
247 | |||
248 | dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor), | ||
249 | group, "%d", iommu_group_id(iommu_group)); | ||
250 | if (IS_ERR(dev)) { | ||
251 | vfio_free_group_minor(minor); | ||
252 | mutex_unlock(&vfio.group_lock); | ||
253 | kfree(group); | ||
254 | return (struct vfio_group *)dev; /* ERR_PTR */ | ||
255 | } | ||
256 | |||
257 | group->minor = minor; | ||
258 | group->dev = dev; | ||
259 | |||
260 | list_add(&group->vfio_next, &vfio.group_list); | ||
261 | |||
262 | mutex_unlock(&vfio.group_lock); | ||
263 | |||
264 | return group; | ||
265 | } | ||
266 | |||
267 | static void vfio_group_release(struct kref *kref) | ||
268 | { | ||
269 | struct vfio_group *group = container_of(kref, struct vfio_group, kref); | ||
270 | |||
271 | WARN_ON(!list_empty(&group->device_list)); | ||
272 | |||
273 | device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor)); | ||
274 | list_del(&group->vfio_next); | ||
275 | vfio_free_group_minor(group->minor); | ||
276 | |||
277 | mutex_unlock(&vfio.group_lock); | ||
278 | |||
279 | /* | ||
280 | * Unregister outside of lock. A spurious callback is harmless now | ||
281 | * that the group is no longer in vfio.group_list. | ||
282 | */ | ||
283 | iommu_group_unregister_notifier(group->iommu_group, &group->nb); | ||
284 | |||
285 | kfree(group); | ||
286 | } | ||
287 | |||
288 | static void vfio_group_put(struct vfio_group *group) | ||
289 | { | ||
290 | mutex_lock(&vfio.group_lock); | ||
291 | /* | ||
292 | * Release needs to unlock to unregister the notifier, so only | ||
293 | * unlock if not released. | ||
294 | */ | ||
295 | if (!kref_put(&group->kref, vfio_group_release)) | ||
296 | mutex_unlock(&vfio.group_lock); | ||
297 | } | ||
298 | |||
299 | /* Assume group_lock or group reference is held */ | ||
300 | static void vfio_group_get(struct vfio_group *group) | ||
301 | { | ||
302 | kref_get(&group->kref); | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * Not really a try as we will sleep for mutex, but we need to make | ||
307 | * sure the group pointer is valid under lock and get a reference. | ||
308 | */ | ||
309 | static struct vfio_group *vfio_group_try_get(struct vfio_group *group) | ||
310 | { | ||
311 | struct vfio_group *target = group; | ||
312 | |||
313 | mutex_lock(&vfio.group_lock); | ||
314 | list_for_each_entry(group, &vfio.group_list, vfio_next) { | ||
315 | if (group == target) { | ||
316 | vfio_group_get(group); | ||
317 | mutex_unlock(&vfio.group_lock); | ||
318 | return group; | ||
319 | } | ||
320 | } | ||
321 | mutex_unlock(&vfio.group_lock); | ||
322 | |||
323 | return NULL; | ||
324 | } | ||
325 | |||
326 | static | ||
327 | struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) | ||
328 | { | ||
329 | struct vfio_group *group; | ||
330 | |||
331 | mutex_lock(&vfio.group_lock); | ||
332 | list_for_each_entry(group, &vfio.group_list, vfio_next) { | ||
333 | if (group->iommu_group == iommu_group) { | ||
334 | vfio_group_get(group); | ||
335 | mutex_unlock(&vfio.group_lock); | ||
336 | return group; | ||
337 | } | ||
338 | } | ||
339 | mutex_unlock(&vfio.group_lock); | ||
340 | |||
341 | return NULL; | ||
342 | } | ||
343 | |||
344 | static struct vfio_group *vfio_group_get_from_minor(int minor) | ||
345 | { | ||
346 | struct vfio_group *group; | ||
347 | |||
348 | mutex_lock(&vfio.group_lock); | ||
349 | group = idr_find(&vfio.group_idr, minor); | ||
350 | if (!group) { | ||
351 | mutex_unlock(&vfio.group_lock); | ||
352 | return NULL; | ||
353 | } | ||
354 | vfio_group_get(group); | ||
355 | mutex_unlock(&vfio.group_lock); | ||
356 | |||
357 | return group; | ||
358 | } | ||
359 | |||
360 | /** | ||
361 | * Device objects - create, release, get, put, search | ||
362 | */ | ||
363 | static | ||
364 | struct vfio_device *vfio_group_create_device(struct vfio_group *group, | ||
365 | struct device *dev, | ||
366 | const struct vfio_device_ops *ops, | ||
367 | void *device_data) | ||
368 | { | ||
369 | struct vfio_device *device; | ||
370 | int ret; | ||
371 | |||
372 | device = kzalloc(sizeof(*device), GFP_KERNEL); | ||
373 | if (!device) | ||
374 | return ERR_PTR(-ENOMEM); | ||
375 | |||
376 | kref_init(&device->kref); | ||
377 | device->dev = dev; | ||
378 | device->group = group; | ||
379 | device->ops = ops; | ||
380 | device->device_data = device_data; | ||
381 | |||
382 | ret = dev_set_drvdata(dev, device); | ||
383 | if (ret) { | ||
384 | kfree(device); | ||
385 | return ERR_PTR(ret); | ||
386 | } | ||
387 | |||
388 | /* No need to get group_lock, caller has group reference */ | ||
389 | vfio_group_get(group); | ||
390 | |||
391 | mutex_lock(&group->device_lock); | ||
392 | list_add(&device->group_next, &group->device_list); | ||
393 | mutex_unlock(&group->device_lock); | ||
394 | |||
395 | return device; | ||
396 | } | ||
397 | |||
398 | static void vfio_device_release(struct kref *kref) | ||
399 | { | ||
400 | struct vfio_device *device = container_of(kref, | ||
401 | struct vfio_device, kref); | ||
402 | struct vfio_group *group = device->group; | ||
403 | |||
404 | mutex_lock(&group->device_lock); | ||
405 | list_del(&device->group_next); | ||
406 | mutex_unlock(&group->device_lock); | ||
407 | |||
408 | dev_set_drvdata(device->dev, NULL); | ||
409 | |||
410 | kfree(device); | ||
411 | |||
412 | /* vfio_del_group_dev may be waiting for this device */ | ||
413 | wake_up(&vfio.release_q); | ||
414 | } | ||
415 | |||
416 | /* Device reference always implies a group reference */ | ||
417 | static void vfio_device_put(struct vfio_device *device) | ||
418 | { | ||
419 | kref_put(&device->kref, vfio_device_release); | ||
420 | vfio_group_put(device->group); | ||
421 | } | ||
422 | |||
423 | static void vfio_device_get(struct vfio_device *device) | ||
424 | { | ||
425 | vfio_group_get(device->group); | ||
426 | kref_get(&device->kref); | ||
427 | } | ||
428 | |||
429 | static struct vfio_device *vfio_group_get_device(struct vfio_group *group, | ||
430 | struct device *dev) | ||
431 | { | ||
432 | struct vfio_device *device; | ||
433 | |||
434 | mutex_lock(&group->device_lock); | ||
435 | list_for_each_entry(device, &group->device_list, group_next) { | ||
436 | if (device->dev == dev) { | ||
437 | vfio_device_get(device); | ||
438 | mutex_unlock(&group->device_lock); | ||
439 | return device; | ||
440 | } | ||
441 | } | ||
442 | mutex_unlock(&group->device_lock); | ||
443 | return NULL; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Whitelist some drivers that we know are safe (no dma) or just sit on | ||
448 | * a device. It's not always practical to leave a device within a group | ||
449 | * driverless as it could get re-bound to something unsafe. | ||
450 | */ | ||
451 | static const char * const vfio_driver_whitelist[] = { "pci-stub" }; | ||
452 | |||
453 | static bool vfio_whitelisted_driver(struct device_driver *drv) | ||
454 | { | ||
455 | int i; | ||
456 | |||
457 | for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) { | ||
458 | if (!strcmp(drv->name, vfio_driver_whitelist[i])) | ||
459 | return true; | ||
460 | } | ||
461 | |||
462 | return false; | ||
463 | } | ||
464 | |||
465 | /* | ||
466 | * A vfio group is viable for use by userspace if all devices are either | ||
467 | * driver-less or bound to a vfio or whitelisted driver. We test the | ||
468 | * latter by the existence of a struct vfio_device matching the dev. | ||
469 | */ | ||
470 | static int vfio_dev_viable(struct device *dev, void *data) | ||
471 | { | ||
472 | struct vfio_group *group = data; | ||
473 | struct vfio_device *device; | ||
474 | |||
475 | if (!dev->driver || vfio_whitelisted_driver(dev->driver)) | ||
476 | return 0; | ||
477 | |||
478 | device = vfio_group_get_device(group, dev); | ||
479 | if (device) { | ||
480 | vfio_device_put(device); | ||
481 | return 0; | ||
482 | } | ||
483 | |||
484 | return -EINVAL; | ||
485 | } | ||
486 | |||
487 | /** | ||
488 | * Async device support | ||
489 | */ | ||
490 | static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) | ||
491 | { | ||
492 | struct vfio_device *device; | ||
493 | |||
494 | /* Do we already know about it? We shouldn't */ | ||
495 | device = vfio_group_get_device(group, dev); | ||
496 | if (WARN_ON_ONCE(device)) { | ||
497 | vfio_device_put(device); | ||
498 | return 0; | ||
499 | } | ||
500 | |||
501 | /* Nothing to do for idle groups */ | ||
502 | if (!atomic_read(&group->container_users)) | ||
503 | return 0; | ||
504 | |||
505 | /* TODO Prevent device auto probing */ | ||
506 | WARN("Device %s added to live group %d!\n", dev_name(dev), | ||
507 | iommu_group_id(group->iommu_group)); | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev) | ||
513 | { | ||
514 | struct vfio_device *device; | ||
515 | |||
516 | /* | ||
517 | * Expect to fall out here. If a device was in use, it would | ||
518 | * have been bound to a vfio sub-driver, which would have blocked | ||
519 | * in .remove at vfio_del_group_dev. Sanity check that we no | ||
520 | * longer track the device, so it's safe to remove. | ||
521 | */ | ||
522 | device = vfio_group_get_device(group, dev); | ||
523 | if (likely(!device)) | ||
524 | return 0; | ||
525 | |||
526 | WARN("Device %s removed from live group %d!\n", dev_name(dev), | ||
527 | iommu_group_id(group->iommu_group)); | ||
528 | |||
529 | vfio_device_put(device); | ||
530 | return 0; | ||
531 | } | ||
532 | |||
533 | static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev) | ||
534 | { | ||
535 | /* We don't care what happens when the group isn't in use */ | ||
536 | if (!atomic_read(&group->container_users)) | ||
537 | return 0; | ||
538 | |||
539 | return vfio_dev_viable(dev, group); | ||
540 | } | ||
541 | |||
542 | static int vfio_iommu_group_notifier(struct notifier_block *nb, | ||
543 | unsigned long action, void *data) | ||
544 | { | ||
545 | struct vfio_group *group = container_of(nb, struct vfio_group, nb); | ||
546 | struct device *dev = data; | ||
547 | |||
548 | /* | ||
549 | * Need to go through a group_lock lookup to get a reference or | ||
550 | * we risk racing a group being removed. Leave a WARN_ON for | ||
551 | * debugging, but if the group no longer exists, a spurious notify | ||
552 | * is harmless. | ||
553 | */ | ||
554 | group = vfio_group_try_get(group); | ||
555 | if (WARN_ON(!group)) | ||
556 | return NOTIFY_OK; | ||
557 | |||
558 | switch (action) { | ||
559 | case IOMMU_GROUP_NOTIFY_ADD_DEVICE: | ||
560 | vfio_group_nb_add_dev(group, dev); | ||
561 | break; | ||
562 | case IOMMU_GROUP_NOTIFY_DEL_DEVICE: | ||
563 | vfio_group_nb_del_dev(group, dev); | ||
564 | break; | ||
565 | case IOMMU_GROUP_NOTIFY_BIND_DRIVER: | ||
566 | pr_debug("%s: Device %s, group %d binding to driver\n", | ||
567 | __func__, dev_name(dev), | ||
568 | iommu_group_id(group->iommu_group)); | ||
569 | break; | ||
570 | case IOMMU_GROUP_NOTIFY_BOUND_DRIVER: | ||
571 | pr_debug("%s: Device %s, group %d bound to driver %s\n", | ||
572 | __func__, dev_name(dev), | ||
573 | iommu_group_id(group->iommu_group), dev->driver->name); | ||
574 | BUG_ON(vfio_group_nb_verify(group, dev)); | ||
575 | break; | ||
576 | case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER: | ||
577 | pr_debug("%s: Device %s, group %d unbinding from driver %s\n", | ||
578 | __func__, dev_name(dev), | ||
579 | iommu_group_id(group->iommu_group), dev->driver->name); | ||
580 | break; | ||
581 | case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER: | ||
582 | pr_debug("%s: Device %s, group %d unbound from driver\n", | ||
583 | __func__, dev_name(dev), | ||
584 | iommu_group_id(group->iommu_group)); | ||
585 | /* | ||
586 | * XXX An unbound device in a live group is ok, but we'd | ||
587 | * really like to avoid the above BUG_ON by preventing other | ||
588 | * drivers from binding to it. Once that occurs, we have to | ||
589 | * stop the system to maintain isolation. At a minimum, we'd | ||
590 | * want a toggle to disable driver auto probe for this device. | ||
591 | */ | ||
592 | break; | ||
593 | } | ||
594 | |||
595 | vfio_group_put(group); | ||
596 | return NOTIFY_OK; | ||
597 | } | ||
598 | |||
599 | /** | ||
600 | * VFIO driver API | ||
601 | */ | ||
602 | int vfio_add_group_dev(struct device *dev, | ||
603 | const struct vfio_device_ops *ops, void *device_data) | ||
604 | { | ||
605 | struct iommu_group *iommu_group; | ||
606 | struct vfio_group *group; | ||
607 | struct vfio_device *device; | ||
608 | |||
609 | iommu_group = iommu_group_get(dev); | ||
610 | if (!iommu_group) | ||
611 | return -EINVAL; | ||
612 | |||
613 | group = vfio_group_get_from_iommu(iommu_group); | ||
614 | if (!group) { | ||
615 | group = vfio_create_group(iommu_group); | ||
616 | if (IS_ERR(group)) { | ||
617 | iommu_group_put(iommu_group); | ||
618 | return PTR_ERR(group); | ||
619 | } | ||
620 | } | ||
621 | |||
622 | device = vfio_group_get_device(group, dev); | ||
623 | if (device) { | ||
624 | WARN(1, "Device %s already exists on group %d\n", | ||
625 | dev_name(dev), iommu_group_id(iommu_group)); | ||
626 | vfio_device_put(device); | ||
627 | vfio_group_put(group); | ||
628 | iommu_group_put(iommu_group); | ||
629 | return -EBUSY; | ||
630 | } | ||
631 | |||
632 | device = vfio_group_create_device(group, dev, ops, device_data); | ||
633 | if (IS_ERR(device)) { | ||
634 | vfio_group_put(group); | ||
635 | iommu_group_put(iommu_group); | ||
636 | return PTR_ERR(device); | ||
637 | } | ||
638 | |||
639 | /* | ||
640 | * Added device holds reference to iommu_group and vfio_device | ||
641 | * (which in turn holds reference to vfio_group). Drop extra | ||
642 | * group reference used while acquiring device. | ||
643 | */ | ||
644 | vfio_group_put(group); | ||
645 | |||
646 | return 0; | ||
647 | } | ||
648 | EXPORT_SYMBOL_GPL(vfio_add_group_dev); | ||
649 | |||
650 | /* Test whether a struct device is present in our tracking */ | ||
651 | static bool vfio_dev_present(struct device *dev) | ||
652 | { | ||
653 | struct iommu_group *iommu_group; | ||
654 | struct vfio_group *group; | ||
655 | struct vfio_device *device; | ||
656 | |||
657 | iommu_group = iommu_group_get(dev); | ||
658 | if (!iommu_group) | ||
659 | return false; | ||
660 | |||
661 | group = vfio_group_get_from_iommu(iommu_group); | ||
662 | if (!group) { | ||
663 | iommu_group_put(iommu_group); | ||
664 | return false; | ||
665 | } | ||
666 | |||
667 | device = vfio_group_get_device(group, dev); | ||
668 | if (!device) { | ||
669 | vfio_group_put(group); | ||
670 | iommu_group_put(iommu_group); | ||
671 | return false; | ||
672 | } | ||
673 | |||
674 | vfio_device_put(device); | ||
675 | vfio_group_put(group); | ||
676 | iommu_group_put(iommu_group); | ||
677 | return true; | ||
678 | } | ||
679 | |||
680 | /* | ||
681 | * Decrement the device reference count and wait for the device to be | ||
682 | * removed. Open file descriptors for the device... */ | ||
683 | void *vfio_del_group_dev(struct device *dev) | ||
684 | { | ||
685 | struct vfio_device *device = dev_get_drvdata(dev); | ||
686 | struct vfio_group *group = device->group; | ||
687 | struct iommu_group *iommu_group = group->iommu_group; | ||
688 | void *device_data = device->device_data; | ||
689 | |||
690 | vfio_device_put(device); | ||
691 | |||
692 | /* TODO send a signal to encourage this to be released */ | ||
693 | wait_event(vfio.release_q, !vfio_dev_present(dev)); | ||
694 | |||
695 | iommu_group_put(iommu_group); | ||
696 | |||
697 | return device_data; | ||
698 | } | ||
699 | EXPORT_SYMBOL_GPL(vfio_del_group_dev); | ||
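vfio_add_group_dev()/vfio_del_group_dev() are the whole driver-facing API: a bus driver (vfio-pci in this series) registers each device it binds into that device's group, and unregisters it on remove, where vfio_del_group_dev() blocks until any open device file descriptors are released. A sketch of a caller; my_vfio_ops stands in for a struct vfio_device_ops whose open/release/ioctl/read/write/mmap callbacks would be defined elsewhere:

    #include <linux/pci.h>
    #include <linux/slab.h>
    #include <linux/vfio.h>

    struct my_device {
            struct pci_dev *pdev;
    };

    static const struct vfio_device_ops my_vfio_ops;   /* real callbacks required in practice */

    static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    {
            struct my_device *mydev;
            int ret;

            mydev = kzalloc(sizeof(*mydev), GFP_KERNEL);
            if (!mydev)
                    return -ENOMEM;
            mydev->pdev = pdev;

            /* Fails unless the IOMMU layer already placed the device in an iommu_group */
            ret = vfio_add_group_dev(&pdev->dev, &my_vfio_ops, mydev);
            if (ret)
                    kfree(mydev);
            return ret;
    }

    static void my_remove(struct pci_dev *pdev)
    {
            /* Returns the device_data passed above, after waiting for open fds to close */
            struct my_device *mydev = vfio_del_group_dev(&pdev->dev);

            kfree(mydev);
    }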
700 | |||
701 | /** | ||
702 | * VFIO base fd, /dev/vfio/vfio | ||
703 | */ | ||
704 | static long vfio_ioctl_check_extension(struct vfio_container *container, | ||
705 | unsigned long arg) | ||
706 | { | ||
707 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
708 | long ret = 0; | ||
709 | |||
710 | switch (arg) { | ||
711 | /* No base extensions yet */ | ||
712 | default: | ||
713 | /* | ||
714 | * If no driver is set, poll all registered drivers for | ||
715 | * extensions and return the first positive result. If | ||
716 | * a driver is already set, further queries will be passed | ||
717 | * only to that driver. | ||
718 | */ | ||
719 | if (!driver) { | ||
720 | mutex_lock(&vfio.iommu_drivers_lock); | ||
721 | list_for_each_entry(driver, &vfio.iommu_drivers_list, | ||
722 | vfio_next) { | ||
723 | if (!try_module_get(driver->ops->owner)) | ||
724 | continue; | ||
725 | |||
726 | ret = driver->ops->ioctl(NULL, | ||
727 | VFIO_CHECK_EXTENSION, | ||
728 | arg); | ||
729 | module_put(driver->ops->owner); | ||
730 | if (ret > 0) | ||
731 | break; | ||
732 | } | ||
733 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
734 | } else | ||
735 | ret = driver->ops->ioctl(container->iommu_data, | ||
736 | VFIO_CHECK_EXTENSION, arg); | ||
737 | } | ||
738 | |||
739 | return ret; | ||
740 | } | ||
741 | |||
742 | /* hold container->group_lock */ | ||
743 | static int __vfio_container_attach_groups(struct vfio_container *container, | ||
744 | struct vfio_iommu_driver *driver, | ||
745 | void *data) | ||
746 | { | ||
747 | struct vfio_group *group; | ||
748 | int ret = -ENODEV; | ||
749 | |||
750 | list_for_each_entry(group, &container->group_list, container_next) { | ||
751 | ret = driver->ops->attach_group(data, group->iommu_group); | ||
752 | if (ret) | ||
753 | goto unwind; | ||
754 | } | ||
755 | |||
756 | return ret; | ||
757 | |||
758 | unwind: | ||
759 | list_for_each_entry_continue_reverse(group, &container->group_list, | ||
760 | container_next) { | ||
761 | driver->ops->detach_group(data, group->iommu_group); | ||
762 | } | ||
763 | |||
764 | return ret; | ||
765 | } | ||
766 | |||
767 | static long vfio_ioctl_set_iommu(struct vfio_container *container, | ||
768 | unsigned long arg) | ||
769 | { | ||
770 | struct vfio_iommu_driver *driver; | ||
771 | long ret = -ENODEV; | ||
772 | |||
773 | mutex_lock(&container->group_lock); | ||
774 | |||
775 | /* | ||
776 | * The container is designed to be an unprivileged interface while | ||
777 | * the group can be assigned to specific users. Therefore, only by | ||
778 | * adding a group to a container does the user get the privilege of | ||
779 | * enabling the iommu, which may allocate finite resources. There | ||
780 | * is no unset_iommu, but by removing all the groups from a container, | ||
781 | * the container is deprivileged and returns to an unset state. | ||
782 | */ | ||
783 | if (list_empty(&container->group_list) || container->iommu_driver) { | ||
784 | mutex_unlock(&container->group_lock); | ||
785 | return -EINVAL; | ||
786 | } | ||
787 | |||
788 | mutex_lock(&vfio.iommu_drivers_lock); | ||
789 | list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { | ||
790 | void *data; | ||
791 | |||
792 | if (!try_module_get(driver->ops->owner)) | ||
793 | continue; | ||
794 | |||
795 | /* | ||
796 | * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, | ||
797 | * so test which iommu driver reported support for this | ||
798 | * extension and call open on them. We also pass them the | ||
799 | * magic, allowing a single driver to support multiple | ||
800 | * interfaces if they'd like. | ||
801 | */ | ||
802 | if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { | ||
803 | module_put(driver->ops->owner); | ||
804 | continue; | ||
805 | } | ||
806 | |||
807 | /* module reference holds the driver we're working on */ | ||
808 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
809 | |||
810 | data = driver->ops->open(arg); | ||
811 | if (IS_ERR(data)) { | ||
812 | ret = PTR_ERR(data); | ||
813 | module_put(driver->ops->owner); | ||
814 | goto skip_drivers_unlock; | ||
815 | } | ||
816 | |||
817 | ret = __vfio_container_attach_groups(container, driver, data); | ||
818 | if (!ret) { | ||
819 | container->iommu_driver = driver; | ||
820 | container->iommu_data = data; | ||
821 | } else { | ||
822 | driver->ops->release(data); | ||
823 | module_put(driver->ops->owner); | ||
824 | } | ||
825 | |||
826 | goto skip_drivers_unlock; | ||
827 | } | ||
828 | |||
829 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
830 | skip_drivers_unlock: | ||
831 | mutex_unlock(&container->group_lock); | ||
832 | |||
833 | return ret; | ||
834 | } | ||
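From userspace, picking a backend is a two-step exchange on the container fd: VFIO_CHECK_EXTENSION asks which IOMMU models are available, and VFIO_SET_IOMMU instantiates one, which per the comment above is only legal once at least one group has been added to the container. A sketch using the type1 backend added later in this series; it assumes the group attach (shown further down) happens between the two ioctls:

    #include <fcntl.h>
    #include <linux/vfio.h>
    #include <sys/ioctl.h>

    static int setup_container(void)
    {
            int container = open("/dev/vfio/vfio", O_RDWR);

            if (container < 0 || ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
                    return -1;
            if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) <= 0)
                    return -1;

            /* ... attach at least one group (VFIO_GROUP_SET_CONTAINER) here ... */

            if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
                    return -1;
            return container;
    }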
835 | |||
836 | static long vfio_fops_unl_ioctl(struct file *filep, | ||
837 | unsigned int cmd, unsigned long arg) | ||
838 | { | ||
839 | struct vfio_container *container = filep->private_data; | ||
840 | struct vfio_iommu_driver *driver; | ||
841 | void *data; | ||
842 | long ret = -EINVAL; | ||
843 | |||
844 | if (!container) | ||
845 | return ret; | ||
846 | |||
847 | driver = container->iommu_driver; | ||
848 | data = container->iommu_data; | ||
849 | |||
850 | switch (cmd) { | ||
851 | case VFIO_GET_API_VERSION: | ||
852 | ret = VFIO_API_VERSION; | ||
853 | break; | ||
854 | case VFIO_CHECK_EXTENSION: | ||
855 | ret = vfio_ioctl_check_extension(container, arg); | ||
856 | break; | ||
857 | case VFIO_SET_IOMMU: | ||
858 | ret = vfio_ioctl_set_iommu(container, arg); | ||
859 | break; | ||
860 | default: | ||
861 | if (driver) /* passthrough all unrecognized ioctls */ | ||
862 | ret = driver->ops->ioctl(data, cmd, arg); | ||
863 | } | ||
864 | |||
865 | return ret; | ||
866 | } | ||
867 | |||
868 | #ifdef CONFIG_COMPAT | ||
869 | static long vfio_fops_compat_ioctl(struct file *filep, | ||
870 | unsigned int cmd, unsigned long arg) | ||
871 | { | ||
872 | arg = (unsigned long)compat_ptr(arg); | ||
873 | return vfio_fops_unl_ioctl(filep, cmd, arg); | ||
874 | } | ||
875 | #endif /* CONFIG_COMPAT */ | ||
876 | |||
877 | static int vfio_fops_open(struct inode *inode, struct file *filep) | ||
878 | { | ||
879 | struct vfio_container *container; | ||
880 | |||
881 | container = kzalloc(sizeof(*container), GFP_KERNEL); | ||
882 | if (!container) | ||
883 | return -ENOMEM; | ||
884 | |||
885 | INIT_LIST_HEAD(&container->group_list); | ||
886 | mutex_init(&container->group_lock); | ||
887 | kref_init(&container->kref); | ||
888 | |||
889 | filep->private_data = container; | ||
890 | |||
891 | return 0; | ||
892 | } | ||
893 | |||
894 | static int vfio_fops_release(struct inode *inode, struct file *filep) | ||
895 | { | ||
896 | struct vfio_container *container = filep->private_data; | ||
897 | |||
898 | filep->private_data = NULL; | ||
899 | |||
900 | vfio_container_put(container); | ||
901 | |||
902 | return 0; | ||
903 | } | ||
904 | |||
905 | /* | ||
906 | * Once an iommu driver is set, we optionally pass read/write/mmap | ||
907 | * on to the driver, allowing management interfaces beyond ioctl. | ||
908 | */ | ||
909 | static ssize_t vfio_fops_read(struct file *filep, char __user *buf, | ||
910 | size_t count, loff_t *ppos) | ||
911 | { | ||
912 | struct vfio_container *container = filep->private_data; | ||
913 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
914 | |||
915 | if (unlikely(!driver || !driver->ops->read)) | ||
916 | return -EINVAL; | ||
917 | |||
918 | return driver->ops->read(container->iommu_data, buf, count, ppos); | ||
919 | } | ||
920 | |||
921 | static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, | ||
922 | size_t count, loff_t *ppos) | ||
923 | { | ||
924 | struct vfio_container *container = filep->private_data; | ||
925 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
926 | |||
927 | if (unlikely(!driver || !driver->ops->write)) | ||
928 | return -EINVAL; | ||
929 | |||
930 | return driver->ops->write(container->iommu_data, buf, count, ppos); | ||
931 | } | ||
932 | |||
933 | static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) | ||
934 | { | ||
935 | struct vfio_container *container = filep->private_data; | ||
936 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
937 | |||
938 | if (unlikely(!driver || !driver->ops->mmap)) | ||
939 | return -EINVAL; | ||
940 | |||
941 | return driver->ops->mmap(container->iommu_data, vma); | ||
942 | } | ||
943 | |||
944 | static const struct file_operations vfio_fops = { | ||
945 | .owner = THIS_MODULE, | ||
946 | .open = vfio_fops_open, | ||
947 | .release = vfio_fops_release, | ||
948 | .read = vfio_fops_read, | ||
949 | .write = vfio_fops_write, | ||
950 | .unlocked_ioctl = vfio_fops_unl_ioctl, | ||
951 | #ifdef CONFIG_COMPAT | ||
952 | .compat_ioctl = vfio_fops_compat_ioctl, | ||
953 | #endif | ||
954 | .mmap = vfio_fops_mmap, | ||
955 | }; | ||
956 | |||
957 | /** | ||
958 | * VFIO Group fd, /dev/vfio/$GROUP | ||
959 | */ | ||
960 | static void __vfio_group_unset_container(struct vfio_group *group) | ||
961 | { | ||
962 | struct vfio_container *container = group->container; | ||
963 | struct vfio_iommu_driver *driver; | ||
964 | |||
965 | mutex_lock(&container->group_lock); | ||
966 | |||
967 | driver = container->iommu_driver; | ||
968 | if (driver) | ||
969 | driver->ops->detach_group(container->iommu_data, | ||
970 | group->iommu_group); | ||
971 | |||
972 | group->container = NULL; | ||
973 | list_del(&group->container_next); | ||
974 | |||
975 | /* Detaching the last group deprivileges a container, remove iommu */ | ||
976 | if (driver && list_empty(&container->group_list)) { | ||
977 | driver->ops->release(container->iommu_data); | ||
978 | module_put(driver->ops->owner); | ||
979 | container->iommu_driver = NULL; | ||
980 | container->iommu_data = NULL; | ||
981 | } | ||
982 | |||
983 | mutex_unlock(&container->group_lock); | ||
984 | |||
985 | vfio_container_put(container); | ||
986 | } | ||
987 | |||
988 | /* | ||
989 | * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or | ||
990 | * if there was no container to unset. Since the ioctl is called on | ||
991 | * the group, we know it still exists, therefore the only valid | ||
992 | * transition here is 1->0. | ||
993 | */ | ||
994 | static int vfio_group_unset_container(struct vfio_group *group) | ||
995 | { | ||
996 | int users = atomic_cmpxchg(&group->container_users, 1, 0); | ||
997 | |||
998 | if (!users) | ||
999 | return -EINVAL; | ||
1000 | if (users != 1) | ||
1001 | return -EBUSY; | ||
1002 | |||
1003 | __vfio_group_unset_container(group); | ||
1004 | |||
1005 | return 0; | ||
1006 | } | ||
1007 | |||
1008 | /* | ||
1009 | * When removing container users, anything that removes the last user | ||
1010 | * implicitly removes the group from the container. That is, if the | ||
1011 | * group file descriptor is closed, as well as any device file descriptors, | ||
1012 | * the group is free. | ||
1013 | */ | ||
1014 | static void vfio_group_try_dissolve_container(struct vfio_group *group) | ||
1015 | { | ||
1016 | if (0 == atomic_dec_if_positive(&group->container_users)) | ||
1017 | __vfio_group_unset_container(group); | ||
1018 | } | ||
1019 | |||
1020 | static int vfio_group_set_container(struct vfio_group *group, int container_fd) | ||
1021 | { | ||
1022 | struct file *filep; | ||
1023 | struct vfio_container *container; | ||
1024 | struct vfio_iommu_driver *driver; | ||
1025 | int ret = 0; | ||
1026 | |||
1027 | if (atomic_read(&group->container_users)) | ||
1028 | return -EINVAL; | ||
1029 | |||
1030 | filep = fget(container_fd); | ||
1031 | if (!filep) | ||
1032 | return -EBADF; | ||
1033 | |||
1034 | /* Sanity check, is this really our fd? */ | ||
1035 | if (filep->f_op != &vfio_fops) { | ||
1036 | fput(filep); | ||
1037 | return -EINVAL; | ||
1038 | } | ||
1039 | |||
1040 | container = filep->private_data; | ||
1041 | WARN_ON(!container); /* fget ensures we don't race vfio_release */ | ||
1042 | |||
1043 | mutex_lock(&container->group_lock); | ||
1044 | |||
1045 | driver = container->iommu_driver; | ||
1046 | if (driver) { | ||
1047 | ret = driver->ops->attach_group(container->iommu_data, | ||
1048 | group->iommu_group); | ||
1049 | if (ret) | ||
1050 | goto unlock_out; | ||
1051 | } | ||
1052 | |||
1053 | group->container = container; | ||
1054 | list_add(&group->container_next, &container->group_list); | ||
1055 | |||
1056 | /* Get a reference on the container and mark a user within the group */ | ||
1057 | vfio_container_get(container); | ||
1058 | atomic_inc(&group->container_users); | ||
1059 | |||
1060 | unlock_out: | ||
1061 | mutex_unlock(&container->group_lock); | ||
1062 | fput(filep); | ||
1063 | |||
1064 | return ret; | ||
1065 | } | ||
1066 | |||
1067 | static bool vfio_group_viable(struct vfio_group *group) | ||
1068 | { | ||
1069 | return (iommu_group_for_each_dev(group->iommu_group, | ||
1070 | group, vfio_dev_viable) == 0); | ||
1071 | } | ||
1072 | |||
1073 | static const struct file_operations vfio_device_fops; | ||
1074 | |||
1075 | static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) | ||
1076 | { | ||
1077 | struct vfio_device *device; | ||
1078 | struct file *filep; | ||
1079 | int ret = -ENODEV; | ||
1080 | |||
1081 | if (0 == atomic_read(&group->container_users) || | ||
1082 | !group->container->iommu_driver || !vfio_group_viable(group)) | ||
1083 | return -EINVAL; | ||
1084 | |||
1085 | mutex_lock(&group->device_lock); | ||
1086 | list_for_each_entry(device, &group->device_list, group_next) { | ||
1087 | if (strcmp(dev_name(device->dev), buf)) | ||
1088 | continue; | ||
1089 | |||
1090 | ret = device->ops->open(device->device_data); | ||
1091 | if (ret) | ||
1092 | break; | ||
1093 | /* | ||
1094 | * We can't use anon_inode_getfd() because we need to modify | ||
1095 | * the f_mode flags directly to allow more than just ioctls | ||
1096 | */ | ||
1097 | ret = get_unused_fd(); | ||
1098 | if (ret < 0) { | ||
1099 | device->ops->release(device->device_data); | ||
1100 | break; | ||
1101 | } | ||
1102 | |||
1103 | filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, | ||
1104 | device, O_RDWR); | ||
1105 | if (IS_ERR(filep)) { | ||
1106 | put_unused_fd(ret); | ||
1107 | ret = PTR_ERR(filep); | ||
1108 | device->ops->release(device->device_data); | ||
1109 | break; | ||
1110 | } | ||
1111 | |||
1112 | /* | ||
1113 | * TODO: add an anon_inode interface to do this. | ||
1114 | * Appears to be missing by lack of need rather than | ||
1115 | * explicitly prevented. Now there's need. | ||
1116 | */ | ||
1117 | filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); | ||
1118 | |||
1119 | fd_install(ret, filep); | ||
1120 | |||
1121 | vfio_device_get(device); | ||
1122 | atomic_inc(&group->container_users); | ||
1123 | break; | ||
1124 | } | ||
1125 | mutex_unlock(&group->device_lock); | ||
1126 | |||
1127 | return ret; | ||
1128 | } | ||
1129 | |||
1130 | static long vfio_group_fops_unl_ioctl(struct file *filep, | ||
1131 | unsigned int cmd, unsigned long arg) | ||
1132 | { | ||
1133 | struct vfio_group *group = filep->private_data; | ||
1134 | long ret = -ENOTTY; | ||
1135 | |||
1136 | switch (cmd) { | ||
1137 | case VFIO_GROUP_GET_STATUS: | ||
1138 | { | ||
1139 | struct vfio_group_status status; | ||
1140 | unsigned long minsz; | ||
1141 | |||
1142 | minsz = offsetofend(struct vfio_group_status, flags); | ||
1143 | |||
1144 | if (copy_from_user(&status, (void __user *)arg, minsz)) | ||
1145 | return -EFAULT; | ||
1146 | |||
1147 | if (status.argsz < minsz) | ||
1148 | return -EINVAL; | ||
1149 | |||
1150 | status.flags = 0; | ||
1151 | |||
1152 | if (vfio_group_viable(group)) | ||
1153 | status.flags |= VFIO_GROUP_FLAGS_VIABLE; | ||
1154 | |||
1155 | if (group->container) | ||
1156 | status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET; | ||
1157 | |||
1158 | if (copy_to_user((void __user *)arg, &status, minsz)) | ||
1159 | return -EFAULT; | ||
1160 | |||
1161 | ret = 0; | ||
1162 | break; | ||
1163 | } | ||
1164 | case VFIO_GROUP_SET_CONTAINER: | ||
1165 | { | ||
1166 | int fd; | ||
1167 | |||
1168 | if (get_user(fd, (int __user *)arg)) | ||
1169 | return -EFAULT; | ||
1170 | |||
1171 | if (fd < 0) | ||
1172 | return -EINVAL; | ||
1173 | |||
1174 | ret = vfio_group_set_container(group, fd); | ||
1175 | break; | ||
1176 | } | ||
1177 | case VFIO_GROUP_UNSET_CONTAINER: | ||
1178 | ret = vfio_group_unset_container(group); | ||
1179 | break; | ||
1180 | case VFIO_GROUP_GET_DEVICE_FD: | ||
1181 | { | ||
1182 | char *buf; | ||
1183 | |||
1184 | buf = strndup_user((const char __user *)arg, PAGE_SIZE); | ||
1185 | if (IS_ERR(buf)) | ||
1186 | return PTR_ERR(buf); | ||
1187 | |||
1188 | ret = vfio_group_get_device_fd(group, buf); | ||
1189 | kfree(buf); | ||
1190 | break; | ||
1191 | } | ||
1192 | } | ||
1193 | |||
1194 | return ret; | ||
1195 | } | ||
1196 | |||
1197 | #ifdef CONFIG_COMPAT | ||
1198 | static long vfio_group_fops_compat_ioctl(struct file *filep, | ||
1199 | unsigned int cmd, unsigned long arg) | ||
1200 | { | ||
1201 | arg = (unsigned long)compat_ptr(arg); | ||
1202 | return vfio_group_fops_unl_ioctl(filep, cmd, arg); | ||
1203 | } | ||
1204 | #endif /* CONFIG_COMPAT */ | ||
1205 | |||
1206 | static int vfio_group_fops_open(struct inode *inode, struct file *filep) | ||
1207 | { | ||
1208 | struct vfio_group *group; | ||
1209 | |||
1210 | group = vfio_group_get_from_minor(iminor(inode)); | ||
1211 | if (!group) | ||
1212 | return -ENODEV; | ||
1213 | |||
1214 | if (group->container) { | ||
1215 | vfio_group_put(group); | ||
1216 | return -EBUSY; | ||
1217 | } | ||
1218 | |||
1219 | filep->private_data = group; | ||
1220 | |||
1221 | return 0; | ||
1222 | } | ||
1223 | |||
1224 | static int vfio_group_fops_release(struct inode *inode, struct file *filep) | ||
1225 | { | ||
1226 | struct vfio_group *group = filep->private_data; | ||
1227 | |||
1228 | filep->private_data = NULL; | ||
1229 | |||
1230 | vfio_group_try_dissolve_container(group); | ||
1231 | |||
1232 | vfio_group_put(group); | ||
1233 | |||
1234 | return 0; | ||
1235 | } | ||
1236 | |||
1237 | static const struct file_operations vfio_group_fops = { | ||
1238 | .owner = THIS_MODULE, | ||
1239 | .unlocked_ioctl = vfio_group_fops_unl_ioctl, | ||
1240 | #ifdef CONFIG_COMPAT | ||
1241 | .compat_ioctl = vfio_group_fops_compat_ioctl, | ||
1242 | #endif | ||
1243 | .open = vfio_group_fops_open, | ||
1244 | .release = vfio_group_fops_release, | ||
1245 | }; | ||
1246 | |||
1247 | /** | ||
1248 | * VFIO Device fd | ||
1249 | */ | ||
1250 | static int vfio_device_fops_release(struct inode *inode, struct file *filep) | ||
1251 | { | ||
1252 | struct vfio_device *device = filep->private_data; | ||
1253 | |||
1254 | device->ops->release(device->device_data); | ||
1255 | |||
1256 | vfio_group_try_dissolve_container(device->group); | ||
1257 | |||
1258 | vfio_device_put(device); | ||
1259 | |||
1260 | return 0; | ||
1261 | } | ||
1262 | |||
1263 | static long vfio_device_fops_unl_ioctl(struct file *filep, | ||
1264 | unsigned int cmd, unsigned long arg) | ||
1265 | { | ||
1266 | struct vfio_device *device = filep->private_data; | ||
1267 | |||
1268 | if (unlikely(!device->ops->ioctl)) | ||
1269 | return -EINVAL; | ||
1270 | |||
1271 | return device->ops->ioctl(device->device_data, cmd, arg); | ||
1272 | } | ||
1273 | |||
1274 | static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, | ||
1275 | size_t count, loff_t *ppos) | ||
1276 | { | ||
1277 | struct vfio_device *device = filep->private_data; | ||
1278 | |||
1279 | if (unlikely(!device->ops->read)) | ||
1280 | return -EINVAL; | ||
1281 | |||
1282 | return device->ops->read(device->device_data, buf, count, ppos); | ||
1283 | } | ||
1284 | |||
1285 | static ssize_t vfio_device_fops_write(struct file *filep, | ||
1286 | const char __user *buf, | ||
1287 | size_t count, loff_t *ppos) | ||
1288 | { | ||
1289 | struct vfio_device *device = filep->private_data; | ||
1290 | |||
1291 | if (unlikely(!device->ops->write)) | ||
1292 | return -EINVAL; | ||
1293 | |||
1294 | return device->ops->write(device->device_data, buf, count, ppos); | ||
1295 | } | ||
1296 | |||
1297 | static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) | ||
1298 | { | ||
1299 | struct vfio_device *device = filep->private_data; | ||
1300 | |||
1301 | if (unlikely(!device->ops->mmap)) | ||
1302 | return -EINVAL; | ||
1303 | |||
1304 | return device->ops->mmap(device->device_data, vma); | ||
1305 | } | ||
1306 | |||
1307 | #ifdef CONFIG_COMPAT | ||
1308 | static long vfio_device_fops_compat_ioctl(struct file *filep, | ||
1309 | unsigned int cmd, unsigned long arg) | ||
1310 | { | ||
1311 | arg = (unsigned long)compat_ptr(arg); | ||
1312 | return vfio_device_fops_unl_ioctl(filep, cmd, arg); | ||
1313 | } | ||
1314 | #endif /* CONFIG_COMPAT */ | ||
1315 | |||
1316 | static const struct file_operations vfio_device_fops = { | ||
1317 | .owner = THIS_MODULE, | ||
1318 | .release = vfio_device_fops_release, | ||
1319 | .read = vfio_device_fops_read, | ||
1320 | .write = vfio_device_fops_write, | ||
1321 | .unlocked_ioctl = vfio_device_fops_unl_ioctl, | ||
1322 | #ifdef CONFIG_COMPAT | ||
1323 | .compat_ioctl = vfio_device_fops_compat_ioctl, | ||
1324 | #endif | ||
1325 | .mmap = vfio_device_fops_mmap, | ||
1326 | }; | ||
1327 | |||
1328 | /** | ||
1329 | * Module/class support | ||
1330 | */ | ||
1331 | static char *vfio_devnode(struct device *dev, umode_t *mode) | ||
1332 | { | ||
1333 | return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); | ||
1334 | } | ||
1335 | |||
1336 | static int __init vfio_init(void) | ||
1337 | { | ||
1338 | int ret; | ||
1339 | |||
1340 | idr_init(&vfio.group_idr); | ||
1341 | mutex_init(&vfio.group_lock); | ||
1342 | mutex_init(&vfio.iommu_drivers_lock); | ||
1343 | INIT_LIST_HEAD(&vfio.group_list); | ||
1344 | INIT_LIST_HEAD(&vfio.iommu_drivers_list); | ||
1345 | init_waitqueue_head(&vfio.release_q); | ||
1346 | |||
1347 | vfio.class = class_create(THIS_MODULE, "vfio"); | ||
1348 | if (IS_ERR(vfio.class)) { | ||
1349 | ret = PTR_ERR(vfio.class); | ||
1350 | goto err_class; | ||
1351 | } | ||
1352 | |||
1353 | vfio.class->devnode = vfio_devnode; | ||
1354 | |||
1355 | ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio"); | ||
1356 | if (ret) | ||
1357 | goto err_base_chrdev; | ||
1358 | |||
1359 | cdev_init(&vfio.cdev, &vfio_fops); | ||
1360 | ret = cdev_add(&vfio.cdev, vfio.devt, 1); | ||
1361 | if (ret) | ||
1362 | goto err_base_cdev; | ||
1363 | |||
1364 | vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio"); | ||
1365 | if (IS_ERR(vfio.dev)) { | ||
1366 | ret = PTR_ERR(vfio.dev); | ||
1367 | goto err_base_dev; | ||
1368 | } | ||
1369 | |||
1370 | /* /dev/vfio/$GROUP */ | ||
1371 | cdev_init(&vfio.group_cdev, &vfio_group_fops); | ||
1372 | ret = cdev_add(&vfio.group_cdev, | ||
1373 | MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1); | ||
1374 | if (ret) | ||
1375 | goto err_groups_cdev; | ||
1376 | |||
1377 | pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); | ||
1378 | |||
1379 | /* | ||
1380 | * Attempt to load known iommu-drivers. This gives us a working | ||
1381 | * environment without the user needing to explicitly load iommu | ||
1382 | * drivers. | ||
1383 | */ | ||
1384 | request_module_nowait("vfio_iommu_type1"); | ||
1385 | |||
1386 | return 0; | ||
1387 | |||
1388 | err_groups_cdev: | ||
1389 | device_destroy(vfio.class, vfio.devt); | ||
1390 | err_base_dev: | ||
1391 | cdev_del(&vfio.cdev); | ||
1392 | err_base_cdev: | ||
1393 | unregister_chrdev_region(vfio.devt, MINORMASK); | ||
1394 | err_base_chrdev: | ||
1395 | class_destroy(vfio.class); | ||
1396 | vfio.class = NULL; | ||
1397 | err_class: | ||
1398 | return ret; | ||
1399 | } | ||
1400 | |||
1401 | static void __exit vfio_cleanup(void) | ||
1402 | { | ||
1403 | WARN_ON(!list_empty(&vfio.group_list)); | ||
1404 | |||
1405 | idr_destroy(&vfio.group_idr); | ||
1406 | cdev_del(&vfio.group_cdev); | ||
1407 | device_destroy(vfio.class, vfio.devt); | ||
1408 | cdev_del(&vfio.cdev); | ||
1409 | unregister_chrdev_region(vfio.devt, MINORMASK); | ||
1410 | class_destroy(vfio.class); | ||
1411 | vfio.class = NULL; | ||
1412 | } | ||
1413 | |||
1414 | module_init(vfio_init); | ||
1415 | module_exit(vfio_cleanup); | ||
1416 | |||
1417 | MODULE_VERSION(DRIVER_VERSION); | ||
1418 | MODULE_LICENSE("GPL v2"); | ||
1419 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
1420 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c new file mode 100644 index 000000000000..6f3fbc48a6c7 --- /dev/null +++ b/drivers/vfio/vfio_iommu_type1.c | |||
@@ -0,0 +1,753 @@ | |||
1 | /* | ||
2 | * VFIO: IOMMU DMA mapping support for Type1 IOMMU | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | * | ||
15 | * We arbitrarily define a Type1 IOMMU as one matching the below code. | ||
16 | * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel | ||
17 | * VT-d, but that makes it harder to re-use as theoretically anyone | ||
18 | * implementing a similar IOMMU could make use of this. We expect the | ||
19 | * IOMMU to support the IOMMU API and have few to no restrictions around | ||
20 | * the IOVA range that can be mapped. The Type1 IOMMU is currently | ||
21 | * optimized for relatively static mappings of a userspace process with | ||
22 | * userspace pages pinned into memory. We also assume devices and IOMMU | ||
23 | * domains are PCI based as the IOMMU API is still centered around a | ||
24 | * device/bus interface rather than a group interface. | ||
25 | */ | ||
26 | |||
27 | #include <linux/compat.h> | ||
28 | #include <linux/device.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/iommu.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/pci.h> /* pci_bus_type */ | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/uaccess.h> | ||
37 | #include <linux/vfio.h> | ||
38 | #include <linux/workqueue.h> | ||
39 | |||
40 | #define DRIVER_VERSION "0.2" | ||
41 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
42 | #define DRIVER_DESC "Type1 IOMMU driver for VFIO" | ||
43 | |||
44 | static bool allow_unsafe_interrupts; | ||
45 | module_param_named(allow_unsafe_interrupts, | ||
46 | allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); | ||
47 | MODULE_PARM_DESC(allow_unsafe_interrupts, | ||
48 | "Enable VFIO IOMMU support on platforms without interrupt remapping support."); | ||
49 | |||
50 | struct vfio_iommu { | ||
51 | struct iommu_domain *domain; | ||
52 | struct mutex lock; | ||
53 | struct list_head dma_list; | ||
54 | struct list_head group_list; | ||
55 | bool cache; | ||
56 | }; | ||
57 | |||
58 | struct vfio_dma { | ||
59 | struct list_head next; | ||
60 | dma_addr_t iova; /* Device address */ | ||
61 | unsigned long vaddr; /* Process virtual addr */ | ||
62 | long npage; /* Number of pages */ | ||
63 | int prot; /* IOMMU_READ/WRITE */ | ||
64 | }; | ||
65 | |||
66 | struct vfio_group { | ||
67 | struct iommu_group *iommu_group; | ||
68 | struct list_head next; | ||
69 | }; | ||
70 | |||
71 | /* | ||
72 | * This code handles mapping and unmapping of user data buffers | ||
73 | * into DMA'ble space using the IOMMU | ||
74 | */ | ||
75 | |||
76 | #define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) | ||
77 | |||
78 | struct vwork { | ||
79 | struct mm_struct *mm; | ||
80 | long npage; | ||
81 | struct work_struct work; | ||
82 | }; | ||
83 | |||
84 | /* delayed decrement/increment for locked_vm */ | ||
85 | static void vfio_lock_acct_bg(struct work_struct *work) | ||
86 | { | ||
87 | struct vwork *vwork = container_of(work, struct vwork, work); | ||
88 | struct mm_struct *mm; | ||
89 | |||
90 | mm = vwork->mm; | ||
91 | down_write(&mm->mmap_sem); | ||
92 | mm->locked_vm += vwork->npage; | ||
93 | up_write(&mm->mmap_sem); | ||
94 | mmput(mm); | ||
95 | kfree(vwork); | ||
96 | } | ||
97 | |||
98 | static void vfio_lock_acct(long npage) | ||
99 | { | ||
100 | struct vwork *vwork; | ||
101 | struct mm_struct *mm; | ||
102 | |||
103 | if (!current->mm) | ||
104 | return; /* process exited */ | ||
105 | |||
106 | if (down_write_trylock(¤t->mm->mmap_sem)) { | ||
107 | current->mm->locked_vm += npage; | ||
108 | up_write(¤t->mm->mmap_sem); | ||
109 | return; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Couldn't get mmap_sem lock, so must setup to update | ||
114 | * mm->locked_vm later. If locked_vm were atomic, we | ||
115 | * wouldn't need this silliness | ||
116 | */ | ||
117 | vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); | ||
118 | if (!vwork) | ||
119 | return; | ||
120 | mm = get_task_mm(current); | ||
121 | if (!mm) { | ||
122 | kfree(vwork); | ||
123 | return; | ||
124 | } | ||
125 | INIT_WORK(&vwork->work, vfio_lock_acct_bg); | ||
126 | vwork->mm = mm; | ||
127 | vwork->npage = npage; | ||
128 | schedule_work(&vwork->work); | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Some mappings aren't backed by a struct page, for example an mmap'd | ||
133 | * MMIO range for our own or another device. These use a different | ||
134 | * pfn conversion and shouldn't be tracked as locked pages. | ||
135 | */ | ||
136 | static bool is_invalid_reserved_pfn(unsigned long pfn) | ||
137 | { | ||
138 | if (pfn_valid(pfn)) { | ||
139 | bool reserved; | ||
140 | struct page *tail = pfn_to_page(pfn); | ||
141 | struct page *head = compound_trans_head(tail); | ||
142 | reserved = !!(PageReserved(head)); | ||
143 | if (head != tail) { | ||
144 | /* | ||
145 | * "head" is not a dangling pointer | ||
146 | * (compound_trans_head takes care of that) | ||
147 | * but the hugepage may have been split | ||
148 | * from under us (and we may not hold a | ||
149 | * reference count on the head page so it can | ||
150 | * be reused before we run PageReferenced), so | ||
151 | * we have to check PageTail before returning | ||
152 | * what we just read. | ||
153 | */ | ||
154 | smp_rmb(); | ||
155 | if (PageTail(tail)) | ||
156 | return reserved; | ||
157 | } | ||
158 | return PageReserved(tail); | ||
159 | } | ||
160 | |||
161 | return true; | ||
162 | } | ||
163 | |||
164 | static int put_pfn(unsigned long pfn, int prot) | ||
165 | { | ||
166 | if (!is_invalid_reserved_pfn(pfn)) { | ||
167 | struct page *page = pfn_to_page(pfn); | ||
168 | if (prot & IOMMU_WRITE) | ||
169 | SetPageDirty(page); | ||
170 | put_page(page); | ||
171 | return 1; | ||
172 | } | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | /* Unmap DMA region */ | ||
177 | static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, | ||
178 | long npage, int prot) | ||
179 | { | ||
180 | long i, unlocked = 0; | ||
181 | |||
182 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) { | ||
183 | unsigned long pfn; | ||
184 | |||
185 | pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; | ||
186 | if (pfn) { | ||
187 | iommu_unmap(iommu->domain, iova, PAGE_SIZE); | ||
188 | unlocked += put_pfn(pfn, prot); | ||
189 | } | ||
190 | } | ||
191 | return unlocked; | ||
192 | } | ||
193 | |||
194 | static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, | ||
195 | long npage, int prot) | ||
196 | { | ||
197 | long unlocked; | ||
198 | |||
199 | unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); | ||
200 | vfio_lock_acct(-unlocked); | ||
201 | } | ||
202 | |||
203 | static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) | ||
204 | { | ||
205 | struct page *page[1]; | ||
206 | struct vm_area_struct *vma; | ||
207 | int ret = -EFAULT; | ||
208 | |||
209 | if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { | ||
210 | *pfn = page_to_pfn(page[0]); | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | down_read(¤t->mm->mmap_sem); | ||
215 | |||
216 | vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); | ||
217 | |||
218 | if (vma && vma->vm_flags & VM_PFNMAP) { | ||
219 | *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
220 | if (is_invalid_reserved_pfn(*pfn)) | ||
221 | ret = 0; | ||
222 | } | ||
223 | |||
224 | up_read(¤t->mm->mmap_sem); | ||
225 | |||
226 | return ret; | ||
227 | } | ||
228 | |||
229 | /* Map DMA region */ | ||
230 | static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, | ||
231 | unsigned long vaddr, long npage, int prot) | ||
232 | { | ||
233 | dma_addr_t start = iova; | ||
234 | long i, locked = 0; | ||
235 | int ret; | ||
236 | |||
237 | /* Verify that pages are not already mapped */ | ||
238 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) | ||
239 | if (iommu_iova_to_phys(iommu->domain, iova)) | ||
240 | return -EBUSY; | ||
241 | |||
242 | iova = start; | ||
243 | |||
244 | if (iommu->cache) | ||
245 | prot |= IOMMU_CACHE; | ||
246 | |||
247 | /* | ||
248 | * XXX We break mappings into pages and use get_user_pages_fast to | ||
249 | * pin the pages in memory. It's been suggested that mlock might | ||
250 | * provide a more efficient mechanism, but nothing prevents the | ||
251 | * user from munlocking the pages, which could then allow the user | ||
252 | * access to random host memory. We also have no guarantee from the | ||
253 | * IOMMU API that the iommu driver can unmap sub-pages of previous | ||
254 | * mappings. This means we might lose an entire range if a single | ||
255 | * page within it is unmapped. Single page mappings are inefficient, | ||
256 | * but provide the most flexibility for now. | ||
257 | */ | ||
258 | for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { | ||
259 | unsigned long pfn = 0; | ||
260 | |||
261 | ret = vaddr_get_pfn(vaddr, prot, &pfn); | ||
262 | if (ret) { | ||
263 | __vfio_dma_do_unmap(iommu, start, i, prot); | ||
264 | return ret; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Only add actual locked pages to accounting | ||
269 | * XXX We're effectively marking a page locked for every | ||
270 | * IOVA page even though it's possible the user could be | ||
271 | * backing multiple IOVAs with the same vaddr. This over- | ||
272 | * penalizes the user process, but we currently have no | ||
273 | * easy way to do this properly. | ||
274 | */ | ||
275 | if (!is_invalid_reserved_pfn(pfn)) | ||
276 | locked++; | ||
277 | |||
278 | ret = iommu_map(iommu->domain, iova, | ||
279 | (phys_addr_t)pfn << PAGE_SHIFT, | ||
280 | PAGE_SIZE, prot); | ||
281 | if (ret) { | ||
282 | /* Back out mappings on error */ | ||
283 | put_pfn(pfn, prot); | ||
284 | __vfio_dma_do_unmap(iommu, start, i, prot); | ||
285 | return ret; | ||
286 | } | ||
287 | } | ||
288 | vfio_lock_acct(locked); | ||
289 | return 0; | ||
290 | } | ||
291 | |||
292 | static inline bool ranges_overlap(dma_addr_t start1, size_t size1, | ||
293 | dma_addr_t start2, size_t size2) | ||
294 | { | ||
295 | if (start1 < start2) | ||
296 | return (start2 - start1 < size1); | ||
297 | else if (start2 < start1) | ||
298 | return (start1 - start2 < size2); | ||
299 | return (size1 > 0 && size2 > 0); | ||
300 | } | ||
301 | |||
302 | static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, | ||
303 | dma_addr_t start, size_t size) | ||
304 | { | ||
305 | struct vfio_dma *dma; | ||
306 | |||
307 | list_for_each_entry(dma, &iommu->dma_list, next) { | ||
308 | if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), | ||
309 | start, size)) | ||
310 | return dma; | ||
311 | } | ||
312 | return NULL; | ||
313 | } | ||
314 | |||
315 | static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, | ||
316 | size_t size, struct vfio_dma *dma) | ||
317 | { | ||
318 | struct vfio_dma *split; | ||
319 | long npage_lo, npage_hi; | ||
320 | |||
321 | /* Existing dma region is completely covered, unmap all */ | ||
322 | if (start <= dma->iova && | ||
323 | start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { | ||
324 | vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); | ||
325 | list_del(&dma->next); | ||
326 | npage_lo = dma->npage; | ||
327 | kfree(dma); | ||
328 | return npage_lo; | ||
329 | } | ||
330 | |||
331 | /* Overlap low address of existing range */ | ||
332 | if (start <= dma->iova) { | ||
333 | size_t overlap; | ||
334 | |||
335 | overlap = start + size - dma->iova; | ||
336 | npage_lo = overlap >> PAGE_SHIFT; | ||
337 | |||
338 | vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); | ||
339 | dma->iova += overlap; | ||
340 | dma->vaddr += overlap; | ||
341 | dma->npage -= npage_lo; | ||
342 | return npage_lo; | ||
343 | } | ||
344 | |||
345 | /* Overlap high address of existing range */ | ||
346 | if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { | ||
347 | size_t overlap; | ||
348 | |||
349 | overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; | ||
350 | npage_hi = overlap >> PAGE_SHIFT; | ||
351 | |||
352 | vfio_dma_unmap(iommu, start, npage_hi, dma->prot); | ||
353 | dma->npage -= npage_hi; | ||
354 | return npage_hi; | ||
355 | } | ||
356 | |||
357 | /* Split existing */ | ||
358 | npage_lo = (start - dma->iova) >> PAGE_SHIFT; | ||
359 | npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; | ||
360 | |||
361 | split = kzalloc(sizeof *split, GFP_KERNEL); | ||
362 | if (!split) | ||
363 | return -ENOMEM; | ||
364 | |||
365 | vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); | ||
366 | |||
367 | dma->npage = npage_lo; | ||
368 | |||
369 | split->npage = npage_hi; | ||
370 | split->iova = start + size; | ||
371 | split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; | ||
372 | split->prot = dma->prot; | ||
373 | list_add(&split->next, &iommu->dma_list); | ||
374 | return size >> PAGE_SHIFT; | ||
375 | } | ||
376 | |||
377 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | ||
378 | struct vfio_iommu_type1_dma_unmap *unmap) | ||
379 | { | ||
380 | long ret = 0, npage = unmap->size >> PAGE_SHIFT; | ||
381 | struct vfio_dma *dma, *tmp; | ||
382 | uint64_t mask; | ||
383 | |||
384 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | ||
385 | |||
386 | if (unmap->iova & mask) | ||
387 | return -EINVAL; | ||
388 | if (unmap->size & mask) | ||
389 | return -EINVAL; | ||
390 | |||
391 | /* XXX We still break these down into PAGE_SIZE */ | ||
392 | WARN_ON(mask & PAGE_MASK); | ||
393 | |||
394 | mutex_lock(&iommu->lock); | ||
395 | |||
396 | list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) { | ||
397 | if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), | ||
398 | unmap->iova, unmap->size)) { | ||
399 | ret = vfio_remove_dma_overlap(iommu, unmap->iova, | ||
400 | unmap->size, dma); | ||
401 | if (ret > 0) | ||
402 | npage -= ret; | ||
403 | if (ret < 0 || npage == 0) | ||
404 | break; | ||
405 | } | ||
406 | } | ||
407 | mutex_unlock(&iommu->lock); | ||
408 | return ret > 0 ? 0 : (int)ret; | ||
409 | } | ||
410 | |||
411 | static int vfio_dma_do_map(struct vfio_iommu *iommu, | ||
412 | struct vfio_iommu_type1_dma_map *map) | ||
413 | { | ||
414 | struct vfio_dma *dma, *pdma = NULL; | ||
415 | dma_addr_t iova = map->iova; | ||
416 | unsigned long locked, lock_limit, vaddr = map->vaddr; | ||
417 | size_t size = map->size; | ||
418 | int ret = 0, prot = 0; | ||
419 | uint64_t mask; | ||
420 | long npage; | ||
421 | |||
422 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | ||
423 | |||
424 | /* READ/WRITE from device perspective */ | ||
425 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) | ||
426 | prot |= IOMMU_WRITE; | ||
427 | if (map->flags & VFIO_DMA_MAP_FLAG_READ) | ||
428 | prot |= IOMMU_READ; | ||
429 | |||
430 | if (!prot) | ||
431 | return -EINVAL; /* No READ/WRITE? */ | ||
432 | |||
433 | if (vaddr & mask) | ||
434 | return -EINVAL; | ||
435 | if (iova & mask) | ||
436 | return -EINVAL; | ||
437 | if (size & mask) | ||
438 | return -EINVAL; | ||
439 | |||
440 | /* XXX We still break these down into PAGE_SIZE */ | ||
441 | WARN_ON(mask & PAGE_MASK); | ||
442 | |||
443 | /* Don't allow IOVA wrap */ | ||
444 | if (iova + size && iova + size < iova) | ||
445 | return -EINVAL; | ||
446 | |||
447 | /* Don't allow virtual address wrap */ | ||
448 | if (vaddr + size && vaddr + size < vaddr) | ||
449 | return -EINVAL; | ||
450 | |||
451 | npage = size >> PAGE_SHIFT; | ||
452 | if (!npage) | ||
453 | return -EINVAL; | ||
454 | |||
455 | mutex_lock(&iommu->lock); | ||
456 | |||
457 | if (vfio_find_dma(iommu, iova, size)) { | ||
458 | ret = -EBUSY; | ||
459 | goto out_lock; | ||
460 | } | ||
461 | |||
462 | /* account for locked pages */ | ||
463 | locked = current->mm->locked_vm + npage; | ||
464 | lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
465 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { | ||
466 | pr_warn("%s: RLIMIT_MEMLOCK (%lu) exceeded\n", | ||
467 | __func__, rlimit(RLIMIT_MEMLOCK)); | ||
468 | ret = -ENOMEM; | ||
469 | goto out_lock; | ||
470 | } | ||
471 | |||
472 | ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); | ||
473 | if (ret) | ||
474 | goto out_lock; | ||
475 | |||
476 | /* Check if we abut a region below - nothing below 0 */ | ||
477 | if (iova) { | ||
478 | dma = vfio_find_dma(iommu, iova - 1, 1); | ||
479 | if (dma && dma->prot == prot && | ||
480 | dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) { | ||
481 | |||
482 | dma->npage += npage; | ||
483 | iova = dma->iova; | ||
484 | vaddr = dma->vaddr; | ||
485 | npage = dma->npage; | ||
486 | size = NPAGE_TO_SIZE(npage); | ||
487 | |||
488 | pdma = dma; | ||
489 | } | ||
490 | } | ||
491 | |||
492 | /* Check if we abut a region above - nothing above ~0 + 1 */ | ||
493 | if (iova + size) { | ||
494 | dma = vfio_find_dma(iommu, iova + size, 1); | ||
495 | if (dma && dma->prot == prot && | ||
496 | dma->vaddr == vaddr + size) { | ||
497 | |||
498 | dma->npage += npage; | ||
499 | dma->iova = iova; | ||
500 | dma->vaddr = vaddr; | ||
501 | |||
502 | /* | ||
503 | * If merged above and below, remove previously | ||
504 | * merged entry. New entry covers it. | ||
505 | */ | ||
506 | if (pdma) { | ||
507 | list_del(&pdma->next); | ||
508 | kfree(pdma); | ||
509 | } | ||
510 | pdma = dma; | ||
511 | } | ||
512 | } | ||
513 | |||
514 | /* Isolated, new region */ | ||
515 | if (!pdma) { | ||
516 | dma = kzalloc(sizeof *dma, GFP_KERNEL); | ||
517 | if (!dma) { | ||
518 | ret = -ENOMEM; | ||
519 | vfio_dma_unmap(iommu, iova, npage, prot); | ||
520 | goto out_lock; | ||
521 | } | ||
522 | |||
523 | dma->npage = npage; | ||
524 | dma->iova = iova; | ||
525 | dma->vaddr = vaddr; | ||
526 | dma->prot = prot; | ||
527 | list_add(&dma->next, &iommu->dma_list); | ||
528 | } | ||
529 | |||
530 | out_lock: | ||
531 | mutex_unlock(&iommu->lock); | ||
532 | return ret; | ||
533 | } | ||
534 | |||
535 | static int vfio_iommu_type1_attach_group(void *iommu_data, | ||
536 | struct iommu_group *iommu_group) | ||
537 | { | ||
538 | struct vfio_iommu *iommu = iommu_data; | ||
539 | struct vfio_group *group, *tmp; | ||
540 | int ret; | ||
541 | |||
542 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
543 | if (!group) | ||
544 | return -ENOMEM; | ||
545 | |||
546 | mutex_lock(&iommu->lock); | ||
547 | |||
548 | list_for_each_entry(tmp, &iommu->group_list, next) { | ||
549 | if (tmp->iommu_group == iommu_group) { | ||
550 | mutex_unlock(&iommu->lock); | ||
551 | kfree(group); | ||
552 | return -EINVAL; | ||
553 | } | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * TODO: Domains have capabilities that might change as we add | ||
558 | * groups (see iommu->cache, currently never set). Check for | ||
559 | * them and potentially disallow groups to be attached when it | ||
560 | * would change capabilities (ugh). | ||
561 | */ | ||
562 | ret = iommu_attach_group(iommu->domain, iommu_group); | ||
563 | if (ret) { | ||
564 | mutex_unlock(&iommu->lock); | ||
565 | kfree(group); | ||
566 | return ret; | ||
567 | } | ||
568 | |||
569 | group->iommu_group = iommu_group; | ||
570 | list_add(&group->next, &iommu->group_list); | ||
571 | |||
572 | mutex_unlock(&iommu->lock); | ||
573 | |||
574 | return 0; | ||
575 | } | ||
576 | |||
577 | static void vfio_iommu_type1_detach_group(void *iommu_data, | ||
578 | struct iommu_group *iommu_group) | ||
579 | { | ||
580 | struct vfio_iommu *iommu = iommu_data; | ||
581 | struct vfio_group *group; | ||
582 | |||
583 | mutex_lock(&iommu->lock); | ||
584 | |||
585 | list_for_each_entry(group, &iommu->group_list, next) { | ||
586 | if (group->iommu_group == iommu_group) { | ||
587 | iommu_detach_group(iommu->domain, iommu_group); | ||
588 | list_del(&group->next); | ||
589 | kfree(group); | ||
590 | break; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | mutex_unlock(&iommu->lock); | ||
595 | } | ||
596 | |||
597 | static void *vfio_iommu_type1_open(unsigned long arg) | ||
598 | { | ||
599 | struct vfio_iommu *iommu; | ||
600 | |||
601 | if (arg != VFIO_TYPE1_IOMMU) | ||
602 | return ERR_PTR(-EINVAL); | ||
603 | |||
604 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); | ||
605 | if (!iommu) | ||
606 | return ERR_PTR(-ENOMEM); | ||
607 | |||
608 | INIT_LIST_HEAD(&iommu->group_list); | ||
609 | INIT_LIST_HEAD(&iommu->dma_list); | ||
610 | mutex_init(&iommu->lock); | ||
611 | |||
612 | /* | ||
613 | * Wish we didn't have to know about bus_type here. | ||
614 | */ | ||
615 | iommu->domain = iommu_domain_alloc(&pci_bus_type); | ||
616 | if (!iommu->domain) { | ||
617 | kfree(iommu); | ||
618 | return ERR_PTR(-EIO); | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Wish we could specify required capabilities rather than create | ||
623 | * a domain, see what comes out and hope it doesn't change along | ||
624 | * the way. Fortunately we know interrupt remapping is global for | ||
625 | * our iommus. | ||
626 | */ | ||
627 | if (!allow_unsafe_interrupts && | ||
628 | !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { | ||
629 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | ||
630 | __func__); | ||
631 | iommu_domain_free(iommu->domain); | ||
632 | kfree(iommu); | ||
633 | return ERR_PTR(-EPERM); | ||
634 | } | ||
635 | |||
636 | return iommu; | ||
637 | } | ||
638 | |||
639 | static void vfio_iommu_type1_release(void *iommu_data) | ||
640 | { | ||
641 | struct vfio_iommu *iommu = iommu_data; | ||
642 | struct vfio_group *group, *group_tmp; | ||
643 | struct vfio_dma *dma, *dma_tmp; | ||
644 | |||
645 | list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { | ||
646 | iommu_detach_group(iommu->domain, group->iommu_group); | ||
647 | list_del(&group->next); | ||
648 | kfree(group); | ||
649 | } | ||
650 | |||
651 | list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) { | ||
652 | vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); | ||
653 | list_del(&dma->next); | ||
654 | kfree(dma); | ||
655 | } | ||
656 | |||
657 | iommu_domain_free(iommu->domain); | ||
658 | iommu->domain = NULL; | ||
659 | kfree(iommu); | ||
660 | } | ||
661 | |||
662 | static long vfio_iommu_type1_ioctl(void *iommu_data, | ||
663 | unsigned int cmd, unsigned long arg) | ||
664 | { | ||
665 | struct vfio_iommu *iommu = iommu_data; | ||
666 | unsigned long minsz; | ||
667 | |||
668 | if (cmd == VFIO_CHECK_EXTENSION) { | ||
669 | switch (arg) { | ||
670 | case VFIO_TYPE1_IOMMU: | ||
671 | return 1; | ||
672 | default: | ||
673 | return 0; | ||
674 | } | ||
675 | } else if (cmd == VFIO_IOMMU_GET_INFO) { | ||
676 | struct vfio_iommu_type1_info info; | ||
677 | |||
678 | minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); | ||
679 | |||
680 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
681 | return -EFAULT; | ||
682 | |||
683 | if (info.argsz < minsz) | ||
684 | return -EINVAL; | ||
685 | |||
686 | info.flags = 0; | ||
687 | |||
688 | info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; | ||
689 | |||
690 | return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; | ||
691 | |||
692 | } else if (cmd == VFIO_IOMMU_MAP_DMA) { | ||
693 | struct vfio_iommu_type1_dma_map map; | ||
694 | uint32_t mask = VFIO_DMA_MAP_FLAG_READ | | ||
695 | VFIO_DMA_MAP_FLAG_WRITE; | ||
696 | |||
697 | minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); | ||
698 | |||
699 | if (copy_from_user(&map, (void __user *)arg, minsz)) | ||
700 | return -EFAULT; | ||
701 | |||
702 | if (map.argsz < minsz || map.flags & ~mask) | ||
703 | return -EINVAL; | ||
704 | |||
705 | return vfio_dma_do_map(iommu, &map); | ||
706 | |||
707 | } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { | ||
708 | struct vfio_iommu_type1_dma_unmap unmap; | ||
709 | |||
710 | minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); | ||
711 | |||
712 | if (copy_from_user(&unmap, (void __user *)arg, minsz)) | ||
713 | return -EFAULT; | ||
714 | |||
715 | if (unmap.argsz < minsz || unmap.flags) | ||
716 | return -EINVAL; | ||
717 | |||
718 | return vfio_dma_do_unmap(iommu, &unmap); | ||
719 | } | ||
720 | |||
721 | return -ENOTTY; | ||
722 | } | ||
723 | |||
724 | static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { | ||
725 | .name = "vfio-iommu-type1", | ||
726 | .owner = THIS_MODULE, | ||
727 | .open = vfio_iommu_type1_open, | ||
728 | .release = vfio_iommu_type1_release, | ||
729 | .ioctl = vfio_iommu_type1_ioctl, | ||
730 | .attach_group = vfio_iommu_type1_attach_group, | ||
731 | .detach_group = vfio_iommu_type1_detach_group, | ||
732 | }; | ||
733 | |||
734 | static int __init vfio_iommu_type1_init(void) | ||
735 | { | ||
736 | if (!iommu_present(&pci_bus_type)) | ||
737 | return -ENODEV; | ||
738 | |||
739 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); | ||
740 | } | ||
741 | |||
742 | static void __exit vfio_iommu_type1_cleanup(void) | ||
743 | { | ||
744 | vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); | ||
745 | } | ||
746 | |||
747 | module_init(vfio_iommu_type1_init); | ||
748 | module_exit(vfio_iommu_type1_cleanup); | ||
749 | |||
750 | MODULE_VERSION(DRIVER_VERSION); | ||
751 | MODULE_LICENSE("GPL v2"); | ||
752 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
753 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
diff --git a/include/linux/vfio.h b/include/linux/vfio.h new file mode 100644 index 000000000000..0a4f180a11d8 --- /dev/null +++ b/include/linux/vfio.h | |||
@@ -0,0 +1,445 @@ | |||
1 | /* | ||
2 | * VFIO API definition | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | #ifndef VFIO_H | ||
12 | #define VFIO_H | ||
13 | |||
14 | #include <linux/types.h> | ||
15 | #include <linux/ioctl.h> | ||
16 | |||
17 | #define VFIO_API_VERSION 0 | ||
18 | |||
19 | #ifdef __KERNEL__ /* Internal VFIO-core/bus driver API */ | ||
20 | |||
21 | #include <linux/iommu.h> | ||
22 | #include <linux/mm.h> | ||
23 | |||
24 | /** | ||
25 | * struct vfio_device_ops - VFIO bus driver device callbacks | ||
26 | * | ||
27 | * @open: Called when userspace creates new file descriptor for device | ||
28 | * @release: Called when userspace releases file descriptor for device | ||
29 | * @read: Perform read(2) on device file descriptor | ||
30 | * @write: Perform write(2) on device file descriptor | ||
31 | * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* | ||
32 | * operations documented below | ||
33 | * @mmap: Perform mmap(2) on a region of the device file descriptor | ||
34 | */ | ||
35 | struct vfio_device_ops { | ||
36 | char *name; | ||
37 | int (*open)(void *device_data); | ||
38 | void (*release)(void *device_data); | ||
39 | ssize_t (*read)(void *device_data, char __user *buf, | ||
40 | size_t count, loff_t *ppos); | ||
41 | ssize_t (*write)(void *device_data, const char __user *buf, | ||
42 | size_t count, loff_t *size); | ||
43 | long (*ioctl)(void *device_data, unsigned int cmd, | ||
44 | unsigned long arg); | ||
45 | int (*mmap)(void *device_data, struct vm_area_struct *vma); | ||
46 | }; | ||
47 | |||
48 | extern int vfio_add_group_dev(struct device *dev, | ||
49 | const struct vfio_device_ops *ops, | ||
50 | void *device_data); | ||
51 | |||
52 | extern void *vfio_del_group_dev(struct device *dev); | ||
53 | |||
54 | /** | ||
55 | * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks | ||
56 | */ | ||
57 | struct vfio_iommu_driver_ops { | ||
58 | char *name; | ||
59 | struct module *owner; | ||
60 | void *(*open)(unsigned long arg); | ||
61 | void (*release)(void *iommu_data); | ||
62 | ssize_t (*read)(void *iommu_data, char __user *buf, | ||
63 | size_t count, loff_t *ppos); | ||
64 | ssize_t (*write)(void *iommu_data, const char __user *buf, | ||
65 | size_t count, loff_t *size); | ||
66 | long (*ioctl)(void *iommu_data, unsigned int cmd, | ||
67 | unsigned long arg); | ||
68 | int (*mmap)(void *iommu_data, struct vm_area_struct *vma); | ||
69 | int (*attach_group)(void *iommu_data, | ||
70 | struct iommu_group *group); | ||
71 | void (*detach_group)(void *iommu_data, | ||
72 | struct iommu_group *group); | ||
73 | |||
74 | }; | ||
75 | |||
76 | extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); | ||
77 | |||
78 | extern void vfio_unregister_iommu_driver( | ||
79 | const struct vfio_iommu_driver_ops *ops); | ||
80 | |||
81 | /** | ||
82 | * offsetofend(TYPE, MEMBER) | ||
83 | * | ||
84 | * @TYPE: The type of the structure | ||
85 | * @MEMBER: The member within the structure to get the end offset of | ||
86 | * | ||
87 | * Simple helper macro for dealing with variable sized structures passed | ||
88 | * from user space. This allows us to easily determine if the provided | ||
89 | * structure is sized to include various fields. | ||
90 | */ | ||
91 | #define offsetofend(TYPE, MEMBER) ({ \ | ||
92 | TYPE tmp; \ | ||
93 | offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) | ||
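
The ioctl handlers in vfio.c and vfio_iommu_type1.c above use offsetofend() to compute the minimum size a caller must supply, so structures can grow over time without breaking older userspace. The validation pattern, roughly as it appears in vfio_group_fops_unl_ioctl():

	minsz = offsetofend(struct vfio_group_status, flags);

	if (copy_from_user(&status, (void __user *)arg, minsz))
		return -EFAULT;
	if (status.argsz < minsz)
		return -EINVAL;
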
94 | |||
95 | #endif /* __KERNEL__ */ | ||
96 | |||
97 | /* Kernel & User level defines for VFIO IOCTLs. */ | ||
98 | |||
99 | /* Extensions */ | ||
100 | |||
101 | #define VFIO_TYPE1_IOMMU 1 | ||
102 | |||
103 | /* | ||
104 | * The IOCTL interface is designed for extensibility by embedding the | ||
105 | * structure length (argsz) and flags into structures passed between | ||
106 | * kernel and userspace. We therefore use the _IO() macro for these | ||
107 | * defines to avoid implicitly embedding a size into the ioctl request. | ||
108 | * As structure fields are added, argsz will increase to match and flag | ||
109 | * bits will be defined to indicate additional fields with valid data. | ||
110 | * It's *always* the caller's responsibility to indicate the size of | ||
111 | * the structure passed by setting argsz appropriately. | ||
112 | */ | ||
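
Seen from userspace, this convention means every structure-carrying ioctl starts by setting argsz to the size the caller understands. A minimal sketch (assuming an open container file descriptor with a Type1 IOMMU already set, plus the usual <sys/ioctl.h> and <linux/vfio.h> includes):

	struct vfio_iommu_type1_info info = {
		.argsz = sizeof(info),	/* tell the kernel how much we understand */
	};

	if (ioctl(container, VFIO_IOMMU_GET_INFO, &info) == 0)
		printf("IOVA page sizes: 0x%llx\n",
		       (unsigned long long)info.iova_pgsizes);
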
113 | |||
114 | #define VFIO_TYPE (';') | ||
115 | #define VFIO_BASE 100 | ||
116 | |||
117 | /* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */ | ||
118 | |||
119 | /** | ||
120 | * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0) | ||
121 | * | ||
122 | * Report the version of the VFIO API. This allows us to bump the entire | ||
123 | * API version should we later need to add or change features in incompatible | ||
124 | * ways. | ||
125 | * Return: VFIO_API_VERSION | ||
126 | * Availability: Always | ||
127 | */ | ||
128 | #define VFIO_GET_API_VERSION _IO(VFIO_TYPE, VFIO_BASE + 0) | ||
129 | |||
130 | /** | ||
131 | * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32) | ||
132 | * | ||
133 | * Check whether an extension is supported. | ||
134 | * Return: 0 if not supported, 1 (or some other positive integer) if supported. | ||
135 | * Availability: Always | ||
136 | */ | ||
137 | #define VFIO_CHECK_EXTENSION _IO(VFIO_TYPE, VFIO_BASE + 1) | ||
138 | |||
139 | /** | ||
140 | * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32) | ||
141 | * | ||
142 | * Set the iommu to the given type. The type must be supported by an | ||
143 | * iommu driver as verified by calling CHECK_EXTENSION using the same | ||
144 | * type. A group must be set to this file descriptor before this | ||
145 | * ioctl is available. The IOMMU interfaces enabled by this call are | ||
146 | * specific to the value set. | ||
147 | * Return: 0 on success, -errno on failure | ||
148 | * Availability: When VFIO group attached | ||
149 | */ | ||
150 | #define VFIO_SET_IOMMU _IO(VFIO_TYPE, VFIO_BASE + 2) | ||
151 | |||
152 | /* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */ | ||
153 | |||
154 | /** | ||
155 | * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3, | ||
156 | * struct vfio_group_status) | ||
157 | * | ||
158 | * Retrieve information about the group. Fills in provided | ||
159 | * struct vfio_group_status. Caller sets argsz. | ||
160 | * Return: 0 on success, -errno on failure. | ||
161 | * Availability: Always | ||
162 | */ | ||
163 | struct vfio_group_status { | ||
164 | __u32 argsz; | ||
165 | __u32 flags; | ||
166 | #define VFIO_GROUP_FLAGS_VIABLE (1 << 0) | ||
167 | #define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1) | ||
168 | }; | ||
169 | #define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3) | ||
170 | |||
171 | /** | ||
172 | * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32) | ||
173 | * | ||
174 | * Set the container for the VFIO group to the open VFIO file | ||
175 | * descriptor provided. Groups may only belong to a single | ||
176 | * container. Containers may, at their discretion, support multiple | ||
177 | * groups. Only when a container is set are all of the interfaces | ||
178 | * of the VFIO file descriptor and the VFIO group file descriptor | ||
179 | * available to the user. | ||
180 | * Return: 0 on success, -errno on failure. | ||
181 | * Availability: Always | ||
182 | */ | ||
183 | #define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4) | ||
184 | |||
185 | /** | ||
186 | * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5) | ||
187 | * | ||
188 | * Remove the group from the attached container. This is the | ||
189 | * opposite of the SET_CONTAINER call and returns the group to | ||
190 | * an initial state. All device file descriptors must be released | ||
191 | * prior to calling this interface. When removing the last group | ||
192 | * from a container, the IOMMU will be disabled and all state lost, | ||
193 | * effectively also returning the VFIO file descriptor to an initial | ||
194 | * state. | ||
195 | * Return: 0 on success, -errno on failure. | ||
196 | * Availability: When attached to container | ||
197 | */ | ||
198 | #define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5) | ||
199 | |||
200 | /** | ||
201 | * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char) | ||
202 | * | ||
203 | * Return a new file descriptor for the device object described by | ||
204 | * the provided string. The string should match a device listed in | ||
205 | * the devices subdirectory of the IOMMU group sysfs entry. The | ||
206 | * group containing the device must already be added to this context. | ||
207 | * Return: new file descriptor on success, -errno on failure. | ||
208 | * Availability: When attached to container | ||
209 | */ | ||
210 | #define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6) | ||
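
Taken together, the container and group ioctls above imply a fairly fixed userspace sequence for reaching a device. A compressed sketch (error handling mostly omitted; the group number and PCI address are placeholders):

	int container, group, device;
	struct vfio_group_status status = { .argsz = sizeof(status) };

	container = open("/dev/vfio/vfio", O_RDWR);

	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
		return -1;	/* unknown API version */
	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		return -1;	/* Type1 IOMMU not available */

	group = open("/dev/vfio/26", O_RDWR);	/* group number from sysfs */

	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;	/* not all group devices bound to vfio */

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
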
211 | |||
212 | /* --------------- IOCTLs for DEVICE file descriptors --------------- */ | ||
213 | |||
214 | /** | ||
215 | * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7, | ||
216 | * struct vfio_device_info) | ||
217 | * | ||
218 | * Retrieve information about the device. Fills in provided | ||
219 | * struct vfio_device_info. Caller sets argsz. | ||
220 | * Return: 0 on success, -errno on failure. | ||
221 | */ | ||
222 | struct vfio_device_info { | ||
223 | __u32 argsz; | ||
224 | __u32 flags; | ||
225 | #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ | ||
226 | #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ | ||
227 | __u32 num_regions; /* Max region index + 1 */ | ||
228 | __u32 num_irqs; /* Max IRQ index + 1 */ | ||
229 | }; | ||
230 | #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) | ||
231 | |||
232 | /** | ||
233 | * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, | ||
234 | * struct vfio_region_info) | ||
235 | * | ||
236 | * Retrieve information about a device region. Caller provides | ||
237 | * struct vfio_region_info with index value set. Caller sets argsz. | ||
238 | * Implementation of region mapping is bus driver specific. This is | ||
239 | * intended to describe MMIO, I/O port, as well as bus specific | ||
240 | * regions (ex. PCI config space). Zero sized regions may be used | ||
241 | * to describe unimplemented regions (ex. unimplemented PCI BARs). | ||
242 | * Return: 0 on success, -errno on failure. | ||
243 | */ | ||
244 | struct vfio_region_info { | ||
245 | __u32 argsz; | ||
246 | __u32 flags; | ||
247 | #define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ | ||
248 | #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ | ||
249 | #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ | ||
250 | __u32 index; /* Region index */ | ||
251 | __u32 resv; /* Reserved for alignment */ | ||
252 | __u64 size; /* Region size (bytes) */ | ||
253 | __u64 offset; /* Region offset from start of device fd */ | ||
254 | }; | ||
255 | #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) | ||
256 | |||
257 | /** | ||
258 | * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, | ||
259 | * struct vfio_irq_info) | ||
260 | * | ||
261 | * Retrieve information about a device IRQ. Caller provides | ||
262 | * struct vfio_irq_info with index value set. Caller sets argsz. | ||
263 | * Implementation of IRQ mapping is bus driver specific. Indexes | ||
264 | * using multiple IRQs are primarily intended to support MSI-like | ||
265 | * interrupt blocks. Zero count irq blocks may be used to describe | ||
266 | * unimplemented interrupt types. | ||
267 | * | ||
268 | * The EVENTFD flag indicates the interrupt index supports eventfd based | ||
269 | * signaling. | ||
270 | * | ||
271 | * The MASKABLE flag indicates the index supports MASK and UNMASK | ||
272 | * actions described below. | ||
273 | * | ||
274 | * AUTOMASKED indicates that after signaling, the interrupt line is | ||
275 | * automatically masked by VFIO and the user needs to unmask the line | ||
276 | * to receive new interrupts. This is primarily intended to distinguish | ||
277 | * level triggered interrupts. | ||
278 | * | ||
279 | * The NORESIZE flag indicates that the interrupt lines within the index | ||
280 | * are setup as a set and new subindexes cannot be enabled without first | ||
281 | * disabling the entire index. This is used for interrupts like PCI MSI | ||
282 | * and MSI-X where the driver may only use a subset of the available | ||
283 | * indexes, but VFIO needs to enable a specific number of vectors | ||
284 | * upfront. In the case of MSI-X, where the user can enable MSI-X and | ||
285 | * then add and unmask vectors, it's up to userspace to make the decision | ||
286 | * whether to allocate the maximum supported number of vectors or tear | ||
287 | * down setup and incrementally increase the vectors as each is enabled. | ||
288 | */ | ||
289 | struct vfio_irq_info { | ||
290 | __u32 argsz; | ||
291 | __u32 flags; | ||
292 | #define VFIO_IRQ_INFO_EVENTFD (1 << 0) | ||
293 | #define VFIO_IRQ_INFO_MASKABLE (1 << 1) | ||
294 | #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) | ||
295 | #define VFIO_IRQ_INFO_NORESIZE (1 << 3) | ||
296 | __u32 index; /* IRQ index */ | ||
297 | __u32 count; /* Number of IRQs within this index */ | ||
298 | }; | ||
299 | #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) | ||
300 | |||
301 | /** | ||
302 | * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) | ||
303 | * | ||
304 | * Set signaling, masking, and unmasking of interrupts. Caller provides | ||
305 | * struct vfio_irq_set with all fields set. 'start' and 'count' indicate | ||
306 | * the range of subindexes being specified. | ||
307 | * | ||
308 | * The DATA flags specify the type of data provided. If DATA_NONE, the | ||
309 | * operation performs the specified action immediately on the specified | ||
310 | * interrupt(s). For example, to unmask AUTOMASKED interrupt [0,0]: | ||
311 | * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1. | ||
312 | * | ||
313 | * DATA_BOOL allows sparse support for the same on arrays of interrupts. | ||
314 | * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]): | ||
315 | * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3, | ||
316 | * data = {1,0,1} | ||
317 | * | ||
318 | * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd. | ||
319 | * A value of -1 can be used to either de-assign interrupts if already | ||
320 | * assigned or skip un-assigned interrupts. For example, to set an eventfd | ||
321 | * to trigger for interrupts [0,0] and [0,2]: | ||
322 | * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3, | ||
323 | * data = {fd1, -1, fd2} | ||
324 | * If index [0,1] is previously set, two count = 1 ioctl calls would be | ||
325 | * required to set [0,0] and [0,2] without changing [0,1]. | ||
326 | * | ||
327 | * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used | ||
328 | * with ACTION_TRIGGER to perform kernel level interrupt loopback testing | ||
329 | * from userspace (ie. simulate hardware triggering). | ||
330 | * | ||
331 | * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER | ||
332 | * enables the interrupt index for the device. Individual subindex interrupts | ||
333 | * can be disabled using the -1 value for DATA_EVENTFD or the index can be | ||
334 | * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0. | ||
335 | * | ||
336 | * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while | ||
337 | * ACTION_TRIGGER specifies kernel->user signaling. | ||
338 | */ | ||
339 | struct vfio_irq_set { | ||
340 | __u32 argsz; | ||
341 | __u32 flags; | ||
342 | #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */ | ||
343 | #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */ | ||
344 | #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */ | ||
345 | #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */ | ||
346 | #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */ | ||
347 | #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */ | ||
348 | __u32 index; | ||
349 | __u32 start; | ||
350 | __u32 count; | ||
351 | __u8 data[]; | ||
352 | }; | ||
353 | #define VFIO_DEVICE_SET_IRQS _IO(VFIO_TYPE, VFIO_BASE + 10) | ||
354 | |||
355 | #define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \ | ||
356 | VFIO_IRQ_SET_DATA_BOOL | \ | ||
357 | VFIO_IRQ_SET_DATA_EVENTFD) | ||
358 | #define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \ | ||
359 | VFIO_IRQ_SET_ACTION_UNMASK | \ | ||
360 | VFIO_IRQ_SET_ACTION_TRIGGER) | ||
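
As a worked example of the DATA_EVENTFD encoding, binding one eventfd to the first MSI vector means appending the signed 32-bit descriptor to the variable-length data[] array (a sketch; "device" is assumed to be an open VFIO device fd on a device that implements the MSI index):

	struct vfio_irq_set *irq_set;
	size_t argsz = sizeof(*irq_set) + sizeof(__s32);
	int efd = eventfd(0, 0);

	irq_set = calloc(1, argsz);
	irq_set->argsz = argsz;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
	irq_set->start = 0;
	irq_set->count = 1;
	memcpy(&irq_set->data, &efd, sizeof(__s32));

	ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
	free(irq_set);
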
361 | /** | ||
362 | * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11) | ||
363 | * | ||
364 | * Reset a device. | ||
365 | */ | ||
366 | #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) | ||
367 | |||
368 | /* | ||
369 | * The VFIO-PCI bus driver makes use of the following fixed region and | ||
370 | * IRQ index mapping. Unimplemented regions return a size of zero. | ||
371 | * Unimplemented IRQ types return a count of zero. | ||
372 | */ | ||
373 | |||
374 | enum { | ||
375 | VFIO_PCI_BAR0_REGION_INDEX, | ||
376 | VFIO_PCI_BAR1_REGION_INDEX, | ||
377 | VFIO_PCI_BAR2_REGION_INDEX, | ||
378 | VFIO_PCI_BAR3_REGION_INDEX, | ||
379 | VFIO_PCI_BAR4_REGION_INDEX, | ||
380 | VFIO_PCI_BAR5_REGION_INDEX, | ||
381 | VFIO_PCI_ROM_REGION_INDEX, | ||
382 | VFIO_PCI_CONFIG_REGION_INDEX, | ||
383 | VFIO_PCI_NUM_REGIONS | ||
384 | }; | ||
385 | |||
386 | enum { | ||
387 | VFIO_PCI_INTX_IRQ_INDEX, | ||
388 | VFIO_PCI_MSI_IRQ_INDEX, | ||
389 | VFIO_PCI_MSIX_IRQ_INDEX, | ||
390 | VFIO_PCI_NUM_IRQS | ||
391 | }; | ||
392 | |||
393 | /* -------- API for Type1 VFIO IOMMU -------- */ | ||
394 | |||
395 | /** | ||
396 | * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_type1_info) | ||
397 | * | ||
398 | * Retrieve information about the IOMMU object. Fills in provided | ||
399 | * struct vfio_iommu_type1_info. Caller sets argsz. | ||
400 | * | ||
401 | * XXX Should we do these by CHECK_EXTENSION too? | ||
402 | */ | ||
403 | struct vfio_iommu_type1_info { | ||
404 | __u32 argsz; | ||
405 | __u32 flags; | ||
406 | #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ | ||
407 | __u64 iova_pgsizes; /* Bitmap of supported page sizes */ | ||
408 | }; | ||
409 | |||
410 | #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) | ||
411 | |||
412 | /** | ||
413 | * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_iommu_type1_dma_map) | ||
414 | * | ||
415 | * Map process virtual addresses to IO virtual addresses using the | ||
416 | * provided struct vfio_iommu_type1_dma_map. Caller sets argsz. READ and/or WRITE required. | ||
417 | */ | ||
418 | struct vfio_iommu_type1_dma_map { | ||
419 | __u32 argsz; | ||
420 | __u32 flags; | ||
421 | #define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ | ||
422 | #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ | ||
423 | __u64 vaddr; /* Process virtual address */ | ||
424 | __u64 iova; /* IO virtual address */ | ||
425 | __u64 size; /* Size of mapping (bytes) */ | ||
426 | }; | ||
427 | |||
428 | #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) | ||
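
To make the mapping direction concrete, a userspace driver hands the kernel both a process virtual address and the IOVA at which the device should see that memory (a sketch; "container" is assumed to be an open, Type1-enabled container fd and the 1 MB size is arbitrary):

	struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };

	dma_map.vaddr = (__u64)(uintptr_t)mmap(NULL, 1024 * 1024,
					       PROT_READ | PROT_WRITE,
					       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	dma_map.size  = 1024 * 1024;
	dma_map.iova  = 0;	/* starting IOVA as seen by the device */
	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
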
429 | |||
430 | /** | ||
431 | * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_iommu_type1_dma_unmap) | ||
432 | * | ||
433 | * Unmap IO virtual addresses using the provided struct vfio_iommu_type1_dma_unmap. | ||
434 | * Caller sets argsz. | ||
435 | */ | ||
436 | struct vfio_iommu_type1_dma_unmap { | ||
437 | __u32 argsz; | ||
438 | __u32 flags; | ||
439 | __u64 iova; /* IO virtual address */ | ||
440 | __u64 size; /* Size of mapping (bytes) */ | ||
441 | }; | ||
442 | |||
443 | #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) | ||
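
The reverse operation reuses the same IOVA and size; as the Type1 unmap path above enforces, both must be aligned to the IOMMU's minimum page size and flags must be zero (continuing the map sketch above):

	struct vfio_iommu_type1_dma_unmap dma_unmap = {
		.argsz = sizeof(dma_unmap),
		.iova  = 0,
		.size  = 1024 * 1024,
	};

	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
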
444 | |||
445 | #endif /* VFIO_H */ | ||