| -rw-r--r-- | Documentation/ioctl/ioctl-number.txt | 1 |
| -rw-r--r-- | Documentation/vfio.txt | 314 |
| -rw-r--r-- | MAINTAINERS | 8 |
| -rw-r--r-- | drivers/Kconfig | 2 |
| -rw-r--r-- | drivers/Makefile | 1 |
| -rw-r--r-- | drivers/vfio/Kconfig | 16 |
| -rw-r--r-- | drivers/vfio/Makefile | 3 |
| -rw-r--r-- | drivers/vfio/pci/Kconfig | 8 |
| -rw-r--r-- | drivers/vfio/pci/Makefile | 4 |
| -rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 579 |
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 1540 |
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_intrs.c | 740 |
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_private.h | 91 |
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_rdwr.c | 269 |
| -rw-r--r-- | drivers/vfio/vfio.c | 1420 |
| -rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 753 |
| -rw-r--r-- | include/linux/vfio.h | 445 |
17 files changed, 6194 insertions, 0 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 915f28c470e9..849b771c5e03 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -88,6 +88,7 @@ Code Seq#(hex) Include File Comments
| 88 | and kernel/power/user.c | 88 | and kernel/power/user.c |
| 89 | '8' all SNP8023 advanced NIC card | 89 | '8' all SNP8023 advanced NIC card |
| 90 | <mailto:mcr@solidum.com> | 90 | <mailto:mcr@solidum.com> |
| 91 | ';' 64-7F linux/vfio.h | ||
| 91 | '@' 00-0F linux/radeonfb.h conflict! | 92 | '@' 00-0F linux/radeonfb.h conflict! |
| 92 | '@' 00-0F drivers/video/aty/aty128fb.c conflict! | 93 | '@' 00-0F drivers/video/aty/aty128fb.c conflict! |
| 93 | 'A' 00-1F linux/apm_bios.h conflict! | 94 | 'A' 00-1F linux/apm_bios.h conflict! |
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
new file mode 100644
index 000000000000..0cb6685c8029
--- /dev/null
+++ b/Documentation/vfio.txt
@@ -0,0 +1,314 @@
| 1 | VFIO - "Virtual Function I/O"[1] | ||
| 2 | ------------------------------------------------------------------------------- | ||
| 3 | Many modern systems now provide DMA and interrupt remapping facilities | ||
| 4 | to help ensure I/O devices behave within the boundaries they've been | ||
| 5 | allotted. This includes x86 hardware with AMD-Vi and Intel VT-d, | ||
| 6 | POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC | ||
| 7 | systems such as Freescale PAMU. The VFIO driver is an IOMMU/device | ||
| 8 | agnostic framework for exposing direct device access to userspace, in | ||
| 9 | a secure, IOMMU protected environment. In other words, this allows | ||
| 10 | safe[2], non-privileged, userspace drivers. | ||
| 11 | |||
| 12 | Why do we want that? Virtual machines often make use of direct device | ||
| 13 | access ("device assignment") when configured for the highest possible | ||
| 14 | I/O performance. From a device and host perspective, this simply | ||
| 15 | turns the VM into a userspace driver, with the benefits of | ||
| 16 | significantly reduced latency, higher bandwidth, and direct use of | ||
| 17 | bare-metal device drivers[3]. | ||
| 18 | |||
| 19 | Some applications, particularly in the high performance computing | ||
| 20 | field, also benefit from low-overhead, direct device access from | ||
| 21 | userspace. Examples include network adapters (often non-TCP/IP based) | ||
| 22 | and compute accelerators. Prior to VFIO, these drivers had to either | ||
| 23 | go through the full development cycle to become a proper upstream | ||
| 24 | driver, be maintained out of tree, or make use of the UIO framework, | ||
| 25 | which has no notion of IOMMU protection, limited interrupt support, | ||
| 26 | and requires root privileges to access things like PCI configuration | ||
| 27 | space. | ||
| 28 | |||
| 29 | The VFIO driver framework intends to unify these, replacing the | ||
| 30 | KVM PCI-specific device assignment code and providing a more | ||
| 31 | secure, more featureful userspace driver environment than UIO. | ||
| 32 | |||
| 33 | Groups, Devices, and IOMMUs | ||
| 34 | ------------------------------------------------------------------------------- | ||
| 35 | |||
| 36 | Devices are the main target of any I/O driver. Devices typically | ||
| 37 | create a programming interface made up of I/O access, interrupts, | ||
| 38 | and DMA. Without going into the details of each of these, DMA is | ||
| 39 | by far the most critical aspect for maintaining a secure environment | ||
| 40 | as allowing a device read-write access to system memory imposes the | ||
| 41 | greatest risk to the overall system integrity. | ||
| 42 | |||
| 43 | To help mitigate this risk, many modern IOMMUs now incorporate | ||
| 44 | isolation properties into what was, in many cases, an interface only | ||
| 45 | meant for translation (ie. solving the addressing problems of devices | ||
| 46 | with limited address spaces). With this, devices can now be isolated | ||
| 47 | from each other and from arbitrary memory access, thus allowing | ||
| 48 | things like secure direct assignment of devices into virtual machines. | ||
| 49 | |||
| 50 | This isolation is not always at the granularity of a single device | ||
| 51 | though. Even when an IOMMU is capable of this, properties of devices, | ||
| 52 | interconnects, and IOMMU topologies can each reduce this isolation. | ||
| 53 | For instance, an individual device may be part of a larger multi- | ||
| 54 | function enclosure. While the IOMMU may be able to distinguish | ||
| 55 | between devices within the enclosure, the enclosure may not require | ||
| 56 | transactions between devices to reach the IOMMU. Examples of this | ||
| 57 | could be anything from a multi-function PCI device with backdoors | ||
| 58 | between functions to a non-PCI-ACS (Access Control Services) capable | ||
| 59 | bridge allowing redirection without reaching the IOMMU. Topology | ||
| 60 | can also play a factor in terms of hiding devices. A PCIe-to-PCI | ||
| 61 | bridge masks the devices behind it, making transactions appear as if | ||
| 62 | from the bridge itself. Obviously IOMMU design is a major factor | ||
| 63 | as well. | ||
| 64 | |||
| 65 | Therefore, while for the most part an IOMMU may have device level | ||
| 66 | granularity, any system is susceptible to reduced granularity. The | ||
| 67 | IOMMU API therefore supports a notion of IOMMU groups. A group is | ||
| 68 | a set of devices which is isolatable from all other devices in the | ||
| 69 | system. Groups are therefore the unit of ownership used by VFIO. | ||
| 70 | |||
| 71 | While the group is the minimum granularity that must be used to | ||
| 72 | ensure secure user access, it's not necessarily the preferred | ||
| 73 | granularity. In IOMMUs which make use of page tables, it may be | ||
| 74 | possible to share a set of page tables between different groups, | ||
| 75 | reducing the overhead both to the platform (reduced TLB thrashing, | ||
| 76 | reduced duplicate page tables), and to the user (programming only | ||
| 77 | a single set of translations). For this reason, VFIO makes use of | ||
| 78 | a container class, which may hold one or more groups. A container | ||
| 79 | is created by simply opening the /dev/vfio/vfio character device. | ||
| 80 | |||
| 81 | On its own, the container provides little functionality, with all | ||
| 82 | but a couple of version and extension query interfaces locked away. | ||
| 83 | The user needs to add a group into the container for the next level | ||
| 84 | of functionality. To do this, the user first needs to identify the | ||
| 85 | group associated with the desired device. This can be done using | ||
| 86 | the sysfs links described in the example below. By unbinding the | ||
| 87 | device from the host driver and binding it to a VFIO driver, a new | ||
| 88 | VFIO group will appear for the group as /dev/vfio/$GROUP, where | ||
| 89 | $GROUP is the IOMMU group number of which the device is a member. | ||
| 90 | If the IOMMU group contains multiple devices, each will need to | ||
| 91 | be bound to a VFIO driver before operations on the VFIO group | ||
| 92 | are allowed (it's also sufficient to only unbind the device from | ||
| 93 | host drivers if a VFIO driver is unavailable; this will make the | ||
| 94 | group available, but not that particular device). TBD - interface | ||
| 95 | for disabling driver probing/locking a device. | ||
| 96 | |||
| 97 | Once the group is ready, it may be added to the container by opening | ||
| 98 | the VFIO group character device (/dev/vfio/$GROUP) and using the | ||
| 99 | VFIO_GROUP_SET_CONTAINER ioctl, passing the file descriptor of the | ||
| 100 | previously opened container file. If desired and if the IOMMU driver | ||
| 101 | supports sharing the IOMMU context between groups, multiple groups may | ||
| 102 | be set to the same container. If a group cannot be added to a container | ||
| 103 | that already holds other groups, a new empty container will need to be used | ||
| 104 | instead. | ||
| 105 | |||
| 106 | With a group (or groups) attached to a container, the remaining | ||
| 107 | ioctls become available, enabling access to the VFIO IOMMU interfaces. | ||
| 108 | Additionally, it now becomes possible to get file descriptors for each | ||
| 109 | device within a group using an ioctl on the VFIO group file descriptor. | ||
| 110 | |||
| 111 | The VFIO device API includes ioctls for describing the device, the I/O | ||
| 112 | regions and their read/write/mmap offsets on the device descriptor, as | ||
| 113 | well as mechanisms for describing and registering interrupt | ||
| 114 | notifications. | ||
| 115 | |||
| 116 | VFIO Usage Example | ||
| 117 | ------------------------------------------------------------------------------- | ||
| 118 | |||
| 119 | Assume the user wants to access PCI device 0000:06:0d.0 | ||
| 120 | |||
| 121 | $ readlink /sys/bus/pci/devices/0000:06:0d.0/iommu_group | ||
| 122 | ../../../../kernel/iommu_groups/26 | ||
| 123 | |||
| 124 | This device is therefore in IOMMU group 26. This device is on the | ||
| 125 | PCI bus, therefore the user will make use of vfio-pci to manage the | ||
| 126 | group: | ||
| 127 | |||
| 128 | # modprobe vfio-pci | ||
| 129 | |||
| 130 | Binding this device to the vfio-pci driver creates the VFIO group | ||
| 131 | character devices for this group: | ||
| 132 | |||
| 133 | $ lspci -n -s 0000:06:0d.0 | ||
| 134 | 06:0d.0 0401: 1102:0002 (rev 08) | ||
| 135 | # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind | ||
| 136 | # echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id | ||
| 137 | |||
| 138 | Now we need to look at what other devices are in the group to free | ||
| 139 | it for use by VFIO: | ||
| 140 | |||
| 141 | $ ls -l /sys/bus/pci/devices/0000:06:0d.0/iommu_group/devices | ||
| 142 | total 0 | ||
| 143 | lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:00:1e.0 -> | ||
| 144 | ../../../../devices/pci0000:00/0000:00:1e.0 | ||
| 145 | lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.0 -> | ||
| 146 | ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.0 | ||
| 147 | lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.1 -> | ||
| 148 | ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.1 | ||
| 149 | |||
| 150 | This device is behind a PCIe-to-PCI bridge[4], therefore we also | ||
| 151 | need to add device 0000:06:0d.1 to the group following the same | ||
| 152 | procedure as above. Device 0000:00:1e.0 is a bridge that does | ||
| 153 | not currently have a host driver, therefore it's not required to | ||
| 154 | bind this device to the vfio-pci driver (vfio-pci does not currently | ||
| 155 | support PCI bridges). | ||
| 156 | |||
| 157 | The final step is to provide the user with access to the group if | ||
| 158 | unprivileged operation is desired (note that /dev/vfio/vfio provides | ||
| 159 | no capabilities on its own and is therefore expected to be set to | ||
| 160 | mode 0666 by the system). | ||
| 161 | |||
| 162 | # chown user:user /dev/vfio/26 | ||
| 163 | |||
| 164 | The user now has full access to all the devices and the iommu for this | ||
| 165 | group and can access them as follows: | ||
| 166 | |||
| 167 | int container, group, device, i; | ||
| 168 | struct vfio_group_status group_status = | ||
| 169 | { .argsz = sizeof(group_status) }; | ||
| 170 | struct vfio_iommu_x86_info iommu_info = { .argsz = sizeof(iommu_info) }; | ||
| 171 | struct vfio_iommu_x86_dma_map dma_map = { .argsz = sizeof(dma_map) }; | ||
| 172 | struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; | ||
| 173 | |||
| 174 | /* Create a new container */ | ||
| 175 | container = open("/dev/vfio/vfio", O_RDWR); | ||
| 176 | |||
| 177 | if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) | ||
| 178 | /* Unknown API version */ | ||
| 179 | |||
| 180 | if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_X86_IOMMU)) | ||
| 181 | /* Doesn't support the IOMMU driver we want. */ | ||
| 182 | |||
| 183 | /* Open the group */ | ||
| 184 | group = open("/dev/vfio/26", O_RDWR); | ||
| 185 | |||
| 186 | /* Test the group is viable and available */ | ||
| 187 | ioctl(group, VFIO_GROUP_GET_STATUS, &group_status); | ||
| 188 | |||
| 189 | if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) | ||
| 190 | /* Group is not viable (ie, not all devices bound for vfio) */ | ||
| 191 | |||
| 192 | /* Add the group to the container */ | ||
| 193 | ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); | ||
| 194 | |||
| 195 | /* Enable the IOMMU model we want */ | ||
| 196 | ioctl(container, VFIO_SET_IOMMU, VFIO_X86_IOMMU); | ||
| 197 | |||
| 198 | /* Get additional IOMMU info */ | ||
| 199 | ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); | ||
| 200 | |||
| 201 | /* Allocate some space and setup a DMA mapping */ | ||
| 202 | dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, | ||
| 203 | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); | ||
| 204 | dma_map.size = 1024 * 1024; | ||
| 205 | dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ | ||
| 206 | dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; | ||
| 207 | |||
| 208 | ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); | ||
| 209 | |||
| 210 | /* Get a file descriptor for the device */ | ||
| 211 | device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); | ||
| 212 | |||
| 213 | /* Test and setup the device */ | ||
| 214 | ioctl(device, VFIO_DEVICE_GET_INFO, &device_info); | ||
| 215 | |||
| 216 | for (i = 0; i < device_info.num_regions; i++) { | ||
| 217 | struct vfio_region_info reg = { .argsz = sizeof(reg) }; | ||
| 218 | |||
| 219 | reg.index = i; | ||
| 220 | |||
| 221 | ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg); | ||
| 222 | |||
| 223 | /* Setup mappings... read/write offsets, mmaps | ||
| 224 | * For PCI devices, config space is a region */ | ||
| 225 | } | ||
| 226 | |||
| 227 | for (i = 0; i < device_info.num_irqs; i++) { | ||
| 228 | struct vfio_irq_info irq = { .argsz = sizeof(irq) }; | ||
| 229 | |||
| 230 | irq.index = i; | ||
| 231 | |||
| 232 | ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq); | ||
| 233 | |||
| 234 | /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */ | ||
| 235 | } | ||
| 236 | |||
| 237 | /* Gratuitous device reset and go... */ | ||
| 238 | ioctl(device, VFIO_DEVICE_RESET); | ||
| 239 | |||
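As a rough sketch, the "Setup mappings" step in the region loop above might look like this for a single BAR, assuming the region reports the MMAP flag and omitting error handling (only names that already appear elsewhere in this patch are used):

	struct vfio_region_info bar = { .argsz = sizeof(bar) };
	void *map;

	bar.index = VFIO_PCI_BAR0_REGION_INDEX;
	ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &bar);

	if (bar.flags & VFIO_REGION_INFO_FLAG_MMAP) {
		/* The region offset is also the mmap offset on the device fd */
		map = mmap(NULL, bar.size, PROT_READ | PROT_WRITE,
			   MAP_SHARED, device, bar.offset);
	} else {
		/* Otherwise access the BAR with pread()/pwrite() at bar.offset */
	}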
| 240 | VFIO User API | ||
| 241 | ------------------------------------------------------------------------------- | ||
| 242 | |||
| 243 | Please see include/linux/vfio.h for complete API documentation. | ||
| 244 | |||
| 245 | VFIO bus driver API | ||
| 246 | ------------------------------------------------------------------------------- | ||
| 247 | |||
| 248 | VFIO bus drivers, such as vfio-pci, make use of only a few interfaces | ||
| 249 | into the VFIO core. When devices are bound to and unbound from the driver, | ||
| 250 | the driver should call vfio_add_group_dev() and vfio_del_group_dev() | ||
| 251 | respectively: | ||
| 252 | |||
| 253 | extern int vfio_add_group_dev(struct device *dev, | ||
| 254 | const struct vfio_device_ops *ops, | ||
| 255 | void *device_data); | ||
| 257 | |||
| 258 | extern void *vfio_del_group_dev(struct device *dev); | ||
| 259 | |||
| 260 | vfio_add_group_dev() indicates to the core to begin tracking the | ||
| 261 | iommu_group of the specified dev and register the dev as owned by | ||
| 262 | a VFIO bus driver. The driver provides an ops structure for callbacks | ||
| 263 | similar to a file operations structure: | ||
| 264 | |||
| 265 | struct vfio_device_ops { | ||
| 266 | int (*open)(void *device_data); | ||
| 267 | void (*release)(void *device_data); | ||
| 268 | ssize_t (*read)(void *device_data, char __user *buf, | ||
| 269 | size_t count, loff_t *ppos); | ||
| 270 | ssize_t (*write)(void *device_data, const char __user *buf, | ||
| 271 | size_t size, loff_t *ppos); | ||
| 272 | long (*ioctl)(void *device_data, unsigned int cmd, | ||
| 273 | unsigned long arg); | ||
| 274 | int (*mmap)(void *device_data, struct vm_area_struct *vma); | ||
| 275 | }; | ||
| 276 | |||
| 277 | Each function is passed the device_data that was originally registered | ||
| 278 | in the vfio_add_group_dev() call above. This allows the bus driver | ||
| 279 | an easy place to store its opaque, private data. The open/release | ||
| 280 | callbacks are issued when a new file descriptor is created for a | ||
| 281 | device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides | ||
| 282 | a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap | ||
| 283 | interfaces implement the device region access defined by the device's | ||
| 284 | own VFIO_DEVICE_GET_REGION_INFO ioctl. | ||
| 285 | |||
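To make the registration flow concrete, a hypothetical skeleton of a VFIO bus driver for an imaginary "foo" bus might look roughly like the following (the vfio-foo name and the foo_* functions are invented for illustration; the wiring mirrors how vfio-pci uses these interfaces in this series):

	#include <linux/device.h>
	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/vfio.h>

	struct vfio_foo_device {
		struct device	*dev;	/* private, per-device state */
	};

	static int vfio_foo_open(void *device_data)
	{
		return 0;		/* enable the device here */
	}

	static void vfio_foo_release(void *device_data)
	{
		/* disable the device here */
	}

	static long vfio_foo_ioctl(void *device_data, unsigned int cmd,
				   unsigned long arg)
	{
		return -ENOTTY;		/* handle VFIO_DEVICE_* ioctls here */
	}

	static const struct vfio_device_ops vfio_foo_ops = {
		.name		= "vfio-foo",
		.open		= vfio_foo_open,
		.release	= vfio_foo_release,
		.ioctl		= vfio_foo_ioctl,
	};

	/* bind: register the device and its private data with VFIO core */
	static int vfio_foo_probe(struct device *dev)
	{
		struct vfio_foo_device *vdev;
		int ret;

		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
		if (!vdev)
			return -ENOMEM;

		vdev->dev = dev;

		ret = vfio_add_group_dev(dev, &vfio_foo_ops, vdev);
		if (ret)
			kfree(vdev);

		return ret;
	}

	/* unbind: vfio_del_group_dev() hands back the device_data to free */
	static int vfio_foo_remove(struct device *dev)
	{
		struct vfio_foo_device *vdev = vfio_del_group_dev(dev);

		kfree(vdev);
		return 0;
	}

Note that vfio_del_group_dev() returns the same device_data pointer that was passed to vfio_add_group_dev(), which is why the remove path can simply free it, much as vfio_pci_remove() does in this patch.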
| 286 | ------------------------------------------------------------------------------- | ||
| 287 | |||
| 288 | [1] VFIO was originally an acronym for "Virtual Function I/O" in its | ||
| 289 | initial implementation by Tom Lyon while at Cisco. We've since | ||
| 290 | outgrown the acronym, but it's catchy. | ||
| 291 | |||
| 292 | [2] "safe" also depends upon a device being "well behaved". It's | ||
| 293 | possible for multi-function devices to have backdoors between | ||
| 294 | functions and even for single function devices to have alternative | ||
| 295 | access to things like PCI config space through MMIO registers. To | ||
| 296 | guard against the former we can include additional precautions in the | ||
| 297 | IOMMU driver to group multi-function PCI devices together | ||
| 298 | (iommu=group_mf). The latter we can't prevent, but the IOMMU should | ||
| 299 | still provide isolation. For PCI, SR-IOV Virtual Functions are the | ||
| 300 | best indicator of "well behaved", as these are designed for | ||
| 301 | virtualization usage models. | ||
| 302 | |||
| 303 | [3] As always there are trade-offs to virtual machine device | ||
| 304 | assignment that are beyond the scope of VFIO. It's expected that | ||
| 305 | future IOMMU technologies will reduce some, but maybe not all, of | ||
| 306 | these trade-offs. | ||
| 307 | |||
| 308 | [4] In this case the device is below a PCI bridge, so transactions | ||
| 309 | from either function of the device are indistinguishable to the iommu: | ||
| 310 | |||
| 311 | -[0000:00]-+-1e.0-[06]--+-0d.0 | ||
| 312 | \-0d.1 | ||
| 313 | |||
| 314 | 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90) | ||
diff --git a/MAINTAINERS b/MAINTAINERS
index 36ed8a14e8e2..6720018bc674 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7414,6 +7414,14 @@ S: Maintained
| 7414 | F: Documentation/filesystems/vfat.txt | 7414 | F: Documentation/filesystems/vfat.txt |
| 7415 | F: fs/fat/ | 7415 | F: fs/fat/ |
| 7416 | 7416 | ||
| 7417 | VFIO DRIVER | ||
| 7418 | M: Alex Williamson <alex.williamson@redhat.com> | ||
| 7419 | L: kvm@vger.kernel.org | ||
| 7420 | S: Maintained | ||
| 7421 | F: Documentation/vfio.txt | ||
| 7422 | F: drivers/vfio/ | ||
| 7423 | F: include/linux/vfio.h | ||
| 7424 | |||
| 7417 | VIDEOBUF2 FRAMEWORK | 7425 | VIDEOBUF2 FRAMEWORK |
| 7418 | M: Pawel Osciak <pawel@osciak.com> | 7426 | M: Pawel Osciak <pawel@osciak.com> |
| 7419 | M: Marek Szyprowski <m.szyprowski@samsung.com> | 7427 | M: Marek Szyprowski <m.szyprowski@samsung.com> |
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 805c432c9439..ece958d3762e 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
| 112 | 112 | ||
| 113 | source "drivers/uio/Kconfig" | 113 | source "drivers/uio/Kconfig" |
| 114 | 114 | ||
| 115 | source "drivers/vfio/Kconfig" | ||
| 116 | |||
| 115 | source "drivers/vlynq/Kconfig" | 117 | source "drivers/vlynq/Kconfig" |
| 116 | 118 | ||
| 117 | source "drivers/virtio/Kconfig" | 119 | source "drivers/virtio/Kconfig" |
diff --git a/drivers/Makefile b/drivers/Makefile
index bd36f09f2246..5b421840c48d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_ATM) += atm/
| 60 | obj-$(CONFIG_FUSION) += message/ | 60 | obj-$(CONFIG_FUSION) += message/ |
| 61 | obj-y += firewire/ | 61 | obj-y += firewire/ |
| 62 | obj-$(CONFIG_UIO) += uio/ | 62 | obj-$(CONFIG_UIO) += uio/ |
| 63 | obj-$(CONFIG_VFIO) += vfio/ | ||
| 63 | obj-y += cdrom/ | 64 | obj-y += cdrom/ |
| 64 | obj-y += auxdisplay/ | 65 | obj-y += auxdisplay/ |
| 65 | obj-$(CONFIG_PCCARD) += pcmcia/ | 66 | obj-$(CONFIG_PCCARD) += pcmcia/ |
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 000000000000..7cd5dec0abd1
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,16 @@
| 1 | config VFIO_IOMMU_TYPE1 | ||
| 2 | tristate | ||
| 3 | depends on VFIO | ||
| 4 | default n | ||
| 5 | |||
| 6 | menuconfig VFIO | ||
| 7 | tristate "VFIO Non-Privileged userspace driver framework" | ||
| 8 | depends on IOMMU_API | ||
| 9 | select VFIO_IOMMU_TYPE1 if X86 | ||
| 10 | help | ||
| 11 | VFIO provides a framework for secure userspace device drivers. | ||
| 12 | See Documentation/vfio.txt for more details. | ||
| 13 | |||
| 14 | If you don't know what to do here, say N. | ||
| 15 | |||
| 16 | source "drivers/vfio/pci/Kconfig" | ||
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 000000000000..2398d4a0e38b
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,3 @@
| 1 | obj-$(CONFIG_VFIO) += vfio.o | ||
| 2 | obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o | ||
| 3 | obj-$(CONFIG_VFIO_PCI) += pci/ | ||
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 000000000000..5980758563eb
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,8 @@
| 1 | config VFIO_PCI | ||
| 2 | tristate "VFIO support for PCI devices" | ||
| 3 | depends on VFIO && PCI && EVENTFD | ||
| 4 | help | ||
| 5 | Support for the PCI VFIO bus driver. This is required to make | ||
| 6 | use of PCI drivers using the VFIO framework. | ||
| 7 | |||
| 8 | If you don't know what to do here, say N. | ||
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 000000000000..131079255fd9
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,4 @@
| 1 | |||
| 2 | vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o | ||
| 3 | |||
| 4 | obj-$(CONFIG_VFIO_PCI) += vfio-pci.o | ||
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 3 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License version 2 as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * Derived from original vfio: | ||
| 10 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 11 | * Author: Tom Lyon, pugs@cisco.com | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/device.h> | ||
| 15 | #include <linux/eventfd.h> | ||
| 16 | #include <linux/interrupt.h> | ||
| 17 | #include <linux/iommu.h> | ||
| 18 | #include <linux/module.h> | ||
| 19 | #include <linux/mutex.h> | ||
| 20 | #include <linux/notifier.h> | ||
| 21 | #include <linux/pci.h> | ||
| 22 | #include <linux/pm_runtime.h> | ||
| 23 | #include <linux/slab.h> | ||
| 24 | #include <linux/types.h> | ||
| 25 | #include <linux/uaccess.h> | ||
| 26 | #include <linux/vfio.h> | ||
| 27 | |||
| 28 | #include "vfio_pci_private.h" | ||
| 29 | |||
| 30 | #define DRIVER_VERSION "0.2" | ||
| 31 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
| 32 | #define DRIVER_DESC "VFIO PCI - User Level meta-driver" | ||
| 33 | |||
| 34 | static bool nointxmask; | ||
| 35 | module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); | ||
| 36 | MODULE_PARM_DESC(nointxmask, | ||
| 37 | "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); | ||
| 38 | |||
| 39 | static int vfio_pci_enable(struct vfio_pci_device *vdev) | ||
| 40 | { | ||
| 41 | struct pci_dev *pdev = vdev->pdev; | ||
| 42 | int ret; | ||
| 43 | u16 cmd; | ||
| 44 | u8 msix_pos; | ||
| 45 | |||
| 46 | vdev->reset_works = (pci_reset_function(pdev) == 0); | ||
| 47 | pci_save_state(pdev); | ||
| 48 | vdev->pci_saved_state = pci_store_saved_state(pdev); | ||
| 49 | if (!vdev->pci_saved_state) | ||
| 50 | pr_debug("%s: Couldn't store %s saved state\n", | ||
| 51 | __func__, dev_name(&pdev->dev)); | ||
| 52 | |||
| 53 | ret = vfio_config_init(vdev); | ||
| 54 | if (ret) | ||
| 55 | goto out; | ||
| 56 | |||
| 57 | if (likely(!nointxmask)) | ||
| 58 | vdev->pci_2_3 = pci_intx_mask_supported(pdev); | ||
| 59 | |||
| 60 | pci_read_config_word(pdev, PCI_COMMAND, &cmd); | ||
| 61 | if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { | ||
| 62 | cmd &= ~PCI_COMMAND_INTX_DISABLE; | ||
| 63 | pci_write_config_word(pdev, PCI_COMMAND, cmd); | ||
| 64 | } | ||
| 65 | |||
| 66 | msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); | ||
| 67 | if (msix_pos) { | ||
| 68 | u16 flags; | ||
| 69 | u32 table; | ||
| 70 | |||
| 71 | pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); | ||
| 72 | pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); | ||
| 73 | |||
| 74 | vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK; | ||
| 75 | vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; | ||
| 76 | vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; | ||
| 77 | } else | ||
| 78 | vdev->msix_bar = 0xFF; | ||
| 79 | |||
| 80 | ret = pci_enable_device(pdev); | ||
| 81 | if (ret) | ||
| 82 | goto out; | ||
| 83 | |||
| 84 | return ret; | ||
| 85 | |||
| 86 | out: | ||
| 87 | kfree(vdev->pci_saved_state); | ||
| 88 | vdev->pci_saved_state = NULL; | ||
| 89 | vfio_config_free(vdev); | ||
| 90 | return ret; | ||
| 91 | } | ||
| 92 | |||
| 93 | static void vfio_pci_disable(struct vfio_pci_device *vdev) | ||
| 94 | { | ||
| 95 | int bar; | ||
| 96 | |||
| 97 | pci_disable_device(vdev->pdev); | ||
| 98 | |||
| 99 | vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | | ||
| 100 | VFIO_IRQ_SET_ACTION_TRIGGER, | ||
| 101 | vdev->irq_type, 0, 0, NULL); | ||
| 102 | |||
| 103 | vdev->virq_disabled = false; | ||
| 104 | |||
| 105 | vfio_config_free(vdev); | ||
| 106 | |||
| 107 | pci_reset_function(vdev->pdev); | ||
| 108 | |||
| 109 | if (pci_load_and_free_saved_state(vdev->pdev, | ||
| 110 | &vdev->pci_saved_state) == 0) | ||
| 111 | pci_restore_state(vdev->pdev); | ||
| 112 | else | ||
| 113 | pr_info("%s: Couldn't reload %s saved state\n", | ||
| 114 | __func__, dev_name(&vdev->pdev->dev)); | ||
| 115 | |||
| 116 | for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { | ||
| 117 | if (!vdev->barmap[bar]) | ||
| 118 | continue; | ||
| 119 | pci_iounmap(vdev->pdev, vdev->barmap[bar]); | ||
| 120 | pci_release_selected_regions(vdev->pdev, 1 << bar); | ||
| 121 | vdev->barmap[bar] = NULL; | ||
| 122 | } | ||
| 123 | } | ||
| 124 | |||
| 125 | static void vfio_pci_release(void *device_data) | ||
| 126 | { | ||
| 127 | struct vfio_pci_device *vdev = device_data; | ||
| 128 | |||
| 129 | if (atomic_dec_and_test(&vdev->refcnt)) | ||
| 130 | vfio_pci_disable(vdev); | ||
| 131 | |||
| 132 | module_put(THIS_MODULE); | ||
| 133 | } | ||
| 134 | |||
| 135 | static int vfio_pci_open(void *device_data) | ||
| 136 | { | ||
| 137 | struct vfio_pci_device *vdev = device_data; | ||
| 138 | |||
| 139 | if (!try_module_get(THIS_MODULE)) | ||
| 140 | return -ENODEV; | ||
| 141 | |||
| 142 | if (atomic_inc_return(&vdev->refcnt) == 1) { | ||
| 143 | int ret = vfio_pci_enable(vdev); | ||
| 144 | if (ret) { | ||
| 145 | module_put(THIS_MODULE); | ||
| 146 | return ret; | ||
| 147 | } | ||
| 148 | } | ||
| 149 | |||
| 150 | return 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) | ||
| 154 | { | ||
| 155 | if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { | ||
| 156 | u8 pin; | ||
| 157 | pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); | ||
| 158 | if (pin) | ||
| 159 | return 1; | ||
| 160 | |||
| 161 | } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { | ||
| 162 | u8 pos; | ||
| 163 | u16 flags; | ||
| 164 | |||
| 165 | pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI); | ||
| 166 | if (pos) { | ||
| 167 | pci_read_config_word(vdev->pdev, | ||
| 168 | pos + PCI_MSI_FLAGS, &flags); | ||
| 169 | |||
| 170 | return 1 << (flags & PCI_MSI_FLAGS_QMASK); | ||
| 171 | } | ||
| 172 | } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { | ||
| 173 | u8 pos; | ||
| 174 | u16 flags; | ||
| 175 | |||
| 176 | pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX); | ||
| 177 | if (pos) { | ||
| 178 | pci_read_config_word(vdev->pdev, | ||
| 179 | pos + PCI_MSIX_FLAGS, &flags); | ||
| 180 | |||
| 181 | return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; | ||
| 182 | } | ||
| 183 | } | ||
| 184 | |||
| 185 | return 0; | ||
| 186 | } | ||
| 187 | |||
| 188 | static long vfio_pci_ioctl(void *device_data, | ||
| 189 | unsigned int cmd, unsigned long arg) | ||
| 190 | { | ||
| 191 | struct vfio_pci_device *vdev = device_data; | ||
| 192 | unsigned long minsz; | ||
| 193 | |||
| 194 | if (cmd == VFIO_DEVICE_GET_INFO) { | ||
| 195 | struct vfio_device_info info; | ||
| 196 | |||
| 197 | minsz = offsetofend(struct vfio_device_info, num_irqs); | ||
| 198 | |||
| 199 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
| 200 | return -EFAULT; | ||
| 201 | |||
| 202 | if (info.argsz < minsz) | ||
| 203 | return -EINVAL; | ||
| 204 | |||
| 205 | info.flags = VFIO_DEVICE_FLAGS_PCI; | ||
| 206 | |||
| 207 | if (vdev->reset_works) | ||
| 208 | info.flags |= VFIO_DEVICE_FLAGS_RESET; | ||
| 209 | |||
| 210 | info.num_regions = VFIO_PCI_NUM_REGIONS; | ||
| 211 | info.num_irqs = VFIO_PCI_NUM_IRQS; | ||
| 212 | |||
| 213 | return copy_to_user((void __user *)arg, &info, minsz); | ||
| 214 | |||
| 215 | } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { | ||
| 216 | struct pci_dev *pdev = vdev->pdev; | ||
| 217 | struct vfio_region_info info; | ||
| 218 | |||
| 219 | minsz = offsetofend(struct vfio_region_info, offset); | ||
| 220 | |||
| 221 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
| 222 | return -EFAULT; | ||
| 223 | |||
| 224 | if (info.argsz < minsz) | ||
| 225 | return -EINVAL; | ||
| 226 | |||
| 227 | switch (info.index) { | ||
| 228 | case VFIO_PCI_CONFIG_REGION_INDEX: | ||
| 229 | info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); | ||
| 230 | info.size = pdev->cfg_size; | ||
| 231 | info.flags = VFIO_REGION_INFO_FLAG_READ | | ||
| 232 | VFIO_REGION_INFO_FLAG_WRITE; | ||
| 233 | break; | ||
| 234 | case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: | ||
| 235 | info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); | ||
| 236 | info.size = pci_resource_len(pdev, info.index); | ||
| 237 | if (!info.size) { | ||
| 238 | info.flags = 0; | ||
| 239 | break; | ||
| 240 | } | ||
| 241 | |||
| 242 | info.flags = VFIO_REGION_INFO_FLAG_READ | | ||
| 243 | VFIO_REGION_INFO_FLAG_WRITE; | ||
| 244 | if (pci_resource_flags(pdev, info.index) & | ||
| 245 | IORESOURCE_MEM && info.size >= PAGE_SIZE) | ||
| 246 | info.flags |= VFIO_REGION_INFO_FLAG_MMAP; | ||
| 247 | break; | ||
| 248 | case VFIO_PCI_ROM_REGION_INDEX: | ||
| 249 | { | ||
| 250 | void __iomem *io; | ||
| 251 | size_t size; | ||
| 252 | |||
| 253 | info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); | ||
| 254 | info.flags = 0; | ||
| 255 | |||
| 256 | /* Report the BAR size, not the ROM size */ | ||
| 257 | info.size = pci_resource_len(pdev, info.index); | ||
| 258 | if (!info.size) | ||
| 259 | break; | ||
| 260 | |||
| 261 | /* Is it really there? */ | ||
| 262 | io = pci_map_rom(pdev, &size); | ||
| 263 | if (!io || !size) { | ||
| 264 | info.size = 0; | ||
| 265 | break; | ||
| 266 | } | ||
| 267 | pci_unmap_rom(pdev, io); | ||
| 268 | |||
| 269 | info.flags = VFIO_REGION_INFO_FLAG_READ; | ||
| 270 | break; | ||
| 271 | } | ||
| 272 | default: | ||
| 273 | return -EINVAL; | ||
| 274 | } | ||
| 275 | |||
| 276 | return copy_to_user((void __user *)arg, &info, minsz); | ||
| 277 | |||
| 278 | } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { | ||
| 279 | struct vfio_irq_info info; | ||
| 280 | |||
| 281 | minsz = offsetofend(struct vfio_irq_info, count); | ||
| 282 | |||
| 283 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
| 284 | return -EFAULT; | ||
| 285 | |||
| 286 | if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) | ||
| 287 | return -EINVAL; | ||
| 288 | |||
| 289 | info.flags = VFIO_IRQ_INFO_EVENTFD; | ||
| 290 | |||
| 291 | info.count = vfio_pci_get_irq_count(vdev, info.index); | ||
| 292 | |||
| 293 | if (info.index == VFIO_PCI_INTX_IRQ_INDEX) | ||
| 294 | info.flags |= (VFIO_IRQ_INFO_MASKABLE | | ||
| 295 | VFIO_IRQ_INFO_AUTOMASKED); | ||
| 296 | else | ||
| 297 | info.flags |= VFIO_IRQ_INFO_NORESIZE; | ||
| 298 | |||
| 299 | return copy_to_user((void __user *)arg, &info, minsz); | ||
| 300 | |||
| 301 | } else if (cmd == VFIO_DEVICE_SET_IRQS) { | ||
| 302 | struct vfio_irq_set hdr; | ||
| 303 | u8 *data = NULL; | ||
| 304 | int ret = 0; | ||
| 305 | |||
| 306 | minsz = offsetofend(struct vfio_irq_set, count); | ||
| 307 | |||
| 308 | if (copy_from_user(&hdr, (void __user *)arg, minsz)) | ||
| 309 | return -EFAULT; | ||
| 310 | |||
| 311 | if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || | ||
| 312 | hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | | ||
| 313 | VFIO_IRQ_SET_ACTION_TYPE_MASK)) | ||
| 314 | return -EINVAL; | ||
| 315 | |||
| 316 | if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
| 317 | size_t size; | ||
| 318 | |||
| 319 | if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) | ||
| 320 | size = sizeof(uint8_t); | ||
| 321 | else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) | ||
| 322 | size = sizeof(int32_t); | ||
| 323 | else | ||
| 324 | return -EINVAL; | ||
| 325 | |||
| 326 | if (hdr.argsz - minsz < hdr.count * size || | ||
| 327 | hdr.count > vfio_pci_get_irq_count(vdev, hdr.index)) | ||
| 328 | return -EINVAL; | ||
| 329 | |||
| 330 | data = kmalloc(hdr.count * size, GFP_KERNEL); | ||
| 331 | if (!data) | ||
| 332 | return -ENOMEM; | ||
| 333 | |||
| 334 | if (copy_from_user(data, (void __user *)(arg + minsz), | ||
| 335 | hdr.count * size)) { | ||
| 336 | kfree(data); | ||
| 337 | return -EFAULT; | ||
| 338 | } | ||
| 339 | } | ||
| 340 | |||
| 341 | mutex_lock(&vdev->igate); | ||
| 342 | |||
| 343 | ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, | ||
| 344 | hdr.start, hdr.count, data); | ||
| 345 | |||
| 346 | mutex_unlock(&vdev->igate); | ||
| 347 | kfree(data); | ||
| 348 | |||
| 349 | return ret; | ||
| 350 | |||
| 351 | } else if (cmd == VFIO_DEVICE_RESET) | ||
| 352 | return vdev->reset_works ? | ||
| 353 | pci_reset_function(vdev->pdev) : -EINVAL; | ||
| 354 | |||
| 355 | return -ENOTTY; | ||
| 356 | } | ||
| 357 | |||
| 358 | static ssize_t vfio_pci_read(void *device_data, char __user *buf, | ||
| 359 | size_t count, loff_t *ppos) | ||
| 360 | { | ||
| 361 | unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
| 362 | struct vfio_pci_device *vdev = device_data; | ||
| 363 | struct pci_dev *pdev = vdev->pdev; | ||
| 364 | |||
| 365 | if (index >= VFIO_PCI_NUM_REGIONS) | ||
| 366 | return -EINVAL; | ||
| 367 | |||
| 368 | if (index == VFIO_PCI_CONFIG_REGION_INDEX) | ||
| 369 | return vfio_pci_config_readwrite(vdev, buf, count, ppos, false); | ||
| 370 | else if (index == VFIO_PCI_ROM_REGION_INDEX) | ||
| 371 | return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false); | ||
| 372 | else if (pci_resource_flags(pdev, index) & IORESOURCE_IO) | ||
| 373 | return vfio_pci_io_readwrite(vdev, buf, count, ppos, false); | ||
| 374 | else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) | ||
| 375 | return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false); | ||
| 376 | |||
| 377 | return -EINVAL; | ||
| 378 | } | ||
| 379 | |||
| 380 | static ssize_t vfio_pci_write(void *device_data, const char __user *buf, | ||
| 381 | size_t count, loff_t *ppos) | ||
| 382 | { | ||
| 383 | unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
| 384 | struct vfio_pci_device *vdev = device_data; | ||
| 385 | struct pci_dev *pdev = vdev->pdev; | ||
| 386 | |||
| 387 | if (index >= VFIO_PCI_NUM_REGIONS) | ||
| 388 | return -EINVAL; | ||
| 389 | |||
| 390 | if (index == VFIO_PCI_CONFIG_REGION_INDEX) | ||
| 391 | return vfio_pci_config_readwrite(vdev, (char __user *)buf, | ||
| 392 | count, ppos, true); | ||
| 393 | else if (index == VFIO_PCI_ROM_REGION_INDEX) | ||
| 394 | return -EINVAL; | ||
| 395 | else if (pci_resource_flags(pdev, index) & IORESOURCE_IO) | ||
| 396 | return vfio_pci_io_readwrite(vdev, (char __user *)buf, | ||
| 397 | count, ppos, true); | ||
| 398 | else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) { | ||
| 399 | return vfio_pci_mem_readwrite(vdev, (char __user *)buf, | ||
| 400 | count, ppos, true); | ||
| 401 | } | ||
| 402 | |||
| 403 | return -EINVAL; | ||
| 404 | } | ||
| 405 | |||
| 406 | static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) | ||
| 407 | { | ||
| 408 | struct vfio_pci_device *vdev = device_data; | ||
| 409 | struct pci_dev *pdev = vdev->pdev; | ||
| 410 | unsigned int index; | ||
| 411 | u64 phys_len, req_len, pgoff, req_start, phys; | ||
| 412 | int ret; | ||
| 413 | |||
| 414 | index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); | ||
| 415 | |||
| 416 | if (vma->vm_end < vma->vm_start) | ||
| 417 | return -EINVAL; | ||
| 418 | if ((vma->vm_flags & VM_SHARED) == 0) | ||
| 419 | return -EINVAL; | ||
| 420 | if (index >= VFIO_PCI_ROM_REGION_INDEX) | ||
| 421 | return -EINVAL; | ||
| 422 | if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) | ||
| 423 | return -EINVAL; | ||
| 424 | |||
| 425 | phys_len = pci_resource_len(pdev, index); | ||
| 426 | req_len = vma->vm_end - vma->vm_start; | ||
| 427 | pgoff = vma->vm_pgoff & | ||
| 428 | ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); | ||
| 429 | req_start = pgoff << PAGE_SHIFT; | ||
| 430 | |||
| 431 | if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) | ||
| 432 | return -EINVAL; | ||
| 433 | |||
| 434 | if (index == vdev->msix_bar) { | ||
| 435 | /* | ||
| 436 | * Disallow mmaps overlapping the MSI-X table; users don't | ||
| 437 | * get to touch this directly. We could find somewhere | ||
| 438 | * else to map the overlap, but page granularity is only | ||
| 439 | * a recommendation, not a requirement, so the user needs | ||
| 440 | * to know which bits are real. Requiring them to mmap | ||
| 441 | * around the table makes that clear. | ||
| 442 | */ | ||
| 443 | |||
| 444 | /* If neither entirely above nor below, then it overlaps */ | ||
| 445 | if (!(req_start >= vdev->msix_offset + vdev->msix_size || | ||
| 446 | req_start + req_len <= vdev->msix_offset)) | ||
| 447 | return -EINVAL; | ||
| 448 | } | ||
| 449 | |||
| 450 | /* | ||
| 451 | * Even though we don't make use of the barmap for the mmap, | ||
| 452 | * we need to request the region and the barmap tracks that. | ||
| 453 | */ | ||
| 454 | if (!vdev->barmap[index]) { | ||
| 455 | ret = pci_request_selected_regions(pdev, | ||
| 456 | 1 << index, "vfio-pci"); | ||
| 457 | if (ret) | ||
| 458 | return ret; | ||
| 459 | |||
| 460 | vdev->barmap[index] = pci_iomap(pdev, index, 0); | ||
| 461 | } | ||
| 462 | |||
| 463 | vma->vm_private_data = vdev; | ||
| 464 | vma->vm_flags |= (VM_IO | VM_RESERVED); | ||
| 465 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
| 466 | |||
| 467 | phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; | ||
| 468 | |||
| 469 | return remap_pfn_range(vma, vma->vm_start, phys, | ||
| 470 | req_len, vma->vm_page_prot); | ||
| 471 | } | ||
| 472 | |||
| 473 | static const struct vfio_device_ops vfio_pci_ops = { | ||
| 474 | .name = "vfio-pci", | ||
| 475 | .open = vfio_pci_open, | ||
| 476 | .release = vfio_pci_release, | ||
| 477 | .ioctl = vfio_pci_ioctl, | ||
| 478 | .read = vfio_pci_read, | ||
| 479 | .write = vfio_pci_write, | ||
| 480 | .mmap = vfio_pci_mmap, | ||
| 481 | }; | ||
| 482 | |||
| 483 | static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) | ||
| 484 | { | ||
| 485 | u8 type; | ||
| 486 | struct vfio_pci_device *vdev; | ||
| 487 | struct iommu_group *group; | ||
| 488 | int ret; | ||
| 489 | |||
| 490 | pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type); | ||
| 491 | if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) | ||
| 492 | return -EINVAL; | ||
| 493 | |||
| 494 | group = iommu_group_get(&pdev->dev); | ||
| 495 | if (!group) | ||
| 496 | return -EINVAL; | ||
| 497 | |||
| 498 | vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); | ||
| 499 | if (!vdev) { | ||
| 500 | iommu_group_put(group); | ||
| 501 | return -ENOMEM; | ||
| 502 | } | ||
| 503 | |||
| 504 | vdev->pdev = pdev; | ||
| 505 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
| 506 | mutex_init(&vdev->igate); | ||
| 507 | spin_lock_init(&vdev->irqlock); | ||
| 508 | atomic_set(&vdev->refcnt, 0); | ||
| 509 | |||
| 510 | ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); | ||
| 511 | if (ret) { | ||
| 512 | iommu_group_put(group); | ||
| 513 | kfree(vdev); | ||
| 514 | } | ||
| 515 | |||
| 516 | return ret; | ||
| 517 | } | ||
| 518 | |||
| 519 | static void vfio_pci_remove(struct pci_dev *pdev) | ||
| 520 | { | ||
| 521 | struct vfio_pci_device *vdev; | ||
| 522 | |||
| 523 | vdev = vfio_del_group_dev(&pdev->dev); | ||
| 524 | if (!vdev) | ||
| 525 | return; | ||
| 526 | |||
| 527 | iommu_group_put(pdev->dev.iommu_group); | ||
| 528 | kfree(vdev); | ||
| 529 | } | ||
| 530 | |||
| 531 | static struct pci_driver vfio_pci_driver = { | ||
| 532 | .name = "vfio-pci", | ||
| 533 | .id_table = NULL, /* only dynamic ids */ | ||
| 534 | .probe = vfio_pci_probe, | ||
| 535 | .remove = vfio_pci_remove, | ||
| 536 | }; | ||
| 537 | |||
| 538 | static void __exit vfio_pci_cleanup(void) | ||
| 539 | { | ||
| 540 | pci_unregister_driver(&vfio_pci_driver); | ||
| 541 | vfio_pci_virqfd_exit(); | ||
| 542 | vfio_pci_uninit_perm_bits(); | ||
| 543 | } | ||
| 544 | |||
| 545 | static int __init vfio_pci_init(void) | ||
| 546 | { | ||
| 547 | int ret; | ||
| 548 | |||
| 549 | /* Allocate shared config space permission data used by all devices */ | ||
| 550 | ret = vfio_pci_init_perm_bits(); | ||
| 551 | if (ret) | ||
| 552 | return ret; | ||
| 553 | |||
| 554 | /* Start the virqfd cleanup handler */ | ||
| 555 | ret = vfio_pci_virqfd_init(); | ||
| 556 | if (ret) | ||
| 557 | goto out_virqfd; | ||
| 558 | |||
| 559 | /* Register and scan for devices */ | ||
| 560 | ret = pci_register_driver(&vfio_pci_driver); | ||
| 561 | if (ret) | ||
| 562 | goto out_driver; | ||
| 563 | |||
| 564 | return 0; | ||
| 565 | |||
| 566 | out_virqfd: | ||
| 567 | vfio_pci_virqfd_exit(); | ||
| 568 | out_driver: | ||
| 569 | vfio_pci_uninit_perm_bits(); | ||
| 570 | return ret; | ||
| 571 | } | ||
| 572 | |||
| 573 | module_init(vfio_pci_init); | ||
| 574 | module_exit(vfio_pci_cleanup); | ||
| 575 | |||
| 576 | MODULE_VERSION(DRIVER_VERSION); | ||
| 577 | MODULE_LICENSE("GPL v2"); | ||
| 578 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
| 579 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 000000000000..8b8f7d11e102
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1540 @@
| 1 | /* | ||
| 2 | * VFIO PCI config space virtualization | ||
| 3 | * | ||
| 4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * Derived from original vfio: | ||
| 12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 13 | * Author: Tom Lyon, pugs@cisco.com | ||
| 14 | */ | ||
| 15 | |||
| 16 | /* | ||
| 17 | * This code handles reading and writing of PCI configuration registers. | ||
| 18 | * This is hairy because we want to allow a lot of flexibility to the | ||
| 19 | * user driver, but cannot trust it with all of the config fields. | ||
| 20 | * Tables determine which fields can be read and written, as well as | ||
| 21 | * which fields are 'virtualized' - special actions and translations to | ||
| 22 | * make it appear to the user that he has control, when in fact things | ||
| 23 | * must be negotiated with the underlying OS. | ||
| 24 | */ | ||
| 25 | |||
| 26 | #include <linux/fs.h> | ||
| 27 | #include <linux/pci.h> | ||
| 28 | #include <linux/uaccess.h> | ||
| 29 | #include <linux/vfio.h> | ||
| 30 | |||
| 31 | #include "vfio_pci_private.h" | ||
| 32 | |||
| 33 | #define PCI_CFG_SPACE_SIZE 256 | ||
| 34 | |||
| 35 | /* Useful "pseudo" capabilities */ | ||
| 36 | #define PCI_CAP_ID_BASIC 0 | ||
| 37 | #define PCI_CAP_ID_INVALID 0xFF | ||
| 38 | |||
| 39 | #define is_bar(offset) \ | ||
| 40 | ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \ | ||
| 41 | (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4)) | ||
| 42 | |||
| 43 | /* | ||
| 44 | * Lengths of PCI Config Capabilities | ||
| 45 | * 0: Removed from the user visible capability list | ||
| 46 | * FF: Variable length | ||
| 47 | */ | ||
| 48 | static u8 pci_cap_length[] = { | ||
| 49 | [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */ | ||
| 50 | [PCI_CAP_ID_PM] = PCI_PM_SIZEOF, | ||
| 51 | [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF, | ||
| 52 | [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF, | ||
| 53 | [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */ | ||
| 54 | [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */ | ||
| 55 | [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */ | ||
| 56 | [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */ | ||
| 57 | [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */ | ||
| 58 | [PCI_CAP_ID_VNDR] = 0xFF, /* variable */ | ||
| 59 | [PCI_CAP_ID_DBG] = 0, /* debug - don't care */ | ||
| 60 | [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */ | ||
| 61 | [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */ | ||
| 62 | [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */ | ||
| 63 | [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */ | ||
| 64 | [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */ | ||
| 65 | [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */ | ||
| 66 | [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF, | ||
| 67 | [PCI_CAP_ID_SATA] = 0xFF, | ||
| 68 | [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF, | ||
| 69 | }; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Lengths of PCIe/PCI-X Extended Config Capabilities | ||
| 73 | * 0: Removed or masked from the user visible capability list | ||
| 74 | * FF: Variable length | ||
| 75 | */ | ||
| 76 | static u16 pci_ext_cap_length[] = { | ||
| 77 | [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND, | ||
| 78 | [PCI_EXT_CAP_ID_VC] = 0xFF, | ||
| 79 | [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF, | ||
| 80 | [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF, | ||
| 81 | [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */ | ||
| 82 | [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */ | ||
| 83 | [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */ | ||
| 84 | [PCI_EXT_CAP_ID_MFVC] = 0xFF, | ||
| 85 | [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */ | ||
| 86 | [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */ | ||
| 87 | [PCI_EXT_CAP_ID_VNDR] = 0xFF, | ||
| 88 | [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */ | ||
| 89 | [PCI_EXT_CAP_ID_ACS] = 0xFF, | ||
| 90 | [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF, | ||
| 91 | [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF, | ||
| 92 | [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF, | ||
| 93 | [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */ | ||
| 94 | [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF, | ||
| 95 | [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF, | ||
| 96 | [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */ | ||
| 97 | [PCI_EXT_CAP_ID_REBAR] = 0xFF, | ||
| 98 | [PCI_EXT_CAP_ID_DPA] = 0xFF, | ||
| 99 | [PCI_EXT_CAP_ID_TPH] = 0xFF, | ||
| 100 | [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, | ||
| 101 | [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ | ||
| 102 | [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ | ||
| 103 | [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ | ||
| 104 | }; | ||
| 105 | |||
| 106 | /* | ||
| 107 | * Read/Write Permission Bits - one bit for each bit in capability | ||
| 108 | * Any field can be read if it exists, but what is read depends on | ||
| 109 | * whether the field is 'virtualized', or just passed through to the | ||
| 110 | * hardware. Any virtualized field is also virtualized for writes. | ||
| 111 | * Writes are only permitted if they have a 1 bit here. | ||
| 112 | */ | ||
| 113 | struct perm_bits { | ||
| 114 | u8 *virt; /* read/write virtual data, not hw */ | ||
| 115 | u8 *write; /* writeable bits */ | ||
| 116 | int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, | ||
| 117 | struct perm_bits *perm, int offset, __le32 *val); | ||
| 118 | int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, | ||
| 119 | struct perm_bits *perm, int offset, __le32 val); | ||
| 120 | }; | ||
| 121 | |||
| 122 | #define NO_VIRT 0 | ||
| 123 | #define ALL_VIRT 0xFFFFFFFFU | ||
| 124 | #define NO_WRITE 0 | ||
| 125 | #define ALL_WRITE 0xFFFFFFFFU | ||
| 126 | |||
| 127 | static int vfio_user_config_read(struct pci_dev *pdev, int offset, | ||
| 128 | __le32 *val, int count) | ||
| 129 | { | ||
| 130 | int ret = -EINVAL; | ||
| 131 | u32 tmp_val = 0; | ||
| 132 | |||
| 133 | switch (count) { | ||
| 134 | case 1: | ||
| 135 | { | ||
| 136 | u8 tmp; | ||
| 137 | ret = pci_user_read_config_byte(pdev, offset, &tmp); | ||
| 138 | tmp_val = tmp; | ||
| 139 | break; | ||
| 140 | } | ||
| 141 | case 2: | ||
| 142 | { | ||
| 143 | u16 tmp; | ||
| 144 | ret = pci_user_read_config_word(pdev, offset, &tmp); | ||
| 145 | tmp_val = tmp; | ||
| 146 | break; | ||
| 147 | } | ||
| 148 | case 4: | ||
| 149 | ret = pci_user_read_config_dword(pdev, offset, &tmp_val); | ||
| 150 | break; | ||
| 151 | } | ||
| 152 | |||
| 153 | *val = cpu_to_le32(tmp_val); | ||
| 154 | |||
| 155 | return pcibios_err_to_errno(ret); | ||
| 156 | } | ||
| 157 | |||
| 158 | static int vfio_user_config_write(struct pci_dev *pdev, int offset, | ||
| 159 | __le32 val, int count) | ||
| 160 | { | ||
| 161 | int ret = -EINVAL; | ||
| 162 | u32 tmp_val = le32_to_cpu(val); | ||
| 163 | |||
| 164 | switch (count) { | ||
| 165 | case 1: | ||
| 166 | ret = pci_user_write_config_byte(pdev, offset, tmp_val); | ||
| 167 | break; | ||
| 168 | case 2: | ||
| 169 | ret = pci_user_write_config_word(pdev, offset, tmp_val); | ||
| 170 | break; | ||
| 171 | case 4: | ||
| 172 | ret = pci_user_write_config_dword(pdev, offset, tmp_val); | ||
| 173 | break; | ||
| 174 | } | ||
| 175 | |||
| 176 | return pcibios_err_to_errno(ret); | ||
| 177 | } | ||
| 178 | |||
| 179 | static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, | ||
| 180 | int count, struct perm_bits *perm, | ||
| 181 | int offset, __le32 *val) | ||
| 182 | { | ||
| 183 | __le32 virt = 0; | ||
| 184 | |||
| 185 | memcpy(val, vdev->vconfig + pos, count); | ||
| 186 | |||
| 187 | memcpy(&virt, perm->virt + offset, count); | ||
| 188 | |||
| 189 | /* Any non-virtualized bits? */ | ||
| 190 | if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) { | ||
| 191 | struct pci_dev *pdev = vdev->pdev; | ||
| 192 | __le32 phys_val = 0; | ||
| 193 | int ret; | ||
| 194 | |||
| 195 | ret = vfio_user_config_read(pdev, pos, &phys_val, count); | ||
| 196 | if (ret) | ||
| 197 | return ret; | ||
| 198 | |||
| 199 | *val = (phys_val & ~virt) | (*val & virt); | ||
| 200 | } | ||
| 201 | |||
| 202 | return count; | ||
| 203 | } | ||
| 204 | |||
| 205 | static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, | ||
| 206 | int count, struct perm_bits *perm, | ||
| 207 | int offset, __le32 val) | ||
| 208 | { | ||
| 209 | __le32 virt = 0, write = 0; | ||
| 210 | |||
| 211 | memcpy(&write, perm->write + offset, count); | ||
| 212 | |||
| 213 | if (!write) | ||
| 214 | return count; /* drop, no writable bits */ | ||
| 215 | |||
| 216 | memcpy(&virt, perm->virt + offset, count); | ||
| 217 | |||
| 218 | /* Virtualized and writable bits go to vconfig */ | ||
| 219 | if (write & virt) { | ||
| 220 | __le32 virt_val = 0; | ||
| 221 | |||
| 222 | memcpy(&virt_val, vdev->vconfig + pos, count); | ||
| 223 | |||
| 224 | virt_val &= ~(write & virt); | ||
| 225 | virt_val |= (val & (write & virt)); | ||
| 226 | |||
| 227 | memcpy(vdev->vconfig + pos, &virt_val, count); | ||
| 228 | } | ||
| 229 | |||
| 230 | /* Non-virtualized and writable bits go to hardware */ | ||
| 231 | if (write & ~virt) { | ||
| 232 | struct pci_dev *pdev = vdev->pdev; | ||
| 233 | __le32 phys_val = 0; | ||
| 234 | int ret; | ||
| 235 | |||
| 236 | ret = vfio_user_config_read(pdev, pos, &phys_val, count); | ||
| 237 | if (ret) | ||
| 238 | return ret; | ||
| 239 | |||
| 240 | phys_val &= ~(write & ~virt); | ||
| 241 | phys_val |= (val & (write & ~virt)); | ||
| 242 | |||
| 243 | ret = vfio_user_config_write(pdev, pos, phys_val, count); | ||
| 244 | if (ret) | ||
| 245 | return ret; | ||
| 246 | } | ||
| 247 | |||
| 248 | return count; | ||
| 249 | } | ||
| 250 | |||
| 251 | /* Allow direct read from hardware, except for capability next pointer */ | ||
| 252 | static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, | ||
| 253 | int count, struct perm_bits *perm, | ||
| 254 | int offset, __le32 *val) | ||
| 255 | { | ||
| 256 | int ret; | ||
| 257 | |||
| 258 | ret = vfio_user_config_read(vdev->pdev, pos, val, count); | ||
| 259 | if (ret) | ||
| 260 | return pcibios_err_to_errno(ret); | ||
| 261 | |||
| 262 | if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ | ||
| 263 | if (offset < 4) | ||
| 264 | memcpy(val, vdev->vconfig + pos, count); | ||
| 265 | } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */ | ||
| 266 | if (offset == PCI_CAP_LIST_ID && count > 1) | ||
| 267 | memcpy(val, vdev->vconfig + pos, | ||
| 268 | min(PCI_CAP_FLAGS, count)); | ||
| 269 | else if (offset == PCI_CAP_LIST_NEXT) | ||
| 270 | memcpy(val, vdev->vconfig + pos, 1); | ||
| 271 | } | ||
| 272 | |||
| 273 | return count; | ||
| 274 | } | ||
| 275 | |||
| 276 | static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, | ||
| 277 | int count, struct perm_bits *perm, | ||
| 278 | int offset, __le32 val) | ||
| 279 | { | ||
| 280 | int ret; | ||
| 281 | |||
| 282 | ret = vfio_user_config_write(vdev->pdev, pos, val, count); | ||
| 283 | if (ret) | ||
| 284 | return ret; | ||
| 285 | |||
| 286 | return count; | ||
| 287 | } | ||
| 288 | |||
| 289 | /* Default all regions to read-only, no-virtualization */ | ||
| 290 | static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { | ||
| 291 | [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } | ||
| 292 | }; | ||
| 293 | static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { | ||
| 294 | [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } | ||
| 295 | }; | ||
| 296 | |||
| 297 | static void free_perm_bits(struct perm_bits *perm) | ||
| 298 | { | ||
| 299 | kfree(perm->virt); | ||
| 300 | kfree(perm->write); | ||
| 301 | perm->virt = NULL; | ||
| 302 | perm->write = NULL; | ||
| 303 | } | ||
| 304 | |||
| 305 | static int alloc_perm_bits(struct perm_bits *perm, int size) | ||
| 306 | { | ||
| 307 | /* | ||
| 308 | * Round up all permission bits to the next dword, this lets us | ||
| 309 | * ignore whether a read/write exceeds the defined capability | ||
| 310 | * structure. We can do this because: | ||
| 311 | * - Standard config space is already dword aligned | ||
| 312 | * - Capabilities are all dword aligned (bits 0:1 of next reserved) | ||
| 313 | * - Express capabilities defined as dword aligned | ||
| 314 | */ | ||
| 315 | size = round_up(size, 4); | ||
| 316 | |||
| 317 | /* | ||
| 318 | * Zero state is | ||
| 319 | * - All Readable, None Writable, None Virtualized | ||
| 320 | */ | ||
| 321 | perm->virt = kzalloc(size, GFP_KERNEL); | ||
| 322 | perm->write = kzalloc(size, GFP_KERNEL); | ||
| 323 | if (!perm->virt || !perm->write) { | ||
| 324 | free_perm_bits(perm); | ||
| 325 | return -ENOMEM; | ||
| 326 | } | ||
| 327 | |||
| 328 | perm->readfn = vfio_default_config_read; | ||
| 329 | perm->writefn = vfio_default_config_write; | ||
| 330 | |||
| 331 | return 0; | ||
| 332 | } | ||
| 333 | |||
| 334 | /* | ||
| 335 | * Helper functions for filling in permission tables | ||
| 336 | */ | ||
| 337 | static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write) | ||
| 338 | { | ||
| 339 | p->virt[off] = virt; | ||
| 340 | p->write[off] = write; | ||
| 341 | } | ||
| 342 | |||
| 343 | /* Handle endian-ness - pci and tables are little-endian */ | ||
| 344 | static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write) | ||
| 345 | { | ||
| 346 | *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt); | ||
| 347 | *(__le16 *)(&p->write[off]) = cpu_to_le16(write); | ||
| 348 | } | ||
| 349 | |||
| 350 | /* Handle endian-ness - pci and tables are little-endian */ | ||
| 351 | static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) | ||
| 352 | { | ||
| 353 | *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt); | ||
| 354 | *(__le32 *)(&p->write[off]) = cpu_to_le32(write); | ||
| 355 | } | ||
| 356 | |||
| 357 | /* | ||
| 358 | * Restore the *real* BARs after we detect a FLR or backdoor reset. | ||
| 359 | * (backdoor = some device specific technique that we didn't catch) | ||
| 360 | */ | ||
| 361 | static void vfio_bar_restore(struct vfio_pci_device *vdev) | ||
| 362 | { | ||
| 363 | struct pci_dev *pdev = vdev->pdev; | ||
| 364 | u32 *rbar = vdev->rbar; | ||
| 365 | int i; | ||
| 366 | |||
| 367 | if (pdev->is_virtfn) | ||
| 368 | return; | ||
| 369 | |||
| 370 | pr_info("%s: %s reset recovery - restoring bars\n", | ||
| 371 | __func__, dev_name(&pdev->dev)); | ||
| 372 | |||
| 373 | for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++) | ||
| 374 | pci_user_write_config_dword(pdev, i, *rbar); | ||
| 375 | |||
| 376 | pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar); | ||
| 377 | } | ||
| 378 | |||
| 379 | static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) | ||
| 380 | { | ||
| 381 | unsigned long flags = pci_resource_flags(pdev, bar); | ||
| 382 | u32 val; | ||
| 383 | |||
| 384 | if (flags & IORESOURCE_IO) | ||
| 385 | return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO); | ||
| 386 | |||
| 387 | val = PCI_BASE_ADDRESS_SPACE_MEMORY; | ||
| 388 | |||
| 389 | if (flags & IORESOURCE_PREFETCH) | ||
| 390 | val |= PCI_BASE_ADDRESS_MEM_PREFETCH; | ||
| 391 | |||
| 392 | if (flags & IORESOURCE_MEM_64) | ||
| 393 | val |= PCI_BASE_ADDRESS_MEM_TYPE_64; | ||
| 394 | |||
| 395 | return cpu_to_le32(val); | ||
| 396 | } | ||
| 397 | |||
| 398 | /* | ||
| 399 | * Pretend we're hardware and tweak the values of the *virtual* PCI BARs | ||
| 400 | * to reflect the hardware capabilities. This implements BAR sizing. | ||
| 401 | */ | ||
| 402 | static void vfio_bar_fixup(struct vfio_pci_device *vdev) | ||
| 403 | { | ||
| 404 | struct pci_dev *pdev = vdev->pdev; | ||
| 405 | int i; | ||
| 406 | __le32 *bar; | ||
| 407 | u64 mask; | ||
| 408 | |||
| 409 | bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; | ||
| 410 | |||
| 411 | for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) { | ||
| 412 | if (!pci_resource_start(pdev, i)) { | ||
| 413 | *bar = 0; /* Unmapped by host = unimplemented to user */ | ||
| 414 | continue; | ||
| 415 | } | ||
| 416 | |||
| 417 | mask = ~(pci_resource_len(pdev, i) - 1); | ||
| 418 | |||
| 419 | *bar &= cpu_to_le32((u32)mask); | ||
| 420 | *bar |= vfio_generate_bar_flags(pdev, i); | ||
| 421 | |||
| 422 | if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { | ||
| 423 | bar++; | ||
| 424 | *bar &= cpu_to_le32((u32)(mask >> 32)); | ||
| 425 | i++; | ||
| 426 | } | ||
| 427 | } | ||
| 428 | |||
| 429 | bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; | ||
| 430 | |||
| 431 | /* | ||
| 432 | * NB. we expose the actual BAR size here, regardless of whether | ||
| 433 | * we can read it. When we report the REGION_INFO for the ROM | ||
| 434 | * we report what PCI tells us is the actual ROM size. | ||
| 435 | */ | ||
| 436 | if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { | ||
| 437 | mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); | ||
| 438 | mask |= PCI_ROM_ADDRESS_ENABLE; | ||
| 439 | *bar &= cpu_to_le32((u32)mask); | ||
| 440 | } else | ||
| 441 | *bar = 0; | ||
| 442 | |||
| 443 | vdev->bardirty = false; | ||
| 444 | } | ||
| 445 | |||
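
From the user's point of view, the fixup above is what makes standard BAR sizing work: write all ones to the (virtual) BAR, read it back, and invert the size mask. A small sketch (hypothetical helper, not part of the commit) of decoding such a read-back for a memory BAR:

    #include <stdint.h>

    /* lo/hi are the values read back after writing all ones to the BAR(s) */
    static uint64_t bar_size_from_readback(uint32_t lo, uint32_t hi, int is_64bit)
    {
            /* Low 4 bits of a memory BAR are type/prefetch flags, not size */
            uint64_t val = lo & ~0xfULL;

            if (is_64bit)
                    val |= (uint64_t)hi << 32;
            else
                    val |= 0xffffffff00000000ULL;  /* upper half acts as all ones */

            return ~val + 1;  /* e.g. readback 0xfffff000 -> 0x1000 (4K BAR) */
    }
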
| 446 | static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, | ||
| 447 | int count, struct perm_bits *perm, | ||
| 448 | int offset, __le32 *val) | ||
| 449 | { | ||
| 450 | if (is_bar(offset)) /* pos == offset for basic config */ | ||
| 451 | vfio_bar_fixup(vdev); | ||
| 452 | |||
| 453 | count = vfio_default_config_read(vdev, pos, count, perm, offset, val); | ||
| 454 | |||
| 455 | /* Mask in virtual memory enable for SR-IOV devices */ | ||
| 456 | if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) { | ||
| 457 | u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); | ||
| 458 | u32 tmp_val = le32_to_cpu(*val); | ||
| 459 | |||
| 460 | tmp_val |= cmd & PCI_COMMAND_MEMORY; | ||
| 461 | *val = cpu_to_le32(tmp_val); | ||
| 462 | } | ||
| 463 | |||
| 464 | return count; | ||
| 465 | } | ||
| 466 | |||
| 467 | static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, | ||
| 468 | int count, struct perm_bits *perm, | ||
| 469 | int offset, __le32 val) | ||
| 470 | { | ||
| 471 | struct pci_dev *pdev = vdev->pdev; | ||
| 472 | __le16 *virt_cmd; | ||
| 473 | u16 new_cmd = 0; | ||
| 474 | int ret; | ||
| 475 | |||
| 476 | virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND]; | ||
| 477 | |||
| 478 | if (offset == PCI_COMMAND) { | ||
| 479 | bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io; | ||
| 480 | u16 phys_cmd; | ||
| 481 | |||
| 482 | ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd); | ||
| 483 | if (ret) | ||
| 484 | return ret; | ||
| 485 | |||
| 486 | new_cmd = le32_to_cpu(val); | ||
| 487 | |||
| 488 | phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY); | ||
| 489 | virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); | ||
| 490 | new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); | ||
| 491 | |||
| 492 | phys_io = !!(phys_cmd & PCI_COMMAND_IO); | ||
| 493 | virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO); | ||
| 494 | new_io = !!(new_cmd & PCI_COMMAND_IO); | ||
| 495 | |||
| 496 | /* | ||
| 497 | * If the user is writing mem/io enable (new_mem/io) and we | ||
| 498 | * think it's already enabled (virt_mem/io), but the hardware | ||
| 499 | * shows it disabled (phys_mem/io), then the device has | ||
| 500 | * undergone some kind of backdoor reset and needs to be | ||
| 501 | * restored before we allow it to enable the bars. | ||
| 502 | * SR-IOV devices will trigger this, but we catch them later. | ||
| 503 | */ | ||
| 504 | if ((new_mem && virt_mem && !phys_mem) || | ||
| 505 | (new_io && virt_io && !phys_io)) | ||
| 506 | vfio_bar_restore(vdev); | ||
| 507 | } | ||
| 508 | |||
| 509 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | ||
| 510 | if (count < 0) | ||
| 511 | return count; | ||
| 512 | |||
| 513 | /* | ||
| 514 | * Save current memory/io enable bits in vconfig to allow for | ||
| 515 | * the test above next time. | ||
| 516 | */ | ||
| 517 | if (offset == PCI_COMMAND) { | ||
| 518 | u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO; | ||
| 519 | |||
| 520 | *virt_cmd &= cpu_to_le16(~mask); | ||
| 521 | *virt_cmd |= cpu_to_le16(new_cmd & mask); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* Emulate INTx disable */ | ||
| 525 | if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) { | ||
| 526 | bool virt_intx_disable; | ||
| 527 | |||
| 528 | virt_intx_disable = !!(le16_to_cpu(*virt_cmd) & | ||
| 529 | PCI_COMMAND_INTX_DISABLE); | ||
| 530 | |||
| 531 | if (virt_intx_disable && !vdev->virq_disabled) { | ||
| 532 | vdev->virq_disabled = true; | ||
| 533 | vfio_pci_intx_mask(vdev); | ||
| 534 | } else if (!virt_intx_disable && vdev->virq_disabled) { | ||
| 535 | vdev->virq_disabled = false; | ||
| 536 | vfio_pci_intx_unmask(vdev); | ||
| 537 | } | ||
| 538 | } | ||
| 539 | |||
| 540 | if (is_bar(offset)) | ||
| 541 | vdev->bardirty = true; | ||
| 542 | |||
| 543 | return count; | ||
| 544 | } | ||
| 545 | |||
| 546 | /* Permissions for the Basic PCI Header */ | ||
| 547 | static int __init init_pci_cap_basic_perm(struct perm_bits *perm) | ||
| 548 | { | ||
| 549 | if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF)) | ||
| 550 | return -ENOMEM; | ||
| 551 | |||
| 552 | perm->readfn = vfio_basic_config_read; | ||
| 553 | perm->writefn = vfio_basic_config_write; | ||
| 554 | |||
| 555 | /* Virtualized for SR-IOV functions, which just have FFFF */ | ||
| 556 | p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE); | ||
| 557 | p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE); | ||
| 558 | |||
| 559 | /* | ||
| 560 | * Virtualize INTx disable, we use it internally for interrupt | ||
| 561 | * control and can emulate it for non-PCI 2.3 devices. | ||
| 562 | */ | ||
| 563 | p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE); | ||
| 564 | |||
| 565 | /* Virtualize capability list, we might want to skip/disable */ | ||
| 566 | p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE); | ||
| 567 | |||
| 568 | /* No harm to write */ | ||
| 569 | p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE); | ||
| 570 | p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE); | ||
| 571 | p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE); | ||
| 572 | |||
| 573 | /* Virtualize all bars, can't touch the real ones */ | ||
| 574 | p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE); | ||
| 575 | p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE); | ||
| 576 | p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE); | ||
| 577 | p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE); | ||
| 578 | p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE); | ||
| 579 | p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE); | ||
| 580 | p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE); | ||
| 581 | |||
| 582 | /* Allow us to adjust capability chain */ | ||
| 583 | p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE); | ||
| 584 | |||
| 585 | /* Sometimes used by sw, just virtualize */ | ||
| 586 | p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE); | ||
| 587 | return 0; | ||
| 588 | } | ||
| 589 | |||
| 590 | /* Permissions for the Power Management capability */ | ||
| 591 | static int __init init_pci_cap_pm_perm(struct perm_bits *perm) | ||
| 592 | { | ||
| 593 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM])) | ||
| 594 | return -ENOMEM; | ||
| 595 | |||
| 596 | /* | ||
| 597 | * We always virtualize the next field so we can remove | ||
| 598 | * capabilities from the chain if we want to. | ||
| 599 | */ | ||
| 600 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
| 601 | |||
| 602 | /* | ||
| 603 | * Power management is defined *per function*, | ||
| 604 | * so we let the user write this | ||
| 605 | */ | ||
| 606 | p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE); | ||
| 607 | return 0; | ||
| 608 | } | ||
| 609 | |||
| 610 | /* Permissions for PCI-X capability */ | ||
| 611 | static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) | ||
| 612 | { | ||
| 613 | /* Alloc 24, but only 8 are used in v0 */ | ||
| 614 | if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2)) | ||
| 615 | return -ENOMEM; | ||
| 616 | |||
| 617 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
| 618 | |||
| 619 | p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE); | ||
| 620 | p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE); | ||
| 621 | return 0; | ||
| 622 | } | ||
| 623 | |||
| 624 | /* Permissions for PCI Express capability */ | ||
| 625 | static int __init init_pci_cap_exp_perm(struct perm_bits *perm) | ||
| 626 | { | ||
| 627 | /* Alloc larger of two possible sizes */ | ||
| 628 | if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2)) | ||
| 629 | return -ENOMEM; | ||
| 630 | |||
| 631 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
| 632 | |||
| 633 | /* | ||
| 634 | * Allow writes to device control fields (includes FLR!) | ||
| 635 | * but not to devctl_phantom which could confuse IOMMU | ||
| 636 | * or to the ARI bit in devctl2 which is set at probe time | ||
| 637 | */ | ||
| 638 | p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM); | ||
| 639 | p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); | ||
| 640 | return 0; | ||
| 641 | } | ||
| 642 | |||
| 643 | /* Permissions for Advanced Function capability */ | ||
| 644 | static int __init init_pci_cap_af_perm(struct perm_bits *perm) | ||
| 645 | { | ||
| 646 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF])) | ||
| 647 | return -ENOMEM; | ||
| 648 | |||
| 649 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
| 650 | p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR); | ||
| 651 | return 0; | ||
| 652 | } | ||
| 653 | |||
| 654 | /* Permissions for Advanced Error Reporting extended capability */ | ||
| 655 | static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm) | ||
| 656 | { | ||
| 657 | u32 mask; | ||
| 658 | |||
| 659 | if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR])) | ||
| 660 | return -ENOMEM; | ||
| 661 | |||
| 662 | /* | ||
| 663 | * Virtualize the first dword of all express capabilities | ||
| 664 | * because it includes the next pointer. This lets us later | ||
| 665 | * remove capabilities from the chain if we need to. | ||
| 666 | */ | ||
| 667 | p_setd(perm, 0, ALL_VIRT, NO_WRITE); | ||
| 668 | |||
| 669 | /* Writable bits mask */ | ||
| 670 | mask = PCI_ERR_UNC_TRAIN | /* Training */ | ||
| 671 | PCI_ERR_UNC_DLP | /* Data Link Protocol */ | ||
| 672 | PCI_ERR_UNC_SURPDN | /* Surprise Down */ | ||
| 673 | PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */ | ||
| 674 | PCI_ERR_UNC_FCP | /* Flow Control Protocol */ | ||
| 675 | PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */ | ||
| 676 | PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */ | ||
| 677 | PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */ | ||
| 678 | PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */ | ||
| 679 | PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */ | ||
| 680 | PCI_ERR_UNC_ECRC | /* ECRC Error Status */ | ||
| 681 | PCI_ERR_UNC_UNSUP | /* Unsupported Request */ | ||
| 682 | PCI_ERR_UNC_ACSV | /* ACS Violation */ | ||
| 683 | PCI_ERR_UNC_INTN | /* internal error */ | ||
| 684 | PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */ | ||
| 685 | PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */ | ||
| 686 | PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */ | ||
| 687 | p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask); | ||
| 688 | p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask); | ||
| 689 | p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask); | ||
| 690 | |||
| 691 | mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */ | ||
| 692 | PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */ | ||
| 693 | PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */ | ||
| 694 | PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */ | ||
| 695 | PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */ | ||
| 696 | PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */ | ||
| 697 | PCI_ERR_COR_INTERNAL | /* Corrected Internal */ | ||
| 698 | PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */ | ||
| 699 | p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask); | ||
| 700 | p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask); | ||
| 701 | |||
| 702 | mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */ | ||
| 703 | PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */ | ||
| 704 | p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask); | ||
| 705 | return 0; | ||
| 706 | } | ||
| 707 | |||
| 708 | /* Permissions for Power Budgeting extended capability */ | ||
| 709 | static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) | ||
| 710 | { | ||
| 711 | if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR])) | ||
| 712 | return -ENOMEM; | ||
| 713 | |||
| 714 | p_setd(perm, 0, ALL_VIRT, NO_WRITE); | ||
| 715 | |||
| 716 | /* Writing the data selector is OK, the info is still read-only */ | ||
| 717 | p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE); | ||
| 718 | return 0; | ||
| 719 | } | ||
| 720 | |||
| 721 | /* | ||
| 722 | * Initialize the shared permission tables | ||
| 723 | */ | ||
| 724 | void vfio_pci_uninit_perm_bits(void) | ||
| 725 | { | ||
| 726 | free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]); | ||
| 727 | |||
| 728 | free_perm_bits(&cap_perms[PCI_CAP_ID_PM]); | ||
| 729 | free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]); | ||
| 730 | free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]); | ||
| 731 | free_perm_bits(&cap_perms[PCI_CAP_ID_AF]); | ||
| 732 | |||
| 733 | free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]); | ||
| 734 | free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]); | ||
| 735 | } | ||
| 736 | |||
| 737 | int __init vfio_pci_init_perm_bits(void) | ||
| 738 | { | ||
| 739 | int ret; | ||
| 740 | |||
| 741 | /* Basic config space */ | ||
| 742 | ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]); | ||
| 743 | |||
| 744 | /* Capabilities */ | ||
| 745 | ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); | ||
| 746 | cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; | ||
| 747 | ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); | ||
| 748 | cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; | ||
| 749 | ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); | ||
| 750 | ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); | ||
| 751 | |||
| 752 | /* Extended capabilities */ | ||
| 753 | ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); | ||
| 754 | ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); | ||
| 755 | ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; | ||
| 756 | |||
| 757 | if (ret) | ||
| 758 | vfio_pci_uninit_perm_bits(); | ||
| 759 | |||
| 760 | return ret; | ||
| 761 | } | ||
| 762 | |||
| 763 | static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) | ||
| 764 | { | ||
| 765 | u8 cap; | ||
| 766 | int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : | ||
| 767 | PCI_STD_HEADER_SIZEOF; | ||
| 768 | base /= 4; | ||
| 769 | pos /= 4; | ||
| 770 | |||
| 771 | cap = vdev->pci_config_map[pos]; | ||
| 772 | |||
| 773 | if (cap == PCI_CAP_ID_BASIC) | ||
| 774 | return 0; | ||
| 775 | |||
| 776 | /* XXX Can we have two abutting capabilities of the same type? */ | ||
| 777 | while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) | ||
| 778 | pos--; | ||
| 779 | |||
| 780 | return pos * 4; | ||
| 781 | } | ||
| 782 | |||
| 783 | static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, | ||
| 784 | int count, struct perm_bits *perm, | ||
| 785 | int offset, __le32 *val) | ||
| 786 | { | ||
| 787 | /* Update max available queue size from msi_qmax */ | ||
| 788 | if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { | ||
| 789 | __le16 *flags; | ||
| 790 | int start; | ||
| 791 | |||
| 792 | start = vfio_find_cap_start(vdev, pos); | ||
| 793 | |||
| 794 | flags = (__le16 *)&vdev->vconfig[start]; | ||
| 795 | |||
| 796 | *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK); | ||
| 797 | *flags |= cpu_to_le16(vdev->msi_qmax << 1); | ||
| 798 | } | ||
| 799 | |||
| 800 | return vfio_default_config_read(vdev, pos, count, perm, offset, val); | ||
| 801 | } | ||
| 802 | |||
| 803 | static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, | ||
| 804 | int count, struct perm_bits *perm, | ||
| 805 | int offset, __le32 val) | ||
| 806 | { | ||
| 807 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | ||
| 808 | if (count < 0) | ||
| 809 | return count; | ||
| 810 | |||
| 811 | /* Fixup and write configured queue size and enable to hardware */ | ||
| 812 | if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { | ||
| 813 | __le16 *pflags; | ||
| 814 | u16 flags; | ||
| 815 | int start, ret; | ||
| 816 | |||
| 817 | start = vfio_find_cap_start(vdev, pos); | ||
| 818 | |||
| 819 | pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS]; | ||
| 820 | |||
| 821 | flags = le16_to_cpu(*pflags); | ||
| 822 | |||
| 823 | /* MSI is enabled via ioctl */ | ||
| 824 | if (!is_msi(vdev)) | ||
| 825 | flags &= ~PCI_MSI_FLAGS_ENABLE; | ||
| 826 | |||
| 827 | /* Check queue size */ | ||
| 828 | if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) { | ||
| 829 | flags &= ~PCI_MSI_FLAGS_QSIZE; | ||
| 830 | flags |= vdev->msi_qmax << 4; | ||
| 831 | } | ||
| 832 | |||
| 833 | /* Write back to virt and to hardware */ | ||
| 834 | *pflags = cpu_to_le16(flags); | ||
| 835 | ret = pci_user_write_config_word(vdev->pdev, | ||
| 836 | start + PCI_MSI_FLAGS, | ||
| 837 | flags); | ||
| 838 | if (ret) | ||
| 839 | return pcibios_err_to_errno(ret); | ||
| 840 | } | ||
| 841 | |||
| 842 | return count; | ||
| 843 | } | ||
| 844 | |||
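
The shifts above come from the MSI Message Control layout, where both the "multiple message capable" and "multiple message enable" fields encode log2 of a vector count. A standalone sketch (hypothetical helpers; the literal masks correspond to PCI_MSI_FLAGS_QMASK and PCI_MSI_FLAGS_QSIZE from linux/pci_regs.h):

    #include <stdint.h>

    /* Bits 3:1 - how many vectors the device can request (log2 encoded) */
    unsigned int msi_vectors_supported(uint16_t flags)
    {
            return 1u << ((flags & 0x000e) >> 1);   /* PCI_MSI_FLAGS_QMASK */
    }

    /* Bits 6:4 - how many vectors software has enabled (log2 encoded) */
    unsigned int msi_vectors_enabled(uint16_t flags)
    {
            return 1u << ((flags & 0x0070) >> 4);   /* PCI_MSI_FLAGS_QSIZE */
    }
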
| 845 | /* | ||
| 846 | * MSI determination is per-device, so this routine gets used beyond | ||
| 847 | * initialization time. Don't add __init | ||
| 848 | */ | ||
| 849 | static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags) | ||
| 850 | { | ||
| 851 | if (alloc_perm_bits(perm, len)) | ||
| 852 | return -ENOMEM; | ||
| 853 | |||
| 854 | perm->readfn = vfio_msi_config_read; | ||
| 855 | perm->writefn = vfio_msi_config_write; | ||
| 856 | |||
| 857 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | ||
| 858 | |||
| 859 | /* | ||
| 860 | * The upper byte of the control register is reserved, | ||
| 861 | * just set up the lower byte. | ||
| 862 | */ | ||
| 863 | p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE); | ||
| 864 | p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE); | ||
| 865 | if (flags & PCI_MSI_FLAGS_64BIT) { | ||
| 866 | p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE); | ||
| 867 | p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE); | ||
| 868 | if (flags & PCI_MSI_FLAGS_MASKBIT) { | ||
| 869 | p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE); | ||
| 870 | p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE); | ||
| 871 | } | ||
| 872 | } else { | ||
| 873 | p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE); | ||
| 874 | if (flags & PCI_MSI_FLAGS_MASKBIT) { | ||
| 875 | p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE); | ||
| 876 | p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE); | ||
| 877 | } | ||
| 878 | } | ||
| 879 | return 0; | ||
| 880 | } | ||
| 881 | |||
| 882 | /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ | ||
| 883 | static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) | ||
| 884 | { | ||
| 885 | struct pci_dev *pdev = vdev->pdev; | ||
| 886 | int len, ret; | ||
| 887 | u16 flags; | ||
| 888 | |||
| 889 | ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags); | ||
| 890 | if (ret) | ||
| 891 | return pcibios_err_to_errno(ret); | ||
| 892 | |||
| 893 | len = 10; /* Minimum size */ | ||
| 894 | if (flags & PCI_MSI_FLAGS_64BIT) | ||
| 895 | len += 4; | ||
| 896 | if (flags & PCI_MSI_FLAGS_MASKBIT) | ||
| 897 | len += 10; | ||
| 898 | |||
| 899 | if (vdev->msi_perm) | ||
| 900 | return len; | ||
| 901 | |||
| 902 | vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL); | ||
| 903 | if (!vdev->msi_perm) | ||
| 904 | return -ENOMEM; | ||
| 905 | |||
| 906 | ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags); | ||
| 907 | if (ret) | ||
| 908 | return ret; | ||
| 909 | |||
| 910 | return len; | ||
| 911 | } | ||
| 912 | |||
| 913 | /* Determine extended capability length for VC (2 & 9) and MFVC */ | ||
| 914 | static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) | ||
| 915 | { | ||
| 916 | struct pci_dev *pdev = vdev->pdev; | ||
| 917 | u32 tmp; | ||
| 918 | int ret, evcc, phases, vc_arb; | ||
| 919 | int len = PCI_CAP_VC_BASE_SIZEOF; | ||
| 920 | |||
| 921 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp); | ||
| 922 | if (ret) | ||
| 923 | return pcibios_err_to_errno(ret); | ||
| 924 | |||
| 925 | evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */ | ||
| 926 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp); | ||
| 927 | if (ret) | ||
| 928 | return pcibios_err_to_errno(ret); | ||
| 929 | |||
| 930 | if (tmp & PCI_VC_REG2_128_PHASE) | ||
| 931 | phases = 128; | ||
| 932 | else if (tmp & PCI_VC_REG2_64_PHASE) | ||
| 933 | phases = 64; | ||
| 934 | else if (tmp & PCI_VC_REG2_32_PHASE) | ||
| 935 | phases = 32; | ||
| 936 | else | ||
| 937 | phases = 0; | ||
| 938 | |||
| 939 | vc_arb = phases * 4; | ||
| 940 | |||
| 941 | /* | ||
| 942 | * Port arbitration tables are root & switch only; | ||
| 943 | * function arbitration tables are function 0 only. | ||
| 944 | * In either case, we'll never let the user write them so | ||
| 945 | * we don't care how big they are. | ||
| 946 | */ | ||
| 947 | len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF; | ||
| 948 | if (vc_arb) { | ||
| 949 | len = round_up(len, 16); | ||
| 950 | len += vc_arb / 8; | ||
| 951 | } | ||
| 952 | return len; | ||
| 953 | } | ||
| 954 | |||
| 955 | static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) | ||
| 956 | { | ||
| 957 | struct pci_dev *pdev = vdev->pdev; | ||
| 958 | u16 word; | ||
| 959 | u8 byte; | ||
| 960 | int ret; | ||
| 961 | |||
| 962 | switch (cap) { | ||
| 963 | case PCI_CAP_ID_MSI: | ||
| 964 | return vfio_msi_cap_len(vdev, pos); | ||
| 965 | case PCI_CAP_ID_PCIX: | ||
| 966 | ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word); | ||
| 967 | if (ret) | ||
| 968 | return pcibios_err_to_errno(ret); | ||
| 969 | |||
| 970 | if (PCI_X_CMD_VERSION(word)) { | ||
| 971 | vdev->extended_caps = true; | ||
| 972 | return PCI_CAP_PCIX_SIZEOF_V2; | ||
| 973 | } else | ||
| 974 | return PCI_CAP_PCIX_SIZEOF_V0; | ||
| 975 | case PCI_CAP_ID_VNDR: | ||
| 976 | /* length follows next field */ | ||
| 977 | ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte); | ||
| 978 | if (ret) | ||
| 979 | return pcibios_err_to_errno(ret); | ||
| 980 | |||
| 981 | return byte; | ||
| 982 | case PCI_CAP_ID_EXP: | ||
| 983 | /* length based on version */ | ||
| 984 | ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word); | ||
| 985 | if (ret) | ||
| 986 | return pcibios_err_to_errno(ret); | ||
| 987 | |||
| 988 | if ((word & PCI_EXP_FLAGS_VERS) == 1) | ||
| 989 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; | ||
| 990 | else { | ||
| 991 | vdev->extended_caps = true; | ||
| 992 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; | ||
| 993 | } | ||
| 994 | case PCI_CAP_ID_HT: | ||
| 995 | ret = pci_read_config_byte(pdev, pos + 3, &byte); | ||
| 996 | if (ret) | ||
| 997 | return pcibios_err_to_errno(ret); | ||
| 998 | |||
| 999 | return (byte & HT_3BIT_CAP_MASK) ? | ||
| 1000 | HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG; | ||
| 1001 | case PCI_CAP_ID_SATA: | ||
| 1002 | ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte); | ||
| 1003 | if (ret) | ||
| 1004 | return pcibios_err_to_errno(ret); | ||
| 1005 | |||
| 1006 | byte &= PCI_SATA_REGS_MASK; | ||
| 1007 | if (byte == PCI_SATA_REGS_INLINE) | ||
| 1008 | return PCI_SATA_SIZEOF_LONG; | ||
| 1009 | else | ||
| 1010 | return PCI_SATA_SIZEOF_SHORT; | ||
| 1011 | default: | ||
| 1012 | pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n", | ||
| 1013 | dev_name(&pdev->dev), __func__, cap, pos); | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | return 0; | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) | ||
| 1020 | { | ||
| 1021 | struct pci_dev *pdev = vdev->pdev; | ||
| 1022 | u8 byte; | ||
| 1023 | u32 dword; | ||
| 1024 | int ret; | ||
| 1025 | |||
| 1026 | switch (ecap) { | ||
| 1027 | case PCI_EXT_CAP_ID_VNDR: | ||
| 1028 | ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); | ||
| 1029 | if (ret) | ||
| 1030 | return pcibios_err_to_errno(ret); | ||
| 1031 | |||
| 1032 | return dword >> PCI_VSEC_HDR_LEN_SHIFT; | ||
| 1033 | case PCI_EXT_CAP_ID_VC: | ||
| 1034 | case PCI_EXT_CAP_ID_VC9: | ||
| 1035 | case PCI_EXT_CAP_ID_MFVC: | ||
| 1036 | return vfio_vc_cap_len(vdev, epos); | ||
| 1037 | case PCI_EXT_CAP_ID_ACS: | ||
| 1038 | ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte); | ||
| 1039 | if (ret) | ||
| 1040 | return pcibios_err_to_errno(ret); | ||
| 1041 | |||
| 1042 | if (byte & PCI_ACS_EC) { | ||
| 1043 | int bits; | ||
| 1044 | |||
| 1045 | ret = pci_read_config_byte(pdev, | ||
| 1046 | epos + PCI_ACS_EGRESS_BITS, | ||
| 1047 | &byte); | ||
| 1048 | if (ret) | ||
| 1049 | return pcibios_err_to_errno(ret); | ||
| 1050 | |||
| 1051 | bits = byte ? round_up(byte, 32) : 256; | ||
| 1052 | return 8 + (bits / 8); | ||
| 1053 | } | ||
| 1054 | return 8; | ||
| 1055 | |||
| 1056 | case PCI_EXT_CAP_ID_REBAR: | ||
| 1057 | ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte); | ||
| 1058 | if (ret) | ||
| 1059 | return pcibios_err_to_errno(ret); | ||
| 1060 | |||
| 1061 | byte &= PCI_REBAR_CTRL_NBAR_MASK; | ||
| 1062 | byte >>= PCI_REBAR_CTRL_NBAR_SHIFT; | ||
| 1063 | |||
| 1064 | return 4 + (byte * 8); | ||
| 1065 | case PCI_EXT_CAP_ID_DPA: | ||
| 1066 | ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte); | ||
| 1067 | if (ret) | ||
| 1068 | return pcibios_err_to_errno(ret); | ||
| 1069 | |||
| 1070 | byte &= PCI_DPA_CAP_SUBSTATE_MASK; | ||
| 1071 | byte = round_up(byte + 1, 4); | ||
| 1072 | return PCI_DPA_BASE_SIZEOF + byte; | ||
| 1073 | case PCI_EXT_CAP_ID_TPH: | ||
| 1074 | ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword); | ||
| 1075 | if (ret) | ||
| 1076 | return pcibios_err_to_errno(ret); | ||
| 1077 | |||
| 1078 | if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) { | ||
| 1079 | int sts; | ||
| 1080 | |||
| 1081 | sts = byte & PCI_TPH_CAP_ST_MASK; | ||
| 1082 | sts >>= PCI_TPH_CAP_ST_SHIFT; | ||
| 1083 | return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4); | ||
| 1084 | } | ||
| 1085 | return PCI_TPH_BASE_SIZEOF; | ||
| 1086 | default: | ||
| 1087 | pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n", | ||
| 1088 | dev_name(&pdev->dev), __func__, ecap, epos); | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | return 0; | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, | ||
| 1095 | int offset, int size) | ||
| 1096 | { | ||
| 1097 | struct pci_dev *pdev = vdev->pdev; | ||
| 1098 | int ret = 0; | ||
| 1099 | |||
| 1100 | /* | ||
| 1101 | * We try to read physical config space in the largest chunks | ||
| 1102 | * we can, assuming that all of the fields support dword access. | ||
| 1103 | * pci_save_state() makes this same assumption and seems to do ok. | ||
| 1104 | */ | ||
| 1105 | while (size) { | ||
| 1106 | int filled; | ||
| 1107 | |||
| 1108 | if (size >= 4 && !(offset % 4)) { | ||
| 1109 | __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; | ||
| 1110 | u32 dword; | ||
| 1111 | |||
| 1112 | ret = pci_read_config_dword(pdev, offset, &dword); | ||
| 1113 | if (ret) | ||
| 1114 | return ret; | ||
| 1115 | *dwordp = cpu_to_le32(dword); | ||
| 1116 | filled = 4; | ||
| 1117 | } else if (size >= 2 && !(offset % 2)) { | ||
| 1118 | __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; | ||
| 1119 | u16 word; | ||
| 1120 | |||
| 1121 | ret = pci_read_config_word(pdev, offset, &word); | ||
| 1122 | if (ret) | ||
| 1123 | return ret; | ||
| 1124 | *wordp = cpu_to_le16(word); | ||
| 1125 | filled = 2; | ||
| 1126 | } else { | ||
| 1127 | u8 *byte = &vdev->vconfig[offset]; | ||
| 1128 | ret = pci_read_config_byte(pdev, offset, byte); | ||
| 1129 | if (ret) | ||
| 1130 | return ret; | ||
| 1131 | filled = 1; | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | offset += filled; | ||
| 1135 | size -= filled; | ||
| 1136 | } | ||
| 1137 | |||
| 1138 | return ret; | ||
| 1139 | } | ||
| 1140 | |||
| 1141 | static int vfio_cap_init(struct vfio_pci_device *vdev) | ||
| 1142 | { | ||
| 1143 | struct pci_dev *pdev = vdev->pdev; | ||
| 1144 | u8 *map = vdev->pci_config_map; | ||
| 1145 | u16 status; | ||
| 1146 | u8 pos, *prev, cap; | ||
| 1147 | int loops, ret, caps = 0; | ||
| 1148 | |||
| 1149 | /* Any capabilities? */ | ||
| 1150 | ret = pci_read_config_word(pdev, PCI_STATUS, &status); | ||
| 1151 | if (ret) | ||
| 1152 | return ret; | ||
| 1153 | |||
| 1154 | if (!(status & PCI_STATUS_CAP_LIST)) | ||
| 1155 | return 0; /* Done */ | ||
| 1156 | |||
| 1157 | ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); | ||
| 1158 | if (ret) | ||
| 1159 | return ret; | ||
| 1160 | |||
| 1161 | /* Mark the previous position in case we want to skip a capability */ | ||
| 1162 | prev = &vdev->vconfig[PCI_CAPABILITY_LIST]; | ||
| 1163 | |||
| 1164 | /* We can bound our loop, capabilities are dword aligned */ | ||
| 1165 | loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF; | ||
| 1166 | while (pos && loops--) { | ||
| 1167 | u8 next; | ||
| 1168 | int i, len = 0; | ||
| 1169 | |||
| 1170 | ret = pci_read_config_byte(pdev, pos, &cap); | ||
| 1171 | if (ret) | ||
| 1172 | return ret; | ||
| 1173 | |||
| 1174 | ret = pci_read_config_byte(pdev, | ||
| 1175 | pos + PCI_CAP_LIST_NEXT, &next); | ||
| 1176 | if (ret) | ||
| 1177 | return ret; | ||
| 1178 | |||
| 1179 | if (cap <= PCI_CAP_ID_MAX) { | ||
| 1180 | len = pci_cap_length[cap]; | ||
| 1181 | if (len == 0xFF) { /* Variable length */ | ||
| 1182 | len = vfio_cap_len(vdev, cap, pos); | ||
| 1183 | if (len < 0) | ||
| 1184 | return len; | ||
| 1185 | } | ||
| 1186 | } | ||
| 1187 | |||
| 1188 | if (!len) { | ||
| 1189 | pr_info("%s: %s hiding cap 0x%x\n", | ||
| 1190 | __func__, dev_name(&pdev->dev), cap); | ||
| 1191 | *prev = next; | ||
| 1192 | pos = next; | ||
| 1193 | continue; | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | /* Sanity check, do we overlap other capabilities? */ | ||
| 1197 | for (i = 0; i < len; i += 4) { | ||
| 1198 | if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) | ||
| 1199 | continue; | ||
| 1200 | |||
| 1201 | pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", | ||
| 1202 | __func__, dev_name(&pdev->dev), | ||
| 1203 | pos + i, map[pos + i], cap); | ||
| 1204 | } | ||
| 1205 | |||
| 1206 | memset(map + (pos / 4), cap, len / 4); | ||
| 1207 | ret = vfio_fill_vconfig_bytes(vdev, pos, len); | ||
| 1208 | if (ret) | ||
| 1209 | return ret; | ||
| 1210 | |||
| 1211 | prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT]; | ||
| 1212 | pos = next; | ||
| 1213 | caps++; | ||
| 1214 | } | ||
| 1215 | |||
| 1216 | /* If we didn't fill any capabilities, clear the status flag */ | ||
| 1217 | if (!caps) { | ||
| 1218 | __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS]; | ||
| 1219 | *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST); | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | return 0; | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | static int vfio_ecap_init(struct vfio_pci_device *vdev) | ||
| 1226 | { | ||
| 1227 | struct pci_dev *pdev = vdev->pdev; | ||
| 1228 | u8 *map = vdev->pci_config_map; | ||
| 1229 | u16 epos; | ||
| 1230 | __le32 *prev = NULL; | ||
| 1231 | int loops, ret, ecaps = 0; | ||
| 1232 | |||
| 1233 | if (!vdev->extended_caps) | ||
| 1234 | return 0; | ||
| 1235 | |||
| 1236 | epos = PCI_CFG_SPACE_SIZE; | ||
| 1237 | |||
| 1238 | loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; | ||
| 1239 | |||
| 1240 | while (loops-- && epos >= PCI_CFG_SPACE_SIZE) { | ||
| 1241 | u32 header; | ||
| 1242 | u16 ecap; | ||
| 1243 | int i, len = 0; | ||
| 1244 | bool hidden = false; | ||
| 1245 | |||
| 1246 | ret = pci_read_config_dword(pdev, epos, &header); | ||
| 1247 | if (ret) | ||
| 1248 | return ret; | ||
| 1249 | |||
| 1250 | ecap = PCI_EXT_CAP_ID(header); | ||
| 1251 | |||
| 1252 | if (ecap <= PCI_EXT_CAP_ID_MAX) { | ||
| 1253 | len = pci_ext_cap_length[ecap]; | ||
| 1254 | if (len == 0xFF) { | ||
| 1255 | len = vfio_ext_cap_len(vdev, ecap, epos); | ||
| 1256 | if (len < 0) | ||
| 1257 | return ret; | ||
| 1258 | } | ||
| 1259 | } | ||
| 1260 | |||
| 1261 | if (!len) { | ||
| 1262 | pr_info("%s: %s hiding ecap 0x%x@0x%x\n", | ||
| 1263 | __func__, dev_name(&pdev->dev), ecap, epos); | ||
| 1264 | |||
| 1265 | /* If not the first in the chain, we can skip over it */ | ||
| 1266 | if (prev) { | ||
| 1267 | u32 val = epos = PCI_EXT_CAP_NEXT(header); | ||
| 1268 | *prev &= cpu_to_le32(~(0xffcU << 20)); | ||
| 1269 | *prev |= cpu_to_le32(val << 20); | ||
| 1270 | continue; | ||
| 1271 | } | ||
| 1272 | |||
| 1273 | /* | ||
| 1274 | * Otherwise, fill in a placeholder, the direct | ||
| 1275 | * readfn will virtualize this automatically | ||
| 1276 | */ | ||
| 1277 | len = PCI_CAP_SIZEOF; | ||
| 1278 | hidden = true; | ||
| 1279 | } | ||
| 1280 | |||
| 1281 | for (i = 0; i < len; i += 4) { | ||
| 1282 | if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) | ||
| 1283 | continue; | ||
| 1284 | |||
| 1285 | pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", | ||
| 1286 | __func__, dev_name(&pdev->dev), | ||
| 1287 | epos + i, map[epos + i], ecap); | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | /* | ||
| 1291 | * Even though ecap IDs are 2 bytes, we're currently a long way | ||
| 1292 | * from exceeding 1-byte capability IDs. If we ever make it | ||
| 1293 | * up to 0xFF we'll need to widen this map to two bytes per entry. | ||
| 1294 | */ | ||
| 1295 | BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); | ||
| 1296 | |||
| 1297 | memset(map + (epos / 4), ecap, len / 4); | ||
| 1298 | ret = vfio_fill_vconfig_bytes(vdev, epos, len); | ||
| 1299 | if (ret) | ||
| 1300 | return ret; | ||
| 1301 | |||
| 1302 | /* | ||
| 1303 | * If we're just using this capability to anchor the list, | ||
| 1304 | * hide the real ID. Only count real ecaps. XXX PCI spec | ||
| 1305 | * indicates to use cap id = 0, version = 0, next = 0 if | ||
| 1306 | * ecaps are absent, hope users check all the way to next. | ||
| 1307 | */ | ||
| 1308 | if (hidden) | ||
| 1309 | *(__le32 *)&vdev->vconfig[epos] &= | ||
| 1310 | cpu_to_le32((0xffcU << 20)); | ||
| 1311 | else | ||
| 1312 | ecaps++; | ||
| 1313 | |||
| 1314 | prev = (__le32 *)&vdev->vconfig[epos]; | ||
| 1315 | epos = PCI_EXT_CAP_NEXT(header); | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | if (!ecaps) | ||
| 1319 | *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; | ||
| 1320 | |||
| 1321 | return 0; | ||
| 1322 | } | ||
| 1323 | |||
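
The next-pointer surgery above relies on the PCIe extended capability header layout: capability ID in bits 15:0, version in bits 19:16, and the dword-aligned next offset in bits 31:20. A short sketch of the decoding (equivalent to the PCI_EXT_CAP_ID/VER/NEXT macros in linux/pci_regs.h); clearing the in-place pointer is then hdr &= ~(0xffcU << 20) and setting it is hdr |= next << 20, as done above:

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t ecap_id(uint32_t header)      { return header & 0xffff; }
    static uint8_t  ecap_version(uint32_t header) { return (header >> 16) & 0xf; }
    static uint16_t ecap_next(uint32_t header)    { return (header >> 20) & 0xffc; }

    int main(void)
    {
            /* Hypothetical AER header: next cap at 0x148, version 1, ID 0x0001 */
            uint32_t header = 0x14810001;

            printf("id=0x%04x ver=%u next=0x%03x\n",
                   (unsigned)ecap_id(header),
                   (unsigned)ecap_version(header),
                   (unsigned)ecap_next(header));
            /* prints: id=0x0001 ver=1 next=0x148 */
            return 0;
    }
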
| 1324 | /* | ||
| 1325 | * For each device we allocate a pci_config_map that indicates the | ||
| 1326 | * capability occupying each dword and thus the struct perm_bits we | ||
| 1327 | * use for read and write. We also allocate a virtualized config | ||
| 1328 | * space which tracks reads and writes to bits that we emulate for | ||
| 1329 | * the user. Initial values filled from device. | ||
| 1330 | * | ||
| 1331 | * Using shared struct perm_bits between all vfio-pci devices saves | ||
| 1332 | * us from allocating cfg_size buffers for virt and write for every | ||
| 1333 | * device. We could remove vconfig and allocate individual buffers | ||
| 1334 | * for each area requiring emulated bits, but the array of pointers | ||
| 1335 | * would be comparable in size (at least for standard config space). | ||
| 1336 | */ | ||
| 1337 | int vfio_config_init(struct vfio_pci_device *vdev) | ||
| 1338 | { | ||
| 1339 | struct pci_dev *pdev = vdev->pdev; | ||
| 1340 | u8 *map, *vconfig; | ||
| 1341 | int ret; | ||
| 1342 | |||
| 1343 | /* | ||
| 1344 | * Config space, caps and ecaps are all dword aligned, so we can | ||
| 1345 | * use one byte per dword to record the type. | ||
| 1346 | */ | ||
| 1347 | map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); | ||
| 1348 | if (!map) | ||
| 1349 | return -ENOMEM; | ||
| 1350 | |||
| 1351 | vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL); | ||
| 1352 | if (!vconfig) { | ||
| 1353 | kfree(map); | ||
| 1354 | return -ENOMEM; | ||
| 1355 | } | ||
| 1356 | |||
| 1357 | vdev->pci_config_map = map; | ||
| 1358 | vdev->vconfig = vconfig; | ||
| 1359 | |||
| 1360 | memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); | ||
| 1361 | memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, | ||
| 1362 | (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); | ||
| 1363 | |||
| 1364 | ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); | ||
| 1365 | if (ret) | ||
| 1366 | goto out; | ||
| 1367 | |||
| 1368 | vdev->bardirty = true; | ||
| 1369 | |||
| 1370 | /* | ||
| 1371 | * XXX can we just pci_load_saved_state/pci_restore_state? | ||
| 1372 | * may need to rebuild vconfig after that | ||
| 1373 | */ | ||
| 1374 | |||
| 1375 | /* For restore after reset */ | ||
| 1376 | vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]); | ||
| 1377 | vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]); | ||
| 1378 | vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]); | ||
| 1379 | vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]); | ||
| 1380 | vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]); | ||
| 1381 | vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]); | ||
| 1382 | vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]); | ||
| 1383 | |||
| 1384 | if (pdev->is_virtfn) { | ||
| 1385 | *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor); | ||
| 1386 | *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device); | ||
| 1387 | } | ||
| 1388 | |||
| 1389 | ret = vfio_cap_init(vdev); | ||
| 1390 | if (ret) | ||
| 1391 | goto out; | ||
| 1392 | |||
| 1393 | ret = vfio_ecap_init(vdev); | ||
| 1394 | if (ret) | ||
| 1395 | goto out; | ||
| 1396 | |||
| 1397 | return 0; | ||
| 1398 | |||
| 1399 | out: | ||
| 1400 | kfree(map); | ||
| 1401 | vdev->pci_config_map = NULL; | ||
| 1402 | kfree(vconfig); | ||
| 1403 | vdev->vconfig = NULL; | ||
| 1404 | return pcibios_err_to_errno(ret); | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | void vfio_config_free(struct vfio_pci_device *vdev) | ||
| 1408 | { | ||
| 1409 | kfree(vdev->vconfig); | ||
| 1410 | vdev->vconfig = NULL; | ||
| 1411 | kfree(vdev->pci_config_map); | ||
| 1412 | vdev->pci_config_map = NULL; | ||
| 1413 | kfree(vdev->msi_perm); | ||
| 1414 | vdev->msi_perm = NULL; | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, | ||
| 1418 | size_t count, loff_t *ppos, bool iswrite) | ||
| 1419 | { | ||
| 1420 | struct pci_dev *pdev = vdev->pdev; | ||
| 1421 | struct perm_bits *perm; | ||
| 1422 | __le32 val = 0; | ||
| 1423 | int cap_start = 0, offset; | ||
| 1424 | u8 cap_id; | ||
| 1425 | ssize_t ret = count; | ||
| 1426 | |||
| 1427 | if (*ppos < 0 || *ppos + count > pdev->cfg_size) | ||
| 1428 | return -EFAULT; | ||
| 1429 | |||
| 1430 | /* | ||
| 1431 | * gcc can't seem to figure out we're a static function, only called | ||
| 1432 | * with count of 1/2/4 and hits copy_from_user_overflow without this. | ||
| 1433 | */ | ||
| 1434 | if (count > sizeof(val)) | ||
| 1435 | return -EINVAL; | ||
| 1436 | |||
| 1437 | cap_id = vdev->pci_config_map[*ppos / 4]; | ||
| 1438 | |||
| 1439 | if (cap_id == PCI_CAP_ID_INVALID) { | ||
| 1440 | if (iswrite) | ||
| 1441 | return ret; /* drop */ | ||
| 1442 | |||
| 1443 | /* | ||
| 1444 | * Per PCI spec 3.0, section 6.1, reads from reserved and | ||
| 1445 | * unimplemented registers return 0 | ||
| 1446 | */ | ||
| 1447 | if (copy_to_user(buf, &val, count)) | ||
| 1448 | return -EFAULT; | ||
| 1449 | |||
| 1450 | return ret; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | /* | ||
| 1454 | * All capabilities are minimum 4 bytes and aligned on dword | ||
| 1455 | * boundaries. Since we don't support unaligned accesses, we're | ||
| 1456 | * only ever accessing a single capability. | ||
| 1457 | */ | ||
| 1458 | if (*ppos >= PCI_CFG_SPACE_SIZE) { | ||
| 1459 | WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); | ||
| 1460 | |||
| 1461 | perm = &ecap_perms[cap_id]; | ||
| 1462 | cap_start = vfio_find_cap_start(vdev, *ppos); | ||
| 1463 | |||
| 1464 | } else { | ||
| 1465 | WARN_ON(cap_id > PCI_CAP_ID_MAX); | ||
| 1466 | |||
| 1467 | perm = &cap_perms[cap_id]; | ||
| 1468 | |||
| 1469 | if (cap_id == PCI_CAP_ID_MSI) | ||
| 1470 | perm = vdev->msi_perm; | ||
| 1471 | |||
| 1472 | if (cap_id > PCI_CAP_ID_BASIC) | ||
| 1473 | cap_start = vfio_find_cap_start(vdev, *ppos); | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); | ||
| 1477 | WARN_ON(cap_start > *ppos); | ||
| 1478 | |||
| 1479 | offset = *ppos - cap_start; | ||
| 1480 | |||
| 1481 | if (iswrite) { | ||
| 1482 | if (!perm->writefn) | ||
| 1483 | return ret; | ||
| 1484 | |||
| 1485 | if (copy_from_user(&val, buf, count)) | ||
| 1486 | return -EFAULT; | ||
| 1487 | |||
| 1488 | ret = perm->writefn(vdev, *ppos, count, perm, offset, val); | ||
| 1489 | } else { | ||
| 1490 | if (perm->readfn) { | ||
| 1491 | ret = perm->readfn(vdev, *ppos, count, | ||
| 1492 | perm, offset, &val); | ||
| 1493 | if (ret < 0) | ||
| 1494 | return ret; | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | if (copy_to_user(buf, &val, count)) | ||
| 1498 | return -EFAULT; | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | return ret; | ||
| 1502 | } | ||
| 1503 | |||
| 1504 | ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, | ||
| 1505 | char __user *buf, size_t count, | ||
| 1506 | loff_t *ppos, bool iswrite) | ||
| 1507 | { | ||
| 1508 | size_t done = 0; | ||
| 1509 | int ret = 0; | ||
| 1510 | loff_t pos = *ppos; | ||
| 1511 | |||
| 1512 | pos &= VFIO_PCI_OFFSET_MASK; | ||
| 1513 | |||
| 1514 | /* | ||
| 1515 | * We want to both keep the access size the caller uses and | ||
| 1516 | * support reading large chunks of config space in a single call. | ||
| 1517 | * PCI doesn't support unaligned accesses, so we can safely break | ||
| 1518 | * those apart. | ||
| 1519 | */ | ||
| 1520 | while (count) { | ||
| 1521 | if (count >= 4 && !(pos % 4)) | ||
| 1522 | ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite); | ||
| 1523 | else if (count >= 2 && !(pos % 2)) | ||
| 1524 | ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite); | ||
| 1525 | else | ||
| 1526 | ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite); | ||
| 1527 | |||
| 1528 | if (ret < 0) | ||
| 1529 | return ret; | ||
| 1530 | |||
| 1531 | count -= ret; | ||
| 1532 | done += ret; | ||
| 1533 | buf += ret; | ||
| 1534 | pos += ret; | ||
| 1535 | } | ||
| 1536 | |||
| 1537 | *ppos += done; | ||
| 1538 | |||
| 1539 | return done; | ||
| 1540 | } | ||
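
For context, here is a userspace sketch of how this path is reached. It assumes a device fd already obtained through the VFIO group/container API and a little-endian host; read_ids is a hypothetical helper. The config region offset comes from VFIO_DEVICE_GET_REGION_INFO, and an ordinary pread() at that offset ends up in vfio_pci_config_readwrite() above:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/vfio.h>

    int read_ids(int device_fd)
    {
            struct vfio_region_info info = { .argsz = sizeof(info) };
            uint32_t ids;

            info.index = VFIO_PCI_CONFIG_REGION_INDEX;
            if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
                    return -1;

            /* Offset 0 of config space: vendor ID (low 16), device ID (high 16) */
            if (pread(device_fd, &ids, sizeof(ids), info.offset) != sizeof(ids))
                    return -1;

            printf("vendor %04x device %04x\n",
                   (unsigned)(ids & 0xffff), (unsigned)(ids >> 16));
            return 0;
    }
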
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c new file mode 100644 index 000000000000..211a4920b88a --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_intrs.c | |||
| @@ -0,0 +1,740 @@ | |||
| 1 | /* | ||
| 2 | * VFIO PCI interrupt handling | ||
| 3 | * | ||
| 4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * Derived from original vfio: | ||
| 12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 13 | * Author: Tom Lyon, pugs@cisco.com | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/device.h> | ||
| 17 | #include <linux/interrupt.h> | ||
| 18 | #include <linux/eventfd.h> | ||
| 19 | #include <linux/pci.h> | ||
| 20 | #include <linux/file.h> | ||
| 21 | #include <linux/poll.h> | ||
| 22 | #include <linux/vfio.h> | ||
| 23 | #include <linux/wait.h> | ||
| 24 | #include <linux/workqueue.h> | ||
| 25 | |||
| 26 | #include "vfio_pci_private.h" | ||
| 27 | |||
| 28 | /* | ||
| 29 | * IRQfd - generic | ||
| 30 | */ | ||
| 31 | struct virqfd { | ||
| 32 | struct vfio_pci_device *vdev; | ||
| 33 | struct eventfd_ctx *eventfd; | ||
| 34 | int (*handler)(struct vfio_pci_device *, void *); | ||
| 35 | void (*thread)(struct vfio_pci_device *, void *); | ||
| 36 | void *data; | ||
| 37 | struct work_struct inject; | ||
| 38 | wait_queue_t wait; | ||
| 39 | poll_table pt; | ||
| 40 | struct work_struct shutdown; | ||
| 41 | struct virqfd **pvirqfd; | ||
| 42 | }; | ||
| 43 | |||
| 44 | static struct workqueue_struct *vfio_irqfd_cleanup_wq; | ||
| 45 | |||
| 46 | int __init vfio_pci_virqfd_init(void) | ||
| 47 | { | ||
| 48 | vfio_irqfd_cleanup_wq = | ||
| 49 | create_singlethread_workqueue("vfio-irqfd-cleanup"); | ||
| 50 | if (!vfio_irqfd_cleanup_wq) | ||
| 51 | return -ENOMEM; | ||
| 52 | |||
| 53 | return 0; | ||
| 54 | } | ||
| 55 | |||
| 56 | void vfio_pci_virqfd_exit(void) | ||
| 57 | { | ||
| 58 | destroy_workqueue(vfio_irqfd_cleanup_wq); | ||
| 59 | } | ||
| 60 | |||
| 61 | static void virqfd_deactivate(struct virqfd *virqfd) | ||
| 62 | { | ||
| 63 | queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); | ||
| 64 | } | ||
| 65 | |||
| 66 | static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
| 67 | { | ||
| 68 | struct virqfd *virqfd = container_of(wait, struct virqfd, wait); | ||
| 69 | unsigned long flags = (unsigned long)key; | ||
| 70 | |||
| 71 | if (flags & POLLIN) { | ||
| 72 | /* An event has been signaled, call function */ | ||
| 73 | if ((!virqfd->handler || | ||
| 74 | virqfd->handler(virqfd->vdev, virqfd->data)) && | ||
| 75 | virqfd->thread) | ||
| 76 | schedule_work(&virqfd->inject); | ||
| 77 | } | ||
| 78 | |||
| 79 | if (flags & POLLHUP) | ||
| 80 | /* The eventfd is closing, detach from VFIO */ | ||
| 81 | virqfd_deactivate(virqfd); | ||
| 82 | |||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | |||
| 86 | static void virqfd_ptable_queue_proc(struct file *file, | ||
| 87 | wait_queue_head_t *wqh, poll_table *pt) | ||
| 88 | { | ||
| 89 | struct virqfd *virqfd = container_of(pt, struct virqfd, pt); | ||
| 90 | add_wait_queue(wqh, &virqfd->wait); | ||
| 91 | } | ||
| 92 | |||
| 93 | static void virqfd_shutdown(struct work_struct *work) | ||
| 94 | { | ||
| 95 | struct virqfd *virqfd = container_of(work, struct virqfd, shutdown); | ||
| 96 | struct virqfd **pvirqfd = virqfd->pvirqfd; | ||
| 97 | u64 cnt; | ||
| 98 | |||
| 99 | eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt); | ||
| 100 | flush_work(&virqfd->inject); | ||
| 101 | eventfd_ctx_put(virqfd->eventfd); | ||
| 102 | |||
| 103 | kfree(virqfd); | ||
| 104 | *pvirqfd = NULL; | ||
| 105 | } | ||
| 106 | |||
| 107 | static void virqfd_inject(struct work_struct *work) | ||
| 108 | { | ||
| 109 | struct virqfd *virqfd = container_of(work, struct virqfd, inject); | ||
| 110 | if (virqfd->thread) | ||
| 111 | virqfd->thread(virqfd->vdev, virqfd->data); | ||
| 112 | } | ||
| 113 | |||
| 114 | static int virqfd_enable(struct vfio_pci_device *vdev, | ||
| 115 | int (*handler)(struct vfio_pci_device *, void *), | ||
| 116 | void (*thread)(struct vfio_pci_device *, void *), | ||
| 117 | void *data, struct virqfd **pvirqfd, int fd) | ||
| 118 | { | ||
| 119 | struct file *file = NULL; | ||
| 120 | struct eventfd_ctx *ctx = NULL; | ||
| 121 | struct virqfd *virqfd; | ||
| 122 | int ret = 0; | ||
| 123 | unsigned int events; | ||
| 124 | |||
| 125 | if (*pvirqfd) | ||
| 126 | return -EBUSY; | ||
| 127 | |||
| 128 | virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); | ||
| 129 | if (!virqfd) | ||
| 130 | return -ENOMEM; | ||
| 131 | |||
| 132 | virqfd->pvirqfd = pvirqfd; | ||
| 133 | *pvirqfd = virqfd; | ||
| 134 | virqfd->vdev = vdev; | ||
| 135 | virqfd->handler = handler; | ||
| 136 | virqfd->thread = thread; | ||
| 137 | virqfd->data = data; | ||
| 138 | |||
| 139 | INIT_WORK(&virqfd->shutdown, virqfd_shutdown); | ||
| 140 | INIT_WORK(&virqfd->inject, virqfd_inject); | ||
| 141 | |||
| 142 | file = eventfd_fget(fd); | ||
| 143 | if (IS_ERR(file)) { | ||
| 144 | ret = PTR_ERR(file); | ||
| 145 | goto fail; | ||
| 146 | } | ||
| 147 | |||
| 148 | ctx = eventfd_ctx_fileget(file); | ||
| 149 | if (IS_ERR(ctx)) { | ||
| 150 | ret = PTR_ERR(ctx); | ||
| 151 | goto fail; | ||
| 152 | } | ||
| 153 | |||
| 154 | virqfd->eventfd = ctx; | ||
| 155 | |||
| 156 | /* | ||
| 157 | * Install our own custom wake-up handling so we are notified via | ||
| 158 | * a callback whenever someone signals the underlying eventfd. | ||
| 159 | */ | ||
| 160 | init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); | ||
| 161 | init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); | ||
| 162 | |||
| 163 | events = file->f_op->poll(file, &virqfd->pt); | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Check if there was an event already pending on the eventfd | ||
| 167 | * before we registered and trigger it as if we didn't miss it. | ||
| 168 | */ | ||
| 169 | if (events & POLLIN) { | ||
| 170 | if ((!handler || handler(vdev, data)) && thread) | ||
| 171 | schedule_work(&virqfd->inject); | ||
| 172 | } | ||
| 173 | |||
| 174 | /* | ||
| 175 | * Do not drop the file until the irqfd is fully initialized, | ||
| 176 | * otherwise we might race against the POLLHUP. | ||
| 177 | */ | ||
| 178 | fput(file); | ||
| 179 | |||
| 180 | return 0; | ||
| 181 | |||
| 182 | fail: | ||
| 183 | if (ctx && !IS_ERR(ctx)) | ||
| 184 | eventfd_ctx_put(ctx); | ||
| 185 | |||
| 186 | if (file && !IS_ERR(file)) | ||
| 187 | fput(file); | ||
| 188 | |||
| 189 | kfree(virqfd); | ||
| 190 | *pvirqfd = NULL; | ||
| 191 | |||
| 192 | return ret; | ||
| 193 | } | ||
| 194 | |||
| 195 | static void virqfd_disable(struct virqfd *virqfd) | ||
| 196 | { | ||
| 197 | if (!virqfd) | ||
| 198 | return; | ||
| 199 | |||
| 200 | virqfd_deactivate(virqfd); | ||
| 201 | |||
| 202 | /* Block until we know all outstanding shutdown jobs have completed. */ | ||
| 203 | flush_workqueue(vfio_irqfd_cleanup_wq); | ||
| 204 | } | ||
| 205 | |||
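
On the other side of this mechanism sits an ordinary eventfd created by userspace and handed to the driver through the VFIO interrupt ioctls. A minimal sketch (error handling mostly omitted) of creating one and consuming a signal:

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            int efd = eventfd(0, EFD_CLOEXEC);
            uint64_t count;

            /* ... pass efd to the driver via the VFIO interrupt ioctls ... */

            /* Blocks until the kernel signals the eventfd, then drains the count */
            if (read(efd, &count, sizeof(count)) == sizeof(count))
                    printf("received %llu event(s)\n", (unsigned long long)count);

            close(efd);
            return 0;
    }
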
| 206 | /* | ||
| 207 | * INTx | ||
| 208 | */ | ||
| 209 | static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused) | ||
| 210 | { | ||
| 211 | if (likely(is_intx(vdev) && !vdev->virq_disabled)) | ||
| 212 | eventfd_signal(vdev->ctx[0].trigger, 1); | ||
| 213 | } | ||
| 214 | |||
| 215 | void vfio_pci_intx_mask(struct vfio_pci_device *vdev) | ||
| 216 | { | ||
| 217 | struct pci_dev *pdev = vdev->pdev; | ||
| 218 | unsigned long flags; | ||
| 219 | |||
| 220 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Masking can come from interrupt, ioctl, or config space | ||
| 224 | * via INTx disable. The latter means this can get called | ||
| 225 | * even when not using intx delivery. In this case, just | ||
| 226 | * try to have the physical bit follow the virtual bit. | ||
| 227 | */ | ||
| 228 | if (unlikely(!is_intx(vdev))) { | ||
| 229 | if (vdev->pci_2_3) | ||
| 230 | pci_intx(pdev, 0); | ||
| 231 | } else if (!vdev->ctx[0].masked) { | ||
| 232 | /* | ||
| 233 | * Can't use check_and_mask here because we always want to | ||
| 234 | * mask, not just when something is pending. | ||
| 235 | */ | ||
| 236 | if (vdev->pci_2_3) | ||
| 237 | pci_intx(pdev, 0); | ||
| 238 | else | ||
| 239 | disable_irq_nosync(pdev->irq); | ||
| 240 | |||
| 241 | vdev->ctx[0].masked = true; | ||
| 242 | } | ||
| 243 | |||
| 244 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
| 245 | } | ||
| 246 | |||
| 247 | /* | ||
| 248 | * If this is triggered by an eventfd, we can't call eventfd_signal | ||
| 249 | * or else we'll deadlock on the eventfd wait queue. Return >0 when | ||
| 250 | * a signal is necessary, which can then be handled via a work queue | ||
| 251 | * or directly depending on the caller. | ||
| 252 | */ | ||
| 253 | int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) | ||
| 254 | { | ||
| 255 | struct pci_dev *pdev = vdev->pdev; | ||
| 256 | unsigned long flags; | ||
| 257 | int ret = 0; | ||
| 258 | |||
| 259 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
| 260 | |||
| 261 | /* | ||
| 262 | * Unmasking comes from ioctl or config, so again, have the | ||
| 263 | * physical bit follow the virtual even when not using INTx. | ||
| 264 | */ | ||
| 265 | if (unlikely(!is_intx(vdev))) { | ||
| 266 | if (vdev->pci_2_3) | ||
| 267 | pci_intx(pdev, 1); | ||
| 268 | } else if (vdev->ctx[0].masked && !vdev->virq_disabled) { | ||
| 269 | /* | ||
| 270 | * A pending interrupt here would immediately trigger, | ||
| 271 | * but we can avoid that overhead by just re-sending | ||
| 272 | * the interrupt to the user. | ||
| 273 | */ | ||
| 274 | if (vdev->pci_2_3) { | ||
| 275 | if (!pci_check_and_unmask_intx(pdev)) | ||
| 276 | ret = 1; | ||
| 277 | } else | ||
| 278 | enable_irq(pdev->irq); | ||
| 279 | |||
| 280 | vdev->ctx[0].masked = (ret > 0); | ||
| 281 | } | ||
| 282 | |||
| 283 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
| 284 | |||
| 285 | return ret; | ||
| 286 | } | ||
| 287 | |||
| 288 | void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) | ||
| 289 | { | ||
| 290 | if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) | ||
| 291 | vfio_send_intx_eventfd(vdev, NULL); | ||
| 292 | } | ||
| 293 | |||
| 294 | static irqreturn_t vfio_intx_handler(int irq, void *dev_id) | ||
| 295 | { | ||
| 296 | struct vfio_pci_device *vdev = dev_id; | ||
| 297 | unsigned long flags; | ||
| 298 | int ret = IRQ_NONE; | ||
| 299 | |||
| 300 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
| 301 | |||
| 302 | if (!vdev->pci_2_3) { | ||
| 303 | disable_irq_nosync(vdev->pdev->irq); | ||
| 304 | vdev->ctx[0].masked = true; | ||
| 305 | ret = IRQ_HANDLED; | ||
| 306 | } else if (!vdev->ctx[0].masked && /* may be shared */ | ||
| 307 | pci_check_and_mask_intx(vdev->pdev)) { | ||
| 308 | vdev->ctx[0].masked = true; | ||
| 309 | ret = IRQ_HANDLED; | ||
| 310 | } | ||
| 311 | |||
| 312 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
| 313 | |||
| 314 | if (ret == IRQ_HANDLED) | ||
| 315 | vfio_send_intx_eventfd(vdev, NULL); | ||
| 316 | |||
| 317 | return ret; | ||
| 318 | } | ||
| 319 | |||
| 320 | static int vfio_intx_enable(struct vfio_pci_device *vdev) | ||
| 321 | { | ||
| 322 | if (!is_irq_none(vdev)) | ||
| 323 | return -EINVAL; | ||
| 324 | |||
| 325 | if (!vdev->pdev->irq) | ||
| 326 | return -ENODEV; | ||
| 327 | |||
| 328 | vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); | ||
| 329 | if (!vdev->ctx) | ||
| 330 | return -ENOMEM; | ||
| 331 | |||
| 332 | vdev->num_ctx = 1; | ||
| 333 | vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; | ||
| 334 | |||
| 335 | return 0; | ||
| 336 | } | ||
| 337 | |||
| 338 | static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) | ||
| 339 | { | ||
| 340 | struct pci_dev *pdev = vdev->pdev; | ||
| 341 | unsigned long irqflags = IRQF_SHARED; | ||
| 342 | struct eventfd_ctx *trigger; | ||
| 343 | unsigned long flags; | ||
| 344 | int ret; | ||
| 345 | |||
| 346 | if (vdev->ctx[0].trigger) { | ||
| 347 | free_irq(pdev->irq, vdev); | ||
| 348 | kfree(vdev->ctx[0].name); | ||
| 349 | eventfd_ctx_put(vdev->ctx[0].trigger); | ||
| 350 | vdev->ctx[0].trigger = NULL; | ||
| 351 | } | ||
| 352 | |||
| 353 | if (fd < 0) /* Disable only */ | ||
| 354 | return 0; | ||
| 355 | |||
| 356 | vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)", | ||
| 357 | pci_name(pdev)); | ||
| 358 | if (!vdev->ctx[0].name) | ||
| 359 | return -ENOMEM; | ||
| 360 | |||
| 361 | trigger = eventfd_ctx_fdget(fd); | ||
| 362 | if (IS_ERR(trigger)) { | ||
| 363 | kfree(vdev->ctx[0].name); | ||
| 364 | return PTR_ERR(trigger); | ||
| 365 | } | ||
| 366 | |||
| 367 | if (!vdev->pci_2_3) | ||
| 368 | irqflags = 0; | ||
| 369 | |||
| 370 | ret = request_irq(pdev->irq, vfio_intx_handler, | ||
| 371 | irqflags, vdev->ctx[0].name, vdev); | ||
| 372 | if (ret) { | ||
| 373 | kfree(vdev->ctx[0].name); | ||
| 374 | eventfd_ctx_put(trigger); | ||
| 375 | return ret; | ||
| 376 | } | ||
| 377 | |||
| 378 | vdev->ctx[0].trigger = trigger; | ||
| 379 | |||
| 380 | /* | ||
| 381 | * INTx disable will stick across the new irq setup, | ||
| 382 | * disable_irq won't. | ||
| 383 | */ | ||
| 384 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
| 385 | if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled)) | ||
| 386 | disable_irq_nosync(pdev->irq); | ||
| 387 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
| 388 | |||
| 389 | return 0; | ||
| 390 | } | ||
| 391 | |||
| 392 | static void vfio_intx_disable(struct vfio_pci_device *vdev) | ||
| 393 | { | ||
| 394 | vfio_intx_set_signal(vdev, -1); | ||
| 395 | virqfd_disable(vdev->ctx[0].unmask); | ||
| 396 | virqfd_disable(vdev->ctx[0].mask); | ||
| 397 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
| 398 | vdev->num_ctx = 0; | ||
| 399 | kfree(vdev->ctx); | ||
| 400 | } | ||
| 401 | |||
| 402 | /* | ||
| 403 | * MSI/MSI-X | ||
| 404 | */ | ||
| 405 | static irqreturn_t vfio_msihandler(int irq, void *arg) | ||
| 406 | { | ||
| 407 | struct eventfd_ctx *trigger = arg; | ||
| 408 | |||
| 409 | eventfd_signal(trigger, 1); | ||
| 410 | return IRQ_HANDLED; | ||
| 411 | } | ||
| 412 | |||
| 413 | static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) | ||
| 414 | { | ||
| 415 | struct pci_dev *pdev = vdev->pdev; | ||
| 416 | int ret; | ||
| 417 | |||
| 418 | if (!is_irq_none(vdev)) | ||
| 419 | return -EINVAL; | ||
| 420 | |||
| 421 | vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); | ||
| 422 | if (!vdev->ctx) | ||
| 423 | return -ENOMEM; | ||
| 424 | |||
| 425 | if (msix) { | ||
| 426 | int i; | ||
| 427 | |||
| 428 | vdev->msix = kzalloc(nvec * sizeof(struct msix_entry), | ||
| 429 | GFP_KERNEL); | ||
| 430 | if (!vdev->msix) { | ||
| 431 | kfree(vdev->ctx); | ||
| 432 | return -ENOMEM; | ||
| 433 | } | ||
| 434 | |||
| 435 | for (i = 0; i < nvec; i++) | ||
| 436 | vdev->msix[i].entry = i; | ||
| 437 | |||
| 438 | ret = pci_enable_msix(pdev, vdev->msix, nvec); | ||
| 439 | if (ret) { | ||
| 440 | kfree(vdev->msix); | ||
| 441 | kfree(vdev->ctx); | ||
| 442 | return ret; | ||
| 443 | } | ||
| 444 | } else { | ||
| 445 | ret = pci_enable_msi_block(pdev, nvec); | ||
| 446 | if (ret) { | ||
| 447 | kfree(vdev->ctx); | ||
| 448 | return ret; | ||
| 449 | } | ||
| 450 | } | ||
| 451 | |||
| 452 | vdev->num_ctx = nvec; | ||
| 453 | vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX : | ||
| 454 | VFIO_PCI_MSI_IRQ_INDEX; | ||
| 455 | |||
| 456 | if (!msix) { | ||
| 457 | /* | ||
| 458 | * Compute the virtual hardware field for max msi vectors - | ||
| 459 | * it is the ceiling of log base 2 of the number of vectors. | ||
| 460 | */ | ||
| 461 | vdev->msi_qmax = fls(nvec * 2 - 1) - 1; | ||
| 462 | } | ||
| 463 | |||
| 464 | return 0; | ||
| 465 | } | ||
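As a side note on the expression above: fls(nvec * 2 - 1) - 1 evaluates to the ceiling of log2(nvec), i.e. the power-of-two encoding used by the MSI Multiple Message fields that the config space emulation virtualizes. A few worked values (illustrative, not part of the patch):

	/* nvec = 1:  fls(1)  - 1 = 0   ->  2^0 = 1 vector   */
	/* nvec = 3:  fls(5)  - 1 = 2   ->  2^2 = 4 >= 3     */
	/* nvec = 8:  fls(15) - 1 = 3   ->  2^3 = 8 vectors  */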
| 466 | |||
| 467 | static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, | ||
| 468 | int vector, int fd, bool msix) | ||
| 469 | { | ||
| 470 | struct pci_dev *pdev = vdev->pdev; | ||
| 471 | int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; | ||
| 472 | char *name = msix ? "vfio-msix" : "vfio-msi"; | ||
| 473 | struct eventfd_ctx *trigger; | ||
| 474 | int ret; | ||
| 475 | |||
| 476 | if (vector >= vdev->num_ctx) | ||
| 477 | return -EINVAL; | ||
| 478 | |||
| 479 | if (vdev->ctx[vector].trigger) { | ||
| 480 | free_irq(irq, vdev->ctx[vector].trigger); | ||
| 481 | kfree(vdev->ctx[vector].name); | ||
| 482 | eventfd_ctx_put(vdev->ctx[vector].trigger); | ||
| 483 | vdev->ctx[vector].trigger = NULL; | ||
| 484 | } | ||
| 485 | |||
| 486 | if (fd < 0) | ||
| 487 | return 0; | ||
| 488 | |||
| 489 | vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)", | ||
| 490 | name, vector, pci_name(pdev)); | ||
| 491 | if (!vdev->ctx[vector].name) | ||
| 492 | return -ENOMEM; | ||
| 493 | |||
| 494 | trigger = eventfd_ctx_fdget(fd); | ||
| 495 | if (IS_ERR(trigger)) { | ||
| 496 | kfree(vdev->ctx[vector].name); | ||
| 497 | return PTR_ERR(trigger); | ||
| 498 | } | ||
| 499 | |||
| 500 | ret = request_irq(irq, vfio_msihandler, 0, | ||
| 501 | vdev->ctx[vector].name, trigger); | ||
| 502 | if (ret) { | ||
| 503 | kfree(vdev->ctx[vector].name); | ||
| 504 | eventfd_ctx_put(trigger); | ||
| 505 | return ret; | ||
| 506 | } | ||
| 507 | |||
| 508 | vdev->ctx[vector].trigger = trigger; | ||
| 509 | |||
| 510 | return 0; | ||
| 511 | } | ||
| 512 | |||
| 513 | static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, | ||
| 514 | unsigned count, int32_t *fds, bool msix) | ||
| 515 | { | ||
| 516 | int i, j, ret = 0; | ||
| 517 | |||
| 518 | if (start + count > vdev->num_ctx) | ||
| 519 | return -EINVAL; | ||
| 520 | |||
| 521 | for (i = 0, j = start; i < count && !ret; i++, j++) { | ||
| 522 | int fd = fds ? fds[i] : -1; | ||
| 523 | ret = vfio_msi_set_vector_signal(vdev, j, fd, msix); | ||
| 524 | } | ||
| 525 | |||
| 526 | if (ret) { | ||
| 527 | for (--j; j >= (int)start; j--) | ||
| 528 | vfio_msi_set_vector_signal(vdev, j, -1, msix); | ||
| 529 | } | ||
| 530 | |||
| 531 | return ret; | ||
| 532 | } | ||
| 533 | |||
| 534 | static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) | ||
| 535 | { | ||
| 536 | struct pci_dev *pdev = vdev->pdev; | ||
| 537 | int i; | ||
| 538 | |||
| 539 | vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix); | ||
| 540 | |||
| 541 | for (i = 0; i < vdev->num_ctx; i++) { | ||
| 542 | virqfd_disable(vdev->ctx[i].unmask); | ||
| 543 | virqfd_disable(vdev->ctx[i].mask); | ||
| 544 | } | ||
| 545 | |||
| 546 | if (msix) { | ||
| 547 | pci_disable_msix(vdev->pdev); | ||
| 548 | kfree(vdev->msix); | ||
| 549 | } else | ||
| 550 | pci_disable_msi(pdev); | ||
| 551 | |||
| 552 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
| 553 | vdev->num_ctx = 0; | ||
| 554 | kfree(vdev->ctx); | ||
| 555 | } | ||
| 556 | |||
| 557 | /* | ||
| 558 | * IOCTL support | ||
| 559 | */ | ||
| 560 | static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, | ||
| 561 | unsigned index, unsigned start, | ||
| 562 | unsigned count, uint32_t flags, void *data) | ||
| 563 | { | ||
| 564 | if (!is_intx(vdev) || start != 0 || count != 1) | ||
| 565 | return -EINVAL; | ||
| 566 | |||
| 567 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
| 568 | vfio_pci_intx_unmask(vdev); | ||
| 569 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
| 570 | uint8_t unmask = *(uint8_t *)data; | ||
| 571 | if (unmask) | ||
| 572 | vfio_pci_intx_unmask(vdev); | ||
| 573 | } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
| 574 | int32_t fd = *(int32_t *)data; | ||
| 575 | if (fd >= 0) | ||
| 576 | return virqfd_enable(vdev, vfio_pci_intx_unmask_handler, | ||
| 577 | vfio_send_intx_eventfd, NULL, | ||
| 578 | &vdev->ctx[0].unmask, fd); | ||
| 579 | |||
| 580 | virqfd_disable(vdev->ctx[0].unmask); | ||
| 581 | } | ||
| 582 | |||
| 583 | return 0; | ||
| 584 | } | ||
| 585 | |||
| 586 | static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, | ||
| 587 | unsigned index, unsigned start, | ||
| 588 | unsigned count, uint32_t flags, void *data) | ||
| 589 | { | ||
| 590 | if (!is_intx(vdev) || start != 0 || count != 1) | ||
| 591 | return -EINVAL; | ||
| 592 | |||
| 593 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
| 594 | vfio_pci_intx_mask(vdev); | ||
| 595 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
| 596 | uint8_t mask = *(uint8_t *)data; | ||
| 597 | if (mask) | ||
| 598 | vfio_pci_intx_mask(vdev); | ||
| 599 | } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
| 600 | return -ENOTTY; /* XXX implement me */ | ||
| 601 | } | ||
| 602 | |||
| 603 | return 0; | ||
| 604 | } | ||
| 605 | |||
| 606 | static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, | ||
| 607 | unsigned index, unsigned start, | ||
| 608 | unsigned count, uint32_t flags, void *data) | ||
| 609 | { | ||
| 610 | if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
| 611 | vfio_intx_disable(vdev); | ||
| 612 | return 0; | ||
| 613 | } | ||
| 614 | |||
| 615 | if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1) | ||
| 616 | return -EINVAL; | ||
| 617 | |||
| 618 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
| 619 | int32_t fd = *(int32_t *)data; | ||
| 620 | int ret; | ||
| 621 | |||
| 622 | if (is_intx(vdev)) | ||
| 623 | return vfio_intx_set_signal(vdev, fd); | ||
| 624 | |||
| 625 | ret = vfio_intx_enable(vdev); | ||
| 626 | if (ret) | ||
| 627 | return ret; | ||
| 628 | |||
| 629 | ret = vfio_intx_set_signal(vdev, fd); | ||
| 630 | if (ret) | ||
| 631 | vfio_intx_disable(vdev); | ||
| 632 | |||
| 633 | return ret; | ||
| 634 | } | ||
| 635 | |||
| 636 | if (!is_intx(vdev)) | ||
| 637 | return -EINVAL; | ||
| 638 | |||
| 639 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
| 640 | vfio_send_intx_eventfd(vdev, NULL); | ||
| 641 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
| 642 | uint8_t trigger = *(uint8_t *)data; | ||
| 643 | if (trigger) | ||
| 644 | vfio_send_intx_eventfd(vdev, NULL); | ||
| 645 | } | ||
| 646 | return 0; | ||
| 647 | } | ||
| 648 | |||
| 649 | static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, | ||
| 650 | unsigned index, unsigned start, | ||
| 651 | unsigned count, uint32_t flags, void *data) | ||
| 652 | { | ||
| 653 | int i; | ||
| 654 | bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false; | ||
| 655 | |||
| 656 | if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
| 657 | vfio_msi_disable(vdev, msix); | ||
| 658 | return 0; | ||
| 659 | } | ||
| 660 | |||
| 661 | if (!(irq_is(vdev, index) || is_irq_none(vdev))) | ||
| 662 | return -EINVAL; | ||
| 663 | |||
| 664 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
| 665 | int32_t *fds = data; | ||
| 666 | int ret; | ||
| 667 | |||
| 668 | if (vdev->irq_type == index) | ||
| 669 | return vfio_msi_set_block(vdev, start, count, | ||
| 670 | fds, msix); | ||
| 671 | |||
| 672 | ret = vfio_msi_enable(vdev, start + count, msix); | ||
| 673 | if (ret) | ||
| 674 | return ret; | ||
| 675 | |||
| 676 | ret = vfio_msi_set_block(vdev, start, count, fds, msix); | ||
| 677 | if (ret) | ||
| 678 | vfio_msi_disable(vdev, msix); | ||
| 679 | |||
| 680 | return ret; | ||
| 681 | } | ||
| 682 | |||
| 683 | if (!irq_is(vdev, index) || start + count > vdev->num_ctx) | ||
| 684 | return -EINVAL; | ||
| 685 | |||
| 686 | for (i = start; i < start + count; i++) { | ||
| 687 | if (!vdev->ctx[i].trigger) | ||
| 688 | continue; | ||
| 689 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
| 690 | eventfd_signal(vdev->ctx[i].trigger, 1); | ||
| 691 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
| 692 | uint8_t *bools = data; | ||
| 693 | if (bools[i - start]) | ||
| 694 | eventfd_signal(vdev->ctx[i].trigger, 1); | ||
| 695 | } | ||
| 696 | } | ||
| 697 | return 0; | ||
| 698 | } | ||
| 699 | |||
| 700 | int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, | ||
| 701 | unsigned index, unsigned start, unsigned count, | ||
| 702 | void *data) | ||
| 703 | { | ||
| 704 | int (*func)(struct vfio_pci_device *vdev, unsigned index, | ||
| 705 | unsigned start, unsigned count, uint32_t flags, | ||
| 706 | void *data) = NULL; | ||
| 707 | |||
| 708 | switch (index) { | ||
| 709 | case VFIO_PCI_INTX_IRQ_INDEX: | ||
| 710 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
| 711 | case VFIO_IRQ_SET_ACTION_MASK: | ||
| 712 | func = vfio_pci_set_intx_mask; | ||
| 713 | break; | ||
| 714 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
| 715 | func = vfio_pci_set_intx_unmask; | ||
| 716 | break; | ||
| 717 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
| 718 | func = vfio_pci_set_intx_trigger; | ||
| 719 | break; | ||
| 720 | } | ||
| 721 | break; | ||
| 722 | case VFIO_PCI_MSI_IRQ_INDEX: | ||
| 723 | case VFIO_PCI_MSIX_IRQ_INDEX: | ||
| 724 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
| 725 | case VFIO_IRQ_SET_ACTION_MASK: | ||
| 726 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
| 727 | /* XXX Need masking support exported */ | ||
| 728 | break; | ||
| 729 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
| 730 | func = vfio_pci_set_msi_trigger; | ||
| 731 | break; | ||
| 732 | } | ||
| 733 | break; | ||
| 734 | } | ||
| 735 | |||
| 736 | if (!func) | ||
| 737 | return -ENOTTY; | ||
| 738 | |||
| 739 | return func(vdev, index, start, count, flags, data); | ||
| 740 | } | ||
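vfio_pci_set_irqs_ioctl() above is the kernel backend for the VFIO_DEVICE_SET_IRQS ioctl. A hedged sketch of the userspace side, assuming a device fd already obtained through the group interface (the helper name is illustrative and error handling is omitted):

	#include <stdint.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int wire_one_msi(int device)
	{
		struct vfio_irq_set *set;
		char buf[sizeof(*set) + sizeof(int32_t)];
		int32_t efd = eventfd(0, 0);

		set = (struct vfio_irq_set *)buf;
		set->argsz = sizeof(buf);
		set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
		set->index = VFIO_PCI_MSI_IRQ_INDEX;
		set->start = 0;
		set->count = 1;			/* enables MSI with a single vector */
		memcpy(set->data, &efd, sizeof(efd));

		if (ioctl(device, VFIO_DEVICE_SET_IRQS, set))
			return -1;

		return efd;			/* read() on this fd reports interrupts */
	}

Passing count = 0 with VFIO_IRQ_SET_DATA_NONE and the same trigger action tears the setup back down through vfio_msi_disable() above.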
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h new file mode 100644 index 000000000000..611827cba8cd --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_private.h | |||
| @@ -0,0 +1,91 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 3 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License version 2 as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * Derived from original vfio: | ||
| 10 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 11 | * Author: Tom Lyon, pugs@cisco.com | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/mutex.h> | ||
| 15 | #include <linux/pci.h> | ||
| 16 | |||
| 17 | #ifndef VFIO_PCI_PRIVATE_H | ||
| 18 | #define VFIO_PCI_PRIVATE_H | ||
| 19 | |||
| 20 | #define VFIO_PCI_OFFSET_SHIFT 40 | ||
| 21 | |||
| 22 | #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) | ||
| 23 | #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) | ||
| 24 | #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) | ||
| 25 | |||
| 26 | struct vfio_pci_irq_ctx { | ||
| 27 | struct eventfd_ctx *trigger; | ||
| 28 | struct virqfd *unmask; | ||
| 29 | struct virqfd *mask; | ||
| 30 | char *name; | ||
| 31 | bool masked; | ||
| 32 | }; | ||
| 33 | |||
| 34 | struct vfio_pci_device { | ||
| 35 | struct pci_dev *pdev; | ||
| 36 | void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; | ||
| 37 | u8 *pci_config_map; | ||
| 38 | u8 *vconfig; | ||
| 39 | struct perm_bits *msi_perm; | ||
| 40 | spinlock_t irqlock; | ||
| 41 | struct mutex igate; | ||
| 42 | struct msix_entry *msix; | ||
| 43 | struct vfio_pci_irq_ctx *ctx; | ||
| 44 | int num_ctx; | ||
| 45 | int irq_type; | ||
| 46 | u8 msi_qmax; | ||
| 47 | u8 msix_bar; | ||
| 48 | u16 msix_size; | ||
| 49 | u32 msix_offset; | ||
| 50 | u32 rbar[7]; | ||
| 51 | bool pci_2_3; | ||
| 52 | bool virq_disabled; | ||
| 53 | bool reset_works; | ||
| 54 | bool extended_caps; | ||
| 55 | bool bardirty; | ||
| 56 | struct pci_saved_state *pci_saved_state; | ||
| 57 | atomic_t refcnt; | ||
| 58 | }; | ||
| 59 | |||
| 60 | #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) | ||
| 61 | #define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) | ||
| 62 | #define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) | ||
| 63 | #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) | ||
| 64 | #define irq_is(vdev, type) (vdev->irq_type == type) | ||
| 65 | |||
| 66 | extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); | ||
| 67 | extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); | ||
| 68 | |||
| 69 | extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, | ||
| 70 | uint32_t flags, unsigned index, | ||
| 71 | unsigned start, unsigned count, void *data); | ||
| 72 | |||
| 73 | extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, | ||
| 74 | char __user *buf, size_t count, | ||
| 75 | loff_t *ppos, bool iswrite); | ||
| 76 | extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, | ||
| 77 | char __user *buf, size_t count, | ||
| 78 | loff_t *ppos, bool iswrite); | ||
| 79 | extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, | ||
| 80 | char __user *buf, size_t count, | ||
| 81 | loff_t *ppos, bool iswrite); | ||
| 82 | |||
| 83 | extern int vfio_pci_init_perm_bits(void); | ||
| 84 | extern void vfio_pci_uninit_perm_bits(void); | ||
| 85 | |||
| 86 | extern int vfio_pci_virqfd_init(void); | ||
| 87 | extern void vfio_pci_virqfd_exit(void); | ||
| 88 | |||
| 89 | extern int vfio_config_init(struct vfio_pci_device *vdev); | ||
| 90 | extern void vfio_config_free(struct vfio_pci_device *vdev); | ||
| 91 | #endif /* VFIO_PCI_PRIVATE_H */ | ||
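The offset macros above define how a single file offset on the device fd encodes both a region index (bits 40 and up) and a byte offset within that region (the low 40 bits); these are the same offsets the driver reports to userspace through region info. A purely illustrative sketch of the encoding, using region index constants from linux/vfio.h:

	u64 off   = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR2_REGION_INDEX) + 0x10;
	int index = VFIO_PCI_OFFSET_TO_INDEX(off);	/* VFIO_PCI_BAR2_REGION_INDEX */
	u64 pos   = off & VFIO_PCI_OFFSET_MASK;		/* 0x10, byte offset within BAR 2 */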
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c new file mode 100644 index 000000000000..4362d9e7baa3 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_rdwr.c | |||
| @@ -0,0 +1,269 @@ | |||
| 1 | /* | ||
| 2 | * VFIO PCI I/O Port & MMIO access | ||
| 3 | * | ||
| 4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * Derived from original vfio: | ||
| 12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 13 | * Author: Tom Lyon, pugs@cisco.com | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/fs.h> | ||
| 17 | #include <linux/pci.h> | ||
| 18 | #include <linux/uaccess.h> | ||
| 19 | #include <linux/io.h> | ||
| 20 | |||
| 21 | #include "vfio_pci_private.h" | ||
| 22 | |||
| 23 | /* I/O Port BAR access */ | ||
| 24 | ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf, | ||
| 25 | size_t count, loff_t *ppos, bool iswrite) | ||
| 26 | { | ||
| 27 | struct pci_dev *pdev = vdev->pdev; | ||
| 28 | loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; | ||
| 29 | int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
| 30 | void __iomem *io; | ||
| 31 | size_t done = 0; | ||
| 32 | |||
| 33 | if (!pci_resource_start(pdev, bar)) | ||
| 34 | return -EINVAL; | ||
| 35 | |||
| 36 | if (pos + count > pci_resource_len(pdev, bar)) | ||
| 37 | return -EINVAL; | ||
| 38 | |||
| 39 | if (!vdev->barmap[bar]) { | ||
| 40 | int ret; | ||
| 41 | |||
| 42 | ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); | ||
| 43 | if (ret) | ||
| 44 | return ret; | ||
| 45 | |||
| 46 | vdev->barmap[bar] = pci_iomap(pdev, bar, 0); | ||
| 47 | |||
| 48 | if (!vdev->barmap[bar]) { | ||
| 49 | pci_release_selected_regions(pdev, 1 << bar); | ||
| 50 | return -EINVAL; | ||
| 51 | } | ||
| 52 | } | ||
| 53 | |||
| 54 | io = vdev->barmap[bar]; | ||
| 55 | |||
| 56 | while (count) { | ||
| 57 | int filled; | ||
| 58 | |||
| 59 | if (count >= 3 && !(pos % 4)) { | ||
| 60 | __le32 val; | ||
| 61 | |||
| 62 | if (iswrite) { | ||
| 63 | if (copy_from_user(&val, buf, 4)) | ||
| 64 | return -EFAULT; | ||
| 65 | |||
| 66 | iowrite32(le32_to_cpu(val), io + pos); | ||
| 67 | } else { | ||
| 68 | val = cpu_to_le32(ioread32(io + pos)); | ||
| 69 | |||
| 70 | if (copy_to_user(buf, &val, 4)) | ||
| 71 | return -EFAULT; | ||
| 72 | } | ||
| 73 | |||
| 74 | filled = 4; | ||
| 75 | |||
| 76 | } else if ((pos % 2) == 0 && count >= 2) { | ||
| 77 | __le16 val; | ||
| 78 | |||
| 79 | if (iswrite) { | ||
| 80 | if (copy_from_user(&val, buf, 2)) | ||
| 81 | return -EFAULT; | ||
| 82 | |||
| 83 | iowrite16(le16_to_cpu(val), io + pos); | ||
| 84 | } else { | ||
| 85 | val = cpu_to_le16(ioread16(io + pos)); | ||
| 86 | |||
| 87 | if (copy_to_user(buf, &val, 2)) | ||
| 88 | return -EFAULT; | ||
| 89 | } | ||
| 90 | |||
| 91 | filled = 2; | ||
| 92 | } else { | ||
| 93 | u8 val; | ||
| 94 | |||
| 95 | if (iswrite) { | ||
| 96 | if (copy_from_user(&val, buf, 1)) | ||
| 97 | return -EFAULT; | ||
| 98 | |||
| 99 | iowrite8(val, io + pos); | ||
| 100 | } else { | ||
| 101 | val = ioread8(io + pos); | ||
| 102 | |||
| 103 | if (copy_to_user(buf, &val, 1)) | ||
| 104 | return -EFAULT; | ||
| 105 | } | ||
| 106 | |||
| 107 | filled = 1; | ||
| 108 | } | ||
| 109 | |||
| 110 | count -= filled; | ||
| 111 | done += filled; | ||
| 112 | buf += filled; | ||
| 113 | pos += filled; | ||
| 114 | } | ||
| 115 | |||
| 116 | *ppos += done; | ||
| 117 | |||
| 118 | return done; | ||
| 119 | } | ||
| 120 | |||
| 121 | /* | ||
| 122 | * MMIO BAR access | ||
| 123 | * We also handle two excluded ranges here: if the user reads the ROM | ||
| 124 | * beyond what PCI reports as available, or touches the MSI-X table region, | ||
| 125 | * reads return 0xFF and writes are dropped. | ||
| 126 | */ | ||
| 127 | ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf, | ||
| 128 | size_t count, loff_t *ppos, bool iswrite) | ||
| 129 | { | ||
| 130 | struct pci_dev *pdev = vdev->pdev; | ||
| 131 | loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; | ||
| 132 | int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
| 133 | void __iomem *io; | ||
| 134 | resource_size_t end; | ||
| 135 | size_t done = 0; | ||
| 136 | size_t x_start = 0, x_end = 0; /* excluded range */ | ||
| 137 | |||
| 138 | if (!pci_resource_start(pdev, bar)) | ||
| 139 | return -EINVAL; | ||
| 140 | |||
| 141 | end = pci_resource_len(pdev, bar); | ||
| 142 | |||
| 143 | if (pos > end) | ||
| 144 | return -EINVAL; | ||
| 145 | |||
| 146 | if (pos == end) | ||
| 147 | return 0; | ||
| 148 | |||
| 149 | if (pos + count > end) | ||
| 150 | count = end - pos; | ||
| 151 | |||
| 152 | if (bar == PCI_ROM_RESOURCE) { | ||
| 153 | io = pci_map_rom(pdev, &x_start); | ||
| 154 | x_end = end; | ||
| 155 | } else { | ||
| 156 | if (!vdev->barmap[bar]) { | ||
| 157 | int ret; | ||
| 158 | |||
| 159 | ret = pci_request_selected_regions(pdev, 1 << bar, | ||
| 160 | "vfio"); | ||
| 161 | if (ret) | ||
| 162 | return ret; | ||
| 163 | |||
| 164 | vdev->barmap[bar] = pci_iomap(pdev, bar, 0); | ||
| 165 | |||
| 166 | if (!vdev->barmap[bar]) { | ||
| 167 | pci_release_selected_regions(pdev, 1 << bar); | ||
| 168 | return -EINVAL; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | io = vdev->barmap[bar]; | ||
| 173 | |||
| 174 | if (bar == vdev->msix_bar) { | ||
| 175 | x_start = vdev->msix_offset; | ||
| 176 | x_end = vdev->msix_offset + vdev->msix_size; | ||
| 177 | } | ||
| 178 | } | ||
| 179 | |||
| 180 | if (!io) | ||
| 181 | return -EINVAL; | ||
| 182 | |||
| 183 | while (count) { | ||
| 184 | size_t fillable, filled; | ||
| 185 | |||
| 186 | if (pos < x_start) | ||
| 187 | fillable = x_start - pos; | ||
| 188 | else if (pos >= x_end) | ||
| 189 | fillable = end - pos; | ||
| 190 | else | ||
| 191 | fillable = 0; | ||
| 192 | |||
| 193 | if (fillable >= 4 && !(pos % 4) && (count >= 4)) { | ||
| 194 | __le32 val; | ||
| 195 | |||
| 196 | if (iswrite) { | ||
| 197 | if (copy_from_user(&val, buf, 4)) | ||
| 198 | goto out; | ||
| 199 | |||
| 200 | iowrite32(le32_to_cpu(val), io + pos); | ||
| 201 | } else { | ||
| 202 | val = cpu_to_le32(ioread32(io + pos)); | ||
| 203 | |||
| 204 | if (copy_to_user(buf, &val, 4)) | ||
| 205 | goto out; | ||
| 206 | } | ||
| 207 | |||
| 208 | filled = 4; | ||
| 209 | } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) { | ||
| 210 | __le16 val; | ||
| 211 | |||
| 212 | if (iswrite) { | ||
| 213 | if (copy_from_user(&val, buf, 2)) | ||
| 214 | goto out; | ||
| 215 | |||
| 216 | iowrite16(le16_to_cpu(val), io + pos); | ||
| 217 | } else { | ||
| 218 | val = cpu_to_le16(ioread16(io + pos)); | ||
| 219 | |||
| 220 | if (copy_to_user(buf, &val, 2)) | ||
| 221 | goto out; | ||
| 222 | } | ||
| 223 | |||
| 224 | filled = 2; | ||
| 225 | } else if (fillable) { | ||
| 226 | u8 val; | ||
| 227 | |||
| 228 | if (iswrite) { | ||
| 229 | if (copy_from_user(&val, buf, 1)) | ||
| 230 | goto out; | ||
| 231 | |||
| 232 | iowrite8(val, io + pos); | ||
| 233 | } else { | ||
| 234 | val = ioread8(io + pos); | ||
| 235 | |||
| 236 | if (copy_to_user(buf, &val, 1)) | ||
| 237 | goto out; | ||
| 238 | } | ||
| 239 | |||
| 240 | filled = 1; | ||
| 241 | } else { | ||
| 242 | /* Drop writes, fill reads with FF */ | ||
| 243 | if (!iswrite) { | ||
| 244 | char val = 0xFF; | ||
| 245 | size_t i; | ||
| 246 | |||
| 247 | for (i = 0; i < x_end - pos; i++) { | ||
| 248 | if (put_user(val, buf + i)) | ||
| 249 | goto out; | ||
| 250 | } | ||
| 251 | } | ||
| 252 | |||
| 253 | filled = x_end - pos; | ||
| 254 | } | ||
| 255 | |||
| 256 | count -= filled; | ||
| 257 | done += filled; | ||
| 258 | buf += filled; | ||
| 259 | pos += filled; | ||
| 260 | } | ||
| 261 | |||
| 262 | *ppos += done; | ||
| 263 | |||
| 264 | out: | ||
| 265 | if (bar == PCI_ROM_RESOURCE) | ||
| 266 | pci_unmap_rom(pdev, io); | ||
| 267 | |||
| 268 | return count ? -EFAULT : done; | ||
| 269 | } | ||
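These handlers back read() and write() on the device fd; userspace locates a BAR through VFIO_DEVICE_GET_REGION_INFO and then issues pread()/pwrite() at the reported offset. A rough sketch of a 32-bit MMIO read (illustrative helper, minimal error handling):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/vfio.h>

	static int read_bar0_dword(int device, uint64_t addr, uint32_t *val)
	{
		struct vfio_region_info info = {
			.argsz = sizeof(info),
			.index = VFIO_PCI_BAR0_REGION_INDEX,
		};

		if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &info))
			return -1;

		/* The kernel path above splits accesses into 4/2/1-byte MMIO ops. */
		if (pread(device, val, sizeof(*val), info.offset + addr) != sizeof(*val))
			return -1;

		return 0;
	}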
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c new file mode 100644 index 000000000000..9591e2b509d7 --- /dev/null +++ b/drivers/vfio/vfio.c | |||
| @@ -0,0 +1,1420 @@ | |||
| 1 | /* | ||
| 2 | * VFIO core | ||
| 3 | * | ||
| 4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * Derived from original vfio: | ||
| 12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 13 | * Author: Tom Lyon, pugs@cisco.com | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/cdev.h> | ||
| 17 | #include <linux/compat.h> | ||
| 18 | #include <linux/device.h> | ||
| 19 | #include <linux/file.h> | ||
| 20 | #include <linux/anon_inodes.h> | ||
| 21 | #include <linux/fs.h> | ||
| 22 | #include <linux/idr.h> | ||
| 23 | #include <linux/iommu.h> | ||
| 24 | #include <linux/list.h> | ||
| 25 | #include <linux/module.h> | ||
| 26 | #include <linux/mutex.h> | ||
| 27 | #include <linux/sched.h> | ||
| 28 | #include <linux/slab.h> | ||
| 29 | #include <linux/string.h> | ||
| 30 | #include <linux/uaccess.h> | ||
| 31 | #include <linux/vfio.h> | ||
| 32 | #include <linux/wait.h> | ||
| 33 | |||
| 34 | #define DRIVER_VERSION "0.3" | ||
| 35 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
| 36 | #define DRIVER_DESC "VFIO - User Level meta-driver" | ||
| 37 | |||
| 38 | static struct vfio { | ||
| 39 | struct class *class; | ||
| 40 | struct list_head iommu_drivers_list; | ||
| 41 | struct mutex iommu_drivers_lock; | ||
| 42 | struct list_head group_list; | ||
| 43 | struct idr group_idr; | ||
| 44 | struct mutex group_lock; | ||
| 45 | struct cdev group_cdev; | ||
| 46 | struct device *dev; | ||
| 47 | dev_t devt; | ||
| 48 | struct cdev cdev; | ||
| 49 | wait_queue_head_t release_q; | ||
| 50 | } vfio; | ||
| 51 | |||
| 52 | struct vfio_iommu_driver { | ||
| 53 | const struct vfio_iommu_driver_ops *ops; | ||
| 54 | struct list_head vfio_next; | ||
| 55 | }; | ||
| 56 | |||
| 57 | struct vfio_container { | ||
| 58 | struct kref kref; | ||
| 59 | struct list_head group_list; | ||
| 60 | struct mutex group_lock; | ||
| 61 | struct vfio_iommu_driver *iommu_driver; | ||
| 62 | void *iommu_data; | ||
| 63 | }; | ||
| 64 | |||
| 65 | struct vfio_group { | ||
| 66 | struct kref kref; | ||
| 67 | int minor; | ||
| 68 | atomic_t container_users; | ||
| 69 | struct iommu_group *iommu_group; | ||
| 70 | struct vfio_container *container; | ||
| 71 | struct list_head device_list; | ||
| 72 | struct mutex device_lock; | ||
| 73 | struct device *dev; | ||
| 74 | struct notifier_block nb; | ||
| 75 | struct list_head vfio_next; | ||
| 76 | struct list_head container_next; | ||
| 77 | }; | ||
| 78 | |||
| 79 | struct vfio_device { | ||
| 80 | struct kref kref; | ||
| 81 | struct device *dev; | ||
| 82 | const struct vfio_device_ops *ops; | ||
| 83 | struct vfio_group *group; | ||
| 84 | struct list_head group_next; | ||
| 85 | void *device_data; | ||
| 86 | }; | ||
| 87 | |||
| 88 | /** | ||
| 89 | * IOMMU driver registration | ||
| 90 | */ | ||
| 91 | int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) | ||
| 92 | { | ||
| 93 | struct vfio_iommu_driver *driver, *tmp; | ||
| 94 | |||
| 95 | driver = kzalloc(sizeof(*driver), GFP_KERNEL); | ||
| 96 | if (!driver) | ||
| 97 | return -ENOMEM; | ||
| 98 | |||
| 99 | driver->ops = ops; | ||
| 100 | |||
| 101 | mutex_lock(&vfio.iommu_drivers_lock); | ||
| 102 | |||
| 103 | /* Check for duplicates */ | ||
| 104 | list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { | ||
| 105 | if (tmp->ops == ops) { | ||
| 106 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 107 | kfree(driver); | ||
| 108 | return -EINVAL; | ||
| 109 | } | ||
| 110 | } | ||
| 111 | |||
| 112 | list_add(&driver->vfio_next, &vfio.iommu_drivers_list); | ||
| 113 | |||
| 114 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 115 | |||
| 116 | return 0; | ||
| 117 | } | ||
| 118 | EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); | ||
| 119 | |||
| 120 | void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) | ||
| 121 | { | ||
| 122 | struct vfio_iommu_driver *driver; | ||
| 123 | |||
| 124 | mutex_lock(&vfio.iommu_drivers_lock); | ||
| 125 | list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { | ||
| 126 | if (driver->ops == ops) { | ||
| 127 | list_del(&driver->vfio_next); | ||
| 128 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 129 | kfree(driver); | ||
| 130 | return; | ||
| 131 | } | ||
| 132 | } | ||
| 133 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 134 | } | ||
| 135 | EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); | ||
| 136 | |||
| 137 | /** | ||
| 138 | * Group minor allocation/free - both called with vfio.group_lock held | ||
| 139 | */ | ||
| 140 | static int vfio_alloc_group_minor(struct vfio_group *group) | ||
| 141 | { | ||
| 142 | int ret, minor; | ||
| 143 | |||
| 144 | again: | ||
| 145 | if (unlikely(idr_pre_get(&vfio.group_idr, GFP_KERNEL) == 0)) | ||
| 146 | return -ENOMEM; | ||
| 147 | |||
| 148 | /* index 0 is used by /dev/vfio/vfio */ | ||
| 149 | ret = idr_get_new_above(&vfio.group_idr, group, 1, &minor); | ||
| 150 | if (ret == -EAGAIN) | ||
| 151 | goto again; | ||
| 152 | if (ret || minor > MINORMASK) { | ||
| 153 | if (minor > MINORMASK) | ||
| 154 | idr_remove(&vfio.group_idr, minor); | ||
| 155 | return -ENOSPC; | ||
| 156 | } | ||
| 157 | |||
| 158 | return minor; | ||
| 159 | } | ||
| 160 | |||
| 161 | static void vfio_free_group_minor(int minor) | ||
| 162 | { | ||
| 163 | idr_remove(&vfio.group_idr, minor); | ||
| 164 | } | ||
| 165 | |||
| 166 | static int vfio_iommu_group_notifier(struct notifier_block *nb, | ||
| 167 | unsigned long action, void *data); | ||
| 168 | static void vfio_group_get(struct vfio_group *group); | ||
| 169 | |||
| 170 | /** | ||
| 171 | * Container objects - containers are created when /dev/vfio/vfio is | ||
| 172 | * opened, but their lifecycle extends until the last user is done, so | ||
| 173 | * they are freed via kref. Must support container/group/device being | ||
| 174 | * closed in any order. | ||
| 175 | */ | ||
| 176 | static void vfio_container_get(struct vfio_container *container) | ||
| 177 | { | ||
| 178 | kref_get(&container->kref); | ||
| 179 | } | ||
| 180 | |||
| 181 | static void vfio_container_release(struct kref *kref) | ||
| 182 | { | ||
| 183 | struct vfio_container *container; | ||
| 184 | container = container_of(kref, struct vfio_container, kref); | ||
| 185 | |||
| 186 | kfree(container); | ||
| 187 | } | ||
| 188 | |||
| 189 | static void vfio_container_put(struct vfio_container *container) | ||
| 190 | { | ||
| 191 | kref_put(&container->kref, vfio_container_release); | ||
| 192 | } | ||
| 193 | |||
| 194 | /** | ||
| 195 | * Group objects - create, release, get, put, search | ||
| 196 | */ | ||
| 197 | static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) | ||
| 198 | { | ||
| 199 | struct vfio_group *group, *tmp; | ||
| 200 | struct device *dev; | ||
| 201 | int ret, minor; | ||
| 202 | |||
| 203 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
| 204 | if (!group) | ||
| 205 | return ERR_PTR(-ENOMEM); | ||
| 206 | |||
| 207 | kref_init(&group->kref); | ||
| 208 | INIT_LIST_HEAD(&group->device_list); | ||
| 209 | mutex_init(&group->device_lock); | ||
| 210 | atomic_set(&group->container_users, 0); | ||
| 211 | group->iommu_group = iommu_group; | ||
| 212 | |||
| 213 | group->nb.notifier_call = vfio_iommu_group_notifier; | ||
| 214 | |||
| 215 | /* | ||
| 216 | * blocking notifiers acquire a rwsem around registering and hold | ||
| 217 | * it around callback. Therefore, need to register outside of | ||
| 218 | * vfio.group_lock to avoid A-B/B-A contention. Our callback won't | ||
| 219 | * do anything unless it can find the group in vfio.group_list, so | ||
| 220 | * no harm in registering early. | ||
| 221 | */ | ||
| 222 | ret = iommu_group_register_notifier(iommu_group, &group->nb); | ||
| 223 | if (ret) { | ||
| 224 | kfree(group); | ||
| 225 | return ERR_PTR(ret); | ||
| 226 | } | ||
| 227 | |||
| 228 | mutex_lock(&vfio.group_lock); | ||
| 229 | |||
| 230 | minor = vfio_alloc_group_minor(group); | ||
| 231 | if (minor < 0) { | ||
| 232 | mutex_unlock(&vfio.group_lock); | ||
| 233 | kfree(group); | ||
| 234 | return ERR_PTR(minor); | ||
| 235 | } | ||
| 236 | |||
| 237 | /* Did we race creating this group? */ | ||
| 238 | list_for_each_entry(tmp, &vfio.group_list, vfio_next) { | ||
| 239 | if (tmp->iommu_group == iommu_group) { | ||
| 240 | vfio_group_get(tmp); | ||
| 241 | vfio_free_group_minor(minor); | ||
| 242 | mutex_unlock(&vfio.group_lock); | ||
| 243 | kfree(group); | ||
| 244 | return tmp; | ||
| 245 | } | ||
| 246 | } | ||
| 247 | |||
| 248 | dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor), | ||
| 249 | group, "%d", iommu_group_id(iommu_group)); | ||
| 250 | if (IS_ERR(dev)) { | ||
| 251 | vfio_free_group_minor(minor); | ||
| 252 | mutex_unlock(&vfio.group_lock); | ||
| 253 | kfree(group); | ||
| 254 | return (struct vfio_group *)dev; /* ERR_PTR */ | ||
| 255 | } | ||
| 256 | |||
| 257 | group->minor = minor; | ||
| 258 | group->dev = dev; | ||
| 259 | |||
| 260 | list_add(&group->vfio_next, &vfio.group_list); | ||
| 261 | |||
| 262 | mutex_unlock(&vfio.group_lock); | ||
| 263 | |||
| 264 | return group; | ||
| 265 | } | ||
| 266 | |||
| 267 | static void vfio_group_release(struct kref *kref) | ||
| 268 | { | ||
| 269 | struct vfio_group *group = container_of(kref, struct vfio_group, kref); | ||
| 270 | |||
| 271 | WARN_ON(!list_empty(&group->device_list)); | ||
| 272 | |||
| 273 | device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor)); | ||
| 274 | list_del(&group->vfio_next); | ||
| 275 | vfio_free_group_minor(group->minor); | ||
| 276 | |||
| 277 | mutex_unlock(&vfio.group_lock); | ||
| 278 | |||
| 279 | /* | ||
| 280 | * Unregister outside of lock. A spurious callback is harmless now | ||
| 281 | * that the group is no longer in vfio.group_list. | ||
| 282 | */ | ||
| 283 | iommu_group_unregister_notifier(group->iommu_group, &group->nb); | ||
| 284 | |||
| 285 | kfree(group); | ||
| 286 | } | ||
| 287 | |||
| 288 | static void vfio_group_put(struct vfio_group *group) | ||
| 289 | { | ||
| 290 | mutex_lock(&vfio.group_lock); | ||
| 291 | /* | ||
| 292 | * Release needs to unlock to unregister the notifier, so only | ||
| 293 | * unlock if not released. | ||
| 294 | */ | ||
| 295 | if (!kref_put(&group->kref, vfio_group_release)) | ||
| 296 | mutex_unlock(&vfio.group_lock); | ||
| 297 | } | ||
| 298 | |||
| 299 | /* Assume group_lock or group reference is held */ | ||
| 300 | static void vfio_group_get(struct vfio_group *group) | ||
| 301 | { | ||
| 302 | kref_get(&group->kref); | ||
| 303 | } | ||
| 304 | |||
| 305 | /* | ||
| 306 | * Not really a try as we will sleep for mutex, but we need to make | ||
| 307 | * sure the group pointer is valid under lock and get a reference. | ||
| 308 | */ | ||
| 309 | static struct vfio_group *vfio_group_try_get(struct vfio_group *group) | ||
| 310 | { | ||
| 311 | struct vfio_group *target = group; | ||
| 312 | |||
| 313 | mutex_lock(&vfio.group_lock); | ||
| 314 | list_for_each_entry(group, &vfio.group_list, vfio_next) { | ||
| 315 | if (group == target) { | ||
| 316 | vfio_group_get(group); | ||
| 317 | mutex_unlock(&vfio.group_lock); | ||
| 318 | return group; | ||
| 319 | } | ||
| 320 | } | ||
| 321 | mutex_unlock(&vfio.group_lock); | ||
| 322 | |||
| 323 | return NULL; | ||
| 324 | } | ||
| 325 | |||
| 326 | static | ||
| 327 | struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) | ||
| 328 | { | ||
| 329 | struct vfio_group *group; | ||
| 330 | |||
| 331 | mutex_lock(&vfio.group_lock); | ||
| 332 | list_for_each_entry(group, &vfio.group_list, vfio_next) { | ||
| 333 | if (group->iommu_group == iommu_group) { | ||
| 334 | vfio_group_get(group); | ||
| 335 | mutex_unlock(&vfio.group_lock); | ||
| 336 | return group; | ||
| 337 | } | ||
| 338 | } | ||
| 339 | mutex_unlock(&vfio.group_lock); | ||
| 340 | |||
| 341 | return NULL; | ||
| 342 | } | ||
| 343 | |||
| 344 | static struct vfio_group *vfio_group_get_from_minor(int minor) | ||
| 345 | { | ||
| 346 | struct vfio_group *group; | ||
| 347 | |||
| 348 | mutex_lock(&vfio.group_lock); | ||
| 349 | group = idr_find(&vfio.group_idr, minor); | ||
| 350 | if (!group) { | ||
| 351 | mutex_unlock(&vfio.group_lock); | ||
| 352 | return NULL; | ||
| 353 | } | ||
| 354 | vfio_group_get(group); | ||
| 355 | mutex_unlock(&vfio.group_lock); | ||
| 356 | |||
| 357 | return group; | ||
| 358 | } | ||
| 359 | |||
| 360 | /** | ||
| 361 | * Device objects - create, release, get, put, search | ||
| 362 | */ | ||
| 363 | static | ||
| 364 | struct vfio_device *vfio_group_create_device(struct vfio_group *group, | ||
| 365 | struct device *dev, | ||
| 366 | const struct vfio_device_ops *ops, | ||
| 367 | void *device_data) | ||
| 368 | { | ||
| 369 | struct vfio_device *device; | ||
| 370 | int ret; | ||
| 371 | |||
| 372 | device = kzalloc(sizeof(*device), GFP_KERNEL); | ||
| 373 | if (!device) | ||
| 374 | return ERR_PTR(-ENOMEM); | ||
| 375 | |||
| 376 | kref_init(&device->kref); | ||
| 377 | device->dev = dev; | ||
| 378 | device->group = group; | ||
| 379 | device->ops = ops; | ||
| 380 | device->device_data = device_data; | ||
| 381 | |||
| 382 | ret = dev_set_drvdata(dev, device); | ||
| 383 | if (ret) { | ||
| 384 | kfree(device); | ||
| 385 | return ERR_PTR(ret); | ||
| 386 | } | ||
| 387 | |||
| 388 | /* No need to get group_lock, caller has group reference */ | ||
| 389 | vfio_group_get(group); | ||
| 390 | |||
| 391 | mutex_lock(&group->device_lock); | ||
| 392 | list_add(&device->group_next, &group->device_list); | ||
| 393 | mutex_unlock(&group->device_lock); | ||
| 394 | |||
| 395 | return device; | ||
| 396 | } | ||
| 397 | |||
| 398 | static void vfio_device_release(struct kref *kref) | ||
| 399 | { | ||
| 400 | struct vfio_device *device = container_of(kref, | ||
| 401 | struct vfio_device, kref); | ||
| 402 | struct vfio_group *group = device->group; | ||
| 403 | |||
| 404 | mutex_lock(&group->device_lock); | ||
| 405 | list_del(&device->group_next); | ||
| 406 | mutex_unlock(&group->device_lock); | ||
| 407 | |||
| 408 | dev_set_drvdata(device->dev, NULL); | ||
| 409 | |||
| 410 | kfree(device); | ||
| 411 | |||
| 412 | /* vfio_del_group_dev may be waiting for this device */ | ||
| 413 | wake_up(&vfio.release_q); | ||
| 414 | } | ||
| 415 | |||
| 416 | /* Device reference always implies a group reference */ | ||
| 417 | static void vfio_device_put(struct vfio_device *device) | ||
| 418 | { | ||
| 419 | kref_put(&device->kref, vfio_device_release); | ||
| 420 | vfio_group_put(device->group); | ||
| 421 | } | ||
| 422 | |||
| 423 | static void vfio_device_get(struct vfio_device *device) | ||
| 424 | { | ||
| 425 | vfio_group_get(device->group); | ||
| 426 | kref_get(&device->kref); | ||
| 427 | } | ||
| 428 | |||
| 429 | static struct vfio_device *vfio_group_get_device(struct vfio_group *group, | ||
| 430 | struct device *dev) | ||
| 431 | { | ||
| 432 | struct vfio_device *device; | ||
| 433 | |||
| 434 | mutex_lock(&group->device_lock); | ||
| 435 | list_for_each_entry(device, &group->device_list, group_next) { | ||
| 436 | if (device->dev == dev) { | ||
| 437 | vfio_device_get(device); | ||
| 438 | mutex_unlock(&group->device_lock); | ||
| 439 | return device; | ||
| 440 | } | ||
| 441 | } | ||
| 442 | mutex_unlock(&group->device_lock); | ||
| 443 | return NULL; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * Whitelist some drivers that we know are safe (no dma) or just sit on | ||
| 448 | * a device. It's not always practical to leave a device within a group | ||
| 449 | * driverless as it could get re-bound to something unsafe. | ||
| 450 | */ | ||
| 451 | static const char * const vfio_driver_whitelist[] = { "pci-stub" }; | ||
| 452 | |||
| 453 | static bool vfio_whitelisted_driver(struct device_driver *drv) | ||
| 454 | { | ||
| 455 | int i; | ||
| 456 | |||
| 457 | for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) { | ||
| 458 | if (!strcmp(drv->name, vfio_driver_whitelist[i])) | ||
| 459 | return true; | ||
| 460 | } | ||
| 461 | |||
| 462 | return false; | ||
| 463 | } | ||
| 464 | |||
| 465 | /* | ||
| 466 | * A vfio group is viable for use by userspace if all devices are either | ||
| 467 | * driver-less or bound to a vfio or whitelisted driver. We test the | ||
| 468 | * latter by the existence of a struct vfio_device matching the dev. | ||
| 469 | */ | ||
| 470 | static int vfio_dev_viable(struct device *dev, void *data) | ||
| 471 | { | ||
| 472 | struct vfio_group *group = data; | ||
| 473 | struct vfio_device *device; | ||
| 474 | |||
| 475 | if (!dev->driver || vfio_whitelisted_driver(dev->driver)) | ||
| 476 | return 0; | ||
| 477 | |||
| 478 | device = vfio_group_get_device(group, dev); | ||
| 479 | if (device) { | ||
| 480 | vfio_device_put(device); | ||
| 481 | return 0; | ||
| 482 | } | ||
| 483 | |||
| 484 | return -EINVAL; | ||
| 485 | } | ||
| 486 | |||
| 487 | /** | ||
| 488 | * Async device support | ||
| 489 | */ | ||
| 490 | static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) | ||
| 491 | { | ||
| 492 | struct vfio_device *device; | ||
| 493 | |||
| 494 | /* Do we already know about it? We shouldn't */ | ||
| 495 | device = vfio_group_get_device(group, dev); | ||
| 496 | if (WARN_ON_ONCE(device)) { | ||
| 497 | vfio_device_put(device); | ||
| 498 | return 0; | ||
| 499 | } | ||
| 500 | |||
| 501 | /* Nothing to do for idle groups */ | ||
| 502 | if (!atomic_read(&group->container_users)) | ||
| 503 | return 0; | ||
| 504 | |||
| 505 | /* TODO Prevent device auto probing */ | ||
| 506 | WARN(1, "Device %s added to live group %d!\n", dev_name(dev), | ||
| 507 | iommu_group_id(group->iommu_group)); | ||
| 508 | |||
| 509 | return 0; | ||
| 510 | } | ||
| 511 | |||
| 512 | static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev) | ||
| 513 | { | ||
| 514 | struct vfio_device *device; | ||
| 515 | |||
| 516 | /* | ||
| 517 | * Expect to fall out here. If a device was in use, it would | ||
| 518 | * have been bound to a vfio sub-driver, which would have blocked | ||
| 519 | * in .remove at vfio_del_group_dev. Sanity check that we no | ||
| 520 | * longer track the device, so it's safe to remove. | ||
| 521 | */ | ||
| 522 | device = vfio_group_get_device(group, dev); | ||
| 523 | if (likely(!device)) | ||
| 524 | return 0; | ||
| 525 | |||
| 526 | WARN(1, "Device %s removed from live group %d!\n", dev_name(dev), | ||
| 527 | iommu_group_id(group->iommu_group)); | ||
| 528 | |||
| 529 | vfio_device_put(device); | ||
| 530 | return 0; | ||
| 531 | } | ||
| 532 | |||
| 533 | static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev) | ||
| 534 | { | ||
| 535 | /* We don't care what happens when the group isn't in use */ | ||
| 536 | if (!atomic_read(&group->container_users)) | ||
| 537 | return 0; | ||
| 538 | |||
| 539 | return vfio_dev_viable(dev, group); | ||
| 540 | } | ||
| 541 | |||
| 542 | static int vfio_iommu_group_notifier(struct notifier_block *nb, | ||
| 543 | unsigned long action, void *data) | ||
| 544 | { | ||
| 545 | struct vfio_group *group = container_of(nb, struct vfio_group, nb); | ||
| 546 | struct device *dev = data; | ||
| 547 | |||
| 548 | /* | ||
| 549 | * Need to go through a group_lock lookup to get a reference or | ||
| 550 | * we risk racing a group being removed. Leave a WARN_ON for | ||
| 551 | * debugging, but if the group no longer exists, a spurious notify | ||
| 552 | * is harmless. | ||
| 553 | */ | ||
| 554 | group = vfio_group_try_get(group); | ||
| 555 | if (WARN_ON(!group)) | ||
| 556 | return NOTIFY_OK; | ||
| 557 | |||
| 558 | switch (action) { | ||
| 559 | case IOMMU_GROUP_NOTIFY_ADD_DEVICE: | ||
| 560 | vfio_group_nb_add_dev(group, dev); | ||
| 561 | break; | ||
| 562 | case IOMMU_GROUP_NOTIFY_DEL_DEVICE: | ||
| 563 | vfio_group_nb_del_dev(group, dev); | ||
| 564 | break; | ||
| 565 | case IOMMU_GROUP_NOTIFY_BIND_DRIVER: | ||
| 566 | pr_debug("%s: Device %s, group %d binding to driver\n", | ||
| 567 | __func__, dev_name(dev), | ||
| 568 | iommu_group_id(group->iommu_group)); | ||
| 569 | break; | ||
| 570 | case IOMMU_GROUP_NOTIFY_BOUND_DRIVER: | ||
| 571 | pr_debug("%s: Device %s, group %d bound to driver %s\n", | ||
| 572 | __func__, dev_name(dev), | ||
| 573 | iommu_group_id(group->iommu_group), dev->driver->name); | ||
| 574 | BUG_ON(vfio_group_nb_verify(group, dev)); | ||
| 575 | break; | ||
| 576 | case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER: | ||
| 577 | pr_debug("%s: Device %s, group %d unbinding from driver %s\n", | ||
| 578 | __func__, dev_name(dev), | ||
| 579 | iommu_group_id(group->iommu_group), dev->driver->name); | ||
| 580 | break; | ||
| 581 | case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER: | ||
| 582 | pr_debug("%s: Device %s, group %d unbound from driver\n", | ||
| 583 | __func__, dev_name(dev), | ||
| 584 | iommu_group_id(group->iommu_group)); | ||
| 585 | /* | ||
| 586 | * XXX An unbound device in a live group is ok, but we'd | ||
| 587 | * really like to avoid the above BUG_ON by preventing other | ||
| 588 | * drivers from binding to it. Once that occurs, we have to | ||
| 589 | * stop the system to maintain isolation. At a minimum, we'd | ||
| 590 | * want a toggle to disable driver auto probe for this device. | ||
| 591 | */ | ||
| 592 | break; | ||
| 593 | } | ||
| 594 | |||
| 595 | vfio_group_put(group); | ||
| 596 | return NOTIFY_OK; | ||
| 597 | } | ||
| 598 | |||
| 599 | /** | ||
| 600 | * VFIO driver API | ||
| 601 | */ | ||
| 602 | int vfio_add_group_dev(struct device *dev, | ||
| 603 | const struct vfio_device_ops *ops, void *device_data) | ||
| 604 | { | ||
| 605 | struct iommu_group *iommu_group; | ||
| 606 | struct vfio_group *group; | ||
| 607 | struct vfio_device *device; | ||
| 608 | |||
| 609 | iommu_group = iommu_group_get(dev); | ||
| 610 | if (!iommu_group) | ||
| 611 | return -EINVAL; | ||
| 612 | |||
| 613 | group = vfio_group_get_from_iommu(iommu_group); | ||
| 614 | if (!group) { | ||
| 615 | group = vfio_create_group(iommu_group); | ||
| 616 | if (IS_ERR(group)) { | ||
| 617 | iommu_group_put(iommu_group); | ||
| 618 | return PTR_ERR(group); | ||
| 619 | } | ||
| 620 | } | ||
| 621 | |||
| 622 | device = vfio_group_get_device(group, dev); | ||
| 623 | if (device) { | ||
| 624 | WARN(1, "Device %s already exists on group %d\n", | ||
| 625 | dev_name(dev), iommu_group_id(iommu_group)); | ||
| 626 | vfio_device_put(device); | ||
| 627 | vfio_group_put(group); | ||
| 628 | iommu_group_put(iommu_group); | ||
| 629 | return -EBUSY; | ||
| 630 | } | ||
| 631 | |||
| 632 | device = vfio_group_create_device(group, dev, ops, device_data); | ||
| 633 | if (IS_ERR(device)) { | ||
| 634 | vfio_group_put(group); | ||
| 635 | iommu_group_put(iommu_group); | ||
| 636 | return PTR_ERR(device); | ||
| 637 | } | ||
| 638 | |||
| 639 | /* | ||
| 640 | * Added device holds reference to iommu_group and vfio_device | ||
| 641 | * (which in turn holds reference to vfio_group). Drop extra | ||
| 642 | * group reference used while acquiring device. | ||
| 643 | */ | ||
| 644 | vfio_group_put(group); | ||
| 645 | |||
| 646 | return 0; | ||
| 647 | } | ||
| 648 | EXPORT_SYMBOL_GPL(vfio_add_group_dev); | ||
| 649 | |||
| 650 | /* Test whether a struct device is present in our tracking */ | ||
| 651 | static bool vfio_dev_present(struct device *dev) | ||
| 652 | { | ||
| 653 | struct iommu_group *iommu_group; | ||
| 654 | struct vfio_group *group; | ||
| 655 | struct vfio_device *device; | ||
| 656 | |||
| 657 | iommu_group = iommu_group_get(dev); | ||
| 658 | if (!iommu_group) | ||
| 659 | return false; | ||
| 660 | |||
| 661 | group = vfio_group_get_from_iommu(iommu_group); | ||
| 662 | if (!group) { | ||
| 663 | iommu_group_put(iommu_group); | ||
| 664 | return false; | ||
| 665 | } | ||
| 666 | |||
| 667 | device = vfio_group_get_device(group, dev); | ||
| 668 | if (!device) { | ||
| 669 | vfio_group_put(group); | ||
| 670 | iommu_group_put(iommu_group); | ||
| 671 | return false; | ||
| 672 | } | ||
| 673 | |||
| 674 | vfio_device_put(device); | ||
| 675 | vfio_group_put(group); | ||
| 676 | iommu_group_put(iommu_group); | ||
| 677 | return true; | ||
| 678 | } | ||
| 679 | |||
| 680 | /* | ||
| 681 | * Decrement the device reference count and wait for the device to be | ||
| 682 | * removed. Open file descriptors for the device block removal until released. */ | ||
| 683 | void *vfio_del_group_dev(struct device *dev) | ||
| 684 | { | ||
| 685 | struct vfio_device *device = dev_get_drvdata(dev); | ||
| 686 | struct vfio_group *group = device->group; | ||
| 687 | struct iommu_group *iommu_group = group->iommu_group; | ||
| 688 | void *device_data = device->device_data; | ||
| 689 | |||
| 690 | vfio_device_put(device); | ||
| 691 | |||
| 692 | /* TODO send a signal to encourage this to be released */ | ||
| 693 | wait_event(vfio.release_q, !vfio_dev_present(dev)); | ||
| 694 | |||
| 695 | iommu_group_put(iommu_group); | ||
| 696 | |||
| 697 | return device_data; | ||
| 698 | } | ||
| 699 | EXPORT_SYMBOL_GPL(vfio_del_group_dev); | ||
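A vfio bus driver pairs these two entry points in its probe and remove paths, much as the PCI driver in this series does. A condensed, hypothetical sketch ("struct example_device" and "example_vfio_ops" are placeholders, not part of this patch):

	static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	{
		struct example_device *edev;
		int ret;

		edev = kzalloc(sizeof(*edev), GFP_KERNEL);
		if (!edev)
			return -ENOMEM;

		/* Fails unless the device already belongs to an iommu group. */
		ret = vfio_add_group_dev(&pdev->dev, &example_vfio_ops, edev);
		if (ret)
			kfree(edev);
		return ret;
	}

	static void example_remove(struct pci_dev *pdev)
	{
		/* Blocks until every open fd for this device has been released. */
		struct example_device *edev = vfio_del_group_dev(&pdev->dev);

		kfree(edev);
	}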
| 700 | |||
| 701 | /** | ||
| 702 | * VFIO base fd, /dev/vfio/vfio | ||
| 703 | */ | ||
| 704 | static long vfio_ioctl_check_extension(struct vfio_container *container, | ||
| 705 | unsigned long arg) | ||
| 706 | { | ||
| 707 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
| 708 | long ret = 0; | ||
| 709 | |||
| 710 | switch (arg) { | ||
| 711 | /* No base extensions yet */ | ||
| 712 | default: | ||
| 713 | /* | ||
| 714 | * If no driver is set, poll all registered drivers for | ||
| 715 | * extensions and return the first positive result. If | ||
| 716 | * a driver is already set, further queries will be passed | ||
| 717 | * only to that driver. | ||
| 718 | */ | ||
| 719 | if (!driver) { | ||
| 720 | mutex_lock(&vfio.iommu_drivers_lock); | ||
| 721 | list_for_each_entry(driver, &vfio.iommu_drivers_list, | ||
| 722 | vfio_next) { | ||
| 723 | if (!try_module_get(driver->ops->owner)) | ||
| 724 | continue; | ||
| 725 | |||
| 726 | ret = driver->ops->ioctl(NULL, | ||
| 727 | VFIO_CHECK_EXTENSION, | ||
| 728 | arg); | ||
| 729 | module_put(driver->ops->owner); | ||
| 730 | if (ret > 0) | ||
| 731 | break; | ||
| 732 | } | ||
| 733 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 734 | } else | ||
| 735 | ret = driver->ops->ioctl(container->iommu_data, | ||
| 736 | VFIO_CHECK_EXTENSION, arg); | ||
| 737 | } | ||
| 738 | |||
| 739 | return ret; | ||
| 740 | } | ||
| 741 | |||
| 742 | /* hold container->group_lock */ | ||
| 743 | static int __vfio_container_attach_groups(struct vfio_container *container, | ||
| 744 | struct vfio_iommu_driver *driver, | ||
| 745 | void *data) | ||
| 746 | { | ||
| 747 | struct vfio_group *group; | ||
| 748 | int ret = -ENODEV; | ||
| 749 | |||
| 750 | list_for_each_entry(group, &container->group_list, container_next) { | ||
| 751 | ret = driver->ops->attach_group(data, group->iommu_group); | ||
| 752 | if (ret) | ||
| 753 | goto unwind; | ||
| 754 | } | ||
| 755 | |||
| 756 | return ret; | ||
| 757 | |||
| 758 | unwind: | ||
| 759 | list_for_each_entry_continue_reverse(group, &container->group_list, | ||
| 760 | container_next) { | ||
| 761 | driver->ops->detach_group(data, group->iommu_group); | ||
| 762 | } | ||
| 763 | |||
| 764 | return ret; | ||
| 765 | } | ||
| 766 | |||
| 767 | static long vfio_ioctl_set_iommu(struct vfio_container *container, | ||
| 768 | unsigned long arg) | ||
| 769 | { | ||
| 770 | struct vfio_iommu_driver *driver; | ||
| 771 | long ret = -ENODEV; | ||
| 772 | |||
| 773 | mutex_lock(&container->group_lock); | ||
| 774 | |||
| 775 | /* | ||
| 776 | * The container is designed to be an unprivileged interface while | ||
| 777 | * the group can be assigned to specific users. Therefore, only by | ||
| 778 | * adding a group to a container does the user get the privilege of | ||
| 779 | * enabling the iommu, which may allocate finite resources. There | ||
| 780 | * is no unset_iommu, but by removing all the groups from a container, | ||
| 781 | * the container is deprivileged and returns to an unset state. | ||
| 782 | */ | ||
| 783 | if (list_empty(&container->group_list) || container->iommu_driver) { | ||
| 784 | mutex_unlock(&container->group_lock); | ||
| 785 | return -EINVAL; | ||
| 786 | } | ||
| 787 | |||
| 788 | mutex_lock(&vfio.iommu_drivers_lock); | ||
| 789 | list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { | ||
| 790 | void *data; | ||
| 791 | |||
| 792 | if (!try_module_get(driver->ops->owner)) | ||
| 793 | continue; | ||
| 794 | |||
| 795 | /* | ||
| 796 | * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, | ||
| 797 | * so test which iommu driver reports support for this | ||
| 798 | * extension and call open on it. We also pass it the | ||
| 799 | * magic, allowing a single driver to support multiple | ||
| 800 | * interfaces if it wishes. | ||
| 801 | */ | ||
| 802 | if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { | ||
| 803 | module_put(driver->ops->owner); | ||
| 804 | continue; | ||
| 805 | } | ||
| 806 | |||
| 807 | /* module reference holds the driver we're working on */ | ||
| 808 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 809 | |||
| 810 | data = driver->ops->open(arg); | ||
| 811 | if (IS_ERR(data)) { | ||
| 812 | ret = PTR_ERR(data); | ||
| 813 | module_put(driver->ops->owner); | ||
| 814 | goto skip_drivers_unlock; | ||
| 815 | } | ||
| 816 | |||
| 817 | ret = __vfio_container_attach_groups(container, driver, data); | ||
| 818 | if (!ret) { | ||
| 819 | container->iommu_driver = driver; | ||
| 820 | container->iommu_data = data; | ||
| 821 | } else { | ||
| 822 | driver->ops->release(data); | ||
| 823 | module_put(driver->ops->owner); | ||
| 824 | } | ||
| 825 | |||
| 826 | goto skip_drivers_unlock; | ||
| 827 | } | ||
| 828 | |||
| 829 | mutex_unlock(&vfio.iommu_drivers_lock); | ||
| 830 | skip_drivers_unlock: | ||
| 831 | mutex_unlock(&container->group_lock); | ||
| 832 | |||
| 833 | return ret; | ||
| 834 | } | ||
| 835 | |||
| 836 | static long vfio_fops_unl_ioctl(struct file *filep, | ||
| 837 | unsigned int cmd, unsigned long arg) | ||
| 838 | { | ||
| 839 | struct vfio_container *container = filep->private_data; | ||
| 840 | struct vfio_iommu_driver *driver; | ||
| 841 | void *data; | ||
| 842 | long ret = -EINVAL; | ||
| 843 | |||
| 844 | if (!container) | ||
| 845 | return ret; | ||
| 846 | |||
| 847 | driver = container->iommu_driver; | ||
| 848 | data = container->iommu_data; | ||
| 849 | |||
| 850 | switch (cmd) { | ||
| 851 | case VFIO_GET_API_VERSION: | ||
| 852 | ret = VFIO_API_VERSION; | ||
| 853 | break; | ||
| 854 | case VFIO_CHECK_EXTENSION: | ||
| 855 | ret = vfio_ioctl_check_extension(container, arg); | ||
| 856 | break; | ||
| 857 | case VFIO_SET_IOMMU: | ||
| 858 | ret = vfio_ioctl_set_iommu(container, arg); | ||
| 859 | break; | ||
| 860 | default: | ||
| 861 | if (driver) /* passthrough all unrecognized ioctls */ | ||
| 862 | ret = driver->ops->ioctl(data, cmd, arg); | ||
| 863 | } | ||
| 864 | |||
| 865 | return ret; | ||
| 866 | } | ||
| 867 | |||
| 868 | #ifdef CONFIG_COMPAT | ||
| 869 | static long vfio_fops_compat_ioctl(struct file *filep, | ||
| 870 | unsigned int cmd, unsigned long arg) | ||
| 871 | { | ||
| 872 | arg = (unsigned long)compat_ptr(arg); | ||
| 873 | return vfio_fops_unl_ioctl(filep, cmd, arg); | ||
| 874 | } | ||
| 875 | #endif /* CONFIG_COMPAT */ | ||
| 876 | |||
| 877 | static int vfio_fops_open(struct inode *inode, struct file *filep) | ||
| 878 | { | ||
| 879 | struct vfio_container *container; | ||
| 880 | |||
| 881 | container = kzalloc(sizeof(*container), GFP_KERNEL); | ||
| 882 | if (!container) | ||
| 883 | return -ENOMEM; | ||
| 884 | |||
| 885 | INIT_LIST_HEAD(&container->group_list); | ||
| 886 | mutex_init(&container->group_lock); | ||
| 887 | kref_init(&container->kref); | ||
| 888 | |||
| 889 | filep->private_data = container; | ||
| 890 | |||
| 891 | return 0; | ||
| 892 | } | ||
| 893 | |||
| 894 | static int vfio_fops_release(struct inode *inode, struct file *filep) | ||
| 895 | { | ||
| 896 | struct vfio_container *container = filep->private_data; | ||
| 897 | |||
| 898 | filep->private_data = NULL; | ||
| 899 | |||
| 900 | vfio_container_put(container); | ||
| 901 | |||
| 902 | return 0; | ||
| 903 | } | ||
| 904 | |||
| 905 | /* | ||
| 906 | * Once an iommu driver is set, we optionally pass read/write/mmap | ||
| 907 | * on to the driver, allowing management interfaces beyond ioctl. | ||
| 908 | */ | ||
| 909 | static ssize_t vfio_fops_read(struct file *filep, char __user *buf, | ||
| 910 | size_t count, loff_t *ppos) | ||
| 911 | { | ||
| 912 | struct vfio_container *container = filep->private_data; | ||
| 913 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
| 914 | |||
| 915 | if (unlikely(!driver || !driver->ops->read)) | ||
| 916 | return -EINVAL; | ||
| 917 | |||
| 918 | return driver->ops->read(container->iommu_data, buf, count, ppos); | ||
| 919 | } | ||
| 920 | |||
| 921 | static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, | ||
| 922 | size_t count, loff_t *ppos) | ||
| 923 | { | ||
| 924 | struct vfio_container *container = filep->private_data; | ||
| 925 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
| 926 | |||
| 927 | if (unlikely(!driver || !driver->ops->write)) | ||
| 928 | return -EINVAL; | ||
| 929 | |||
| 930 | return driver->ops->write(container->iommu_data, buf, count, ppos); | ||
| 931 | } | ||
| 932 | |||
| 933 | static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) | ||
| 934 | { | ||
| 935 | struct vfio_container *container = filep->private_data; | ||
| 936 | struct vfio_iommu_driver *driver = container->iommu_driver; | ||
| 937 | |||
| 938 | if (unlikely(!driver || !driver->ops->mmap)) | ||
| 939 | return -EINVAL; | ||
| 940 | |||
| 941 | return driver->ops->mmap(container->iommu_data, vma); | ||
| 942 | } | ||
| 943 | |||
| 944 | static const struct file_operations vfio_fops = { | ||
| 945 | .owner = THIS_MODULE, | ||
| 946 | .open = vfio_fops_open, | ||
| 947 | .release = vfio_fops_release, | ||
| 948 | .read = vfio_fops_read, | ||
| 949 | .write = vfio_fops_write, | ||
| 950 | .unlocked_ioctl = vfio_fops_unl_ioctl, | ||
| 951 | #ifdef CONFIG_COMPAT | ||
| 952 | .compat_ioctl = vfio_fops_compat_ioctl, | ||
| 953 | #endif | ||
| 954 | .mmap = vfio_fops_mmap, | ||
| 955 | }; | ||
| 956 | |||
| 957 | /** | ||
| 958 | * VFIO Group fd, /dev/vfio/$GROUP | ||
| 959 | */ | ||
| 960 | static void __vfio_group_unset_container(struct vfio_group *group) | ||
| 961 | { | ||
| 962 | struct vfio_container *container = group->container; | ||
| 963 | struct vfio_iommu_driver *driver; | ||
| 964 | |||
| 965 | mutex_lock(&container->group_lock); | ||
| 966 | |||
| 967 | driver = container->iommu_driver; | ||
| 968 | if (driver) | ||
| 969 | driver->ops->detach_group(container->iommu_data, | ||
| 970 | group->iommu_group); | ||
| 971 | |||
| 972 | group->container = NULL; | ||
| 973 | list_del(&group->container_next); | ||
| 974 | |||
| 975 | /* Detaching the last group deprivileges a container, remove iommu */ | ||
| 976 | if (driver && list_empty(&container->group_list)) { | ||
| 977 | driver->ops->release(container->iommu_data); | ||
| 978 | module_put(driver->ops->owner); | ||
| 979 | container->iommu_driver = NULL; | ||
| 980 | container->iommu_data = NULL; | ||
| 981 | } | ||
| 982 | |||
| 983 | mutex_unlock(&container->group_lock); | ||
| 984 | |||
| 985 | vfio_container_put(container); | ||
| 986 | } | ||
| 987 | |||
| 988 | /* | ||
| 989 | * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or | ||
| 990 | * if there was no container to unset. Since the ioctl is called on | ||
| 991 | * the group, we know the group still exists, therefore the only valid | ||
| 992 | * transition here is 1->0. | ||
| 993 | */ | ||
| 994 | static int vfio_group_unset_container(struct vfio_group *group) | ||
| 995 | { | ||
| 996 | int users = atomic_cmpxchg(&group->container_users, 1, 0); | ||
| 997 | |||
| 998 | if (!users) | ||
| 999 | return -EINVAL; | ||
| 1000 | if (users != 1) | ||
| 1001 | return -EBUSY; | ||
| 1002 | |||
| 1003 | __vfio_group_unset_container(group); | ||
| 1004 | |||
| 1005 | return 0; | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | /* | ||
| 1009 | * When removing container users, anything that removes the last user | ||
| 1010 | * implicitly removes the group from the container. That is, if the | ||
| 1011 | * group file descriptor is closed, as well as any device file descriptors, | ||
| 1012 | * the group is free. | ||
| 1013 | */ | ||
| 1014 | static void vfio_group_try_dissolve_container(struct vfio_group *group) | ||
| 1015 | { | ||
| 1016 | if (0 == atomic_dec_if_positive(&group->container_users)) | ||
| 1017 | __vfio_group_unset_container(group); | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | static int vfio_group_set_container(struct vfio_group *group, int container_fd) | ||
| 1021 | { | ||
| 1022 | struct file *filep; | ||
| 1023 | struct vfio_container *container; | ||
| 1024 | struct vfio_iommu_driver *driver; | ||
| 1025 | int ret = 0; | ||
| 1026 | |||
| 1027 | if (atomic_read(&group->container_users)) | ||
| 1028 | return -EINVAL; | ||
| 1029 | |||
| 1030 | filep = fget(container_fd); | ||
| 1031 | if (!filep) | ||
| 1032 | return -EBADF; | ||
| 1033 | |||
| 1034 | /* Sanity check, is this really our fd? */ | ||
| 1035 | if (filep->f_op != &vfio_fops) { | ||
| 1036 | fput(filep); | ||
| 1037 | return -EINVAL; | ||
| 1038 | } | ||
| 1039 | |||
| 1040 | container = filep->private_data; | ||
| 1041 | WARN_ON(!container); /* fget ensures we don't race vfio_release */ | ||
| 1042 | |||
| 1043 | mutex_lock(&container->group_lock); | ||
| 1044 | |||
| 1045 | driver = container->iommu_driver; | ||
| 1046 | if (driver) { | ||
| 1047 | ret = driver->ops->attach_group(container->iommu_data, | ||
| 1048 | group->iommu_group); | ||
| 1049 | if (ret) | ||
| 1050 | goto unlock_out; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | group->container = container; | ||
| 1054 | list_add(&group->container_next, &container->group_list); | ||
| 1055 | |||
| 1056 | /* Get a reference on the container and mark a user within the group */ | ||
| 1057 | vfio_container_get(container); | ||
| 1058 | atomic_inc(&group->container_users); | ||
| 1059 | |||
| 1060 | unlock_out: | ||
| 1061 | mutex_unlock(&container->group_lock); | ||
| 1062 | fput(filep); | ||
| 1063 | |||
| 1064 | return ret; | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | static bool vfio_group_viable(struct vfio_group *group) | ||
| 1068 | { | ||
| 1069 | return (iommu_group_for_each_dev(group->iommu_group, | ||
| 1070 | group, vfio_dev_viable) == 0); | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static const struct file_operations vfio_device_fops; | ||
| 1074 | |||
| 1075 | static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) | ||
| 1076 | { | ||
| 1077 | struct vfio_device *device; | ||
| 1078 | struct file *filep; | ||
| 1079 | int ret = -ENODEV; | ||
| 1080 | |||
| 1081 | if (0 == atomic_read(&group->container_users) || | ||
| 1082 | !group->container->iommu_driver || !vfio_group_viable(group)) | ||
| 1083 | return -EINVAL; | ||
| 1084 | |||
| 1085 | mutex_lock(&group->device_lock); | ||
| 1086 | list_for_each_entry(device, &group->device_list, group_next) { | ||
| 1087 | if (strcmp(dev_name(device->dev), buf)) | ||
| 1088 | continue; | ||
| 1089 | |||
| 1090 | ret = device->ops->open(device->device_data); | ||
| 1091 | if (ret) | ||
| 1092 | break; | ||
| 1093 | /* | ||
| 1094 | * We can't use anon_inode_getfd() because we need to modify | ||
| 1095 | * the f_mode flags directly to allow more than just ioctls | ||
| 1096 | */ | ||
| 1097 | ret = get_unused_fd(); | ||
| 1098 | if (ret < 0) { | ||
| 1099 | device->ops->release(device->device_data); | ||
| 1100 | break; | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, | ||
| 1104 | device, O_RDWR); | ||
| 1105 | if (IS_ERR(filep)) { | ||
| 1106 | put_unused_fd(ret); | ||
| 1107 | ret = PTR_ERR(filep); | ||
| 1108 | device->ops->release(device->device_data); | ||
| 1109 | break; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | /* | ||
| 1113 | * TODO: add an anon_inode interface to do this. | ||
| 1114 | * Appears to be missing by lack of need rather than | ||
| 1115 | * explicitly prevented. Now there's need. | ||
| 1116 | */ | ||
| 1117 | filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); | ||
| 1118 | |||
| 1119 | fd_install(ret, filep); | ||
| 1120 | |||
| 1121 | vfio_device_get(device); | ||
| 1122 | atomic_inc(&group->container_users); | ||
| 1123 | break; | ||
| 1124 | } | ||
| 1125 | mutex_unlock(&group->device_lock); | ||
| 1126 | |||
| 1127 | return ret; | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | static long vfio_group_fops_unl_ioctl(struct file *filep, | ||
| 1131 | unsigned int cmd, unsigned long arg) | ||
| 1132 | { | ||
| 1133 | struct vfio_group *group = filep->private_data; | ||
| 1134 | long ret = -ENOTTY; | ||
| 1135 | |||
| 1136 | switch (cmd) { | ||
| 1137 | case VFIO_GROUP_GET_STATUS: | ||
| 1138 | { | ||
| 1139 | struct vfio_group_status status; | ||
| 1140 | unsigned long minsz; | ||
| 1141 | |||
| 1142 | minsz = offsetofend(struct vfio_group_status, flags); | ||
| 1143 | |||
| 1144 | if (copy_from_user(&status, (void __user *)arg, minsz)) | ||
| 1145 | return -EFAULT; | ||
| 1146 | |||
| 1147 | if (status.argsz < minsz) | ||
| 1148 | return -EINVAL; | ||
| 1149 | |||
| 1150 | status.flags = 0; | ||
| 1151 | |||
| 1152 | if (vfio_group_viable(group)) | ||
| 1153 | status.flags |= VFIO_GROUP_FLAGS_VIABLE; | ||
| 1154 | |||
| 1155 | if (group->container) | ||
| 1156 | status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET; | ||
| 1157 | |||
| 1158 | if (copy_to_user((void __user *)arg, &status, minsz)) | ||
| 1159 | return -EFAULT; | ||
| 1160 | |||
| 1161 | ret = 0; | ||
| 1162 | break; | ||
| 1163 | } | ||
| 1164 | case VFIO_GROUP_SET_CONTAINER: | ||
| 1165 | { | ||
| 1166 | int fd; | ||
| 1167 | |||
| 1168 | if (get_user(fd, (int __user *)arg)) | ||
| 1169 | return -EFAULT; | ||
| 1170 | |||
| 1171 | if (fd < 0) | ||
| 1172 | return -EINVAL; | ||
| 1173 | |||
| 1174 | ret = vfio_group_set_container(group, fd); | ||
| 1175 | break; | ||
| 1176 | } | ||
| 1177 | case VFIO_GROUP_UNSET_CONTAINER: | ||
| 1178 | ret = vfio_group_unset_container(group); | ||
| 1179 | break; | ||
| 1180 | case VFIO_GROUP_GET_DEVICE_FD: | ||
| 1181 | { | ||
| 1182 | char *buf; | ||
| 1183 | |||
| 1184 | buf = strndup_user((const char __user *)arg, PAGE_SIZE); | ||
| 1185 | if (IS_ERR(buf)) | ||
| 1186 | return PTR_ERR(buf); | ||
| 1187 | |||
| 1188 | ret = vfio_group_get_device_fd(group, buf); | ||
| 1189 | kfree(buf); | ||
| 1190 | break; | ||
| 1191 | } | ||
| 1192 | } | ||
| 1193 | |||
| 1194 | return ret; | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | #ifdef CONFIG_COMPAT | ||
| 1198 | static long vfio_group_fops_compat_ioctl(struct file *filep, | ||
| 1199 | unsigned int cmd, unsigned long arg) | ||
| 1200 | { | ||
| 1201 | arg = (unsigned long)compat_ptr(arg); | ||
| 1202 | return vfio_group_fops_unl_ioctl(filep, cmd, arg); | ||
| 1203 | } | ||
| 1204 | #endif /* CONFIG_COMPAT */ | ||
| 1205 | |||
| 1206 | static int vfio_group_fops_open(struct inode *inode, struct file *filep) | ||
| 1207 | { | ||
| 1208 | struct vfio_group *group; | ||
| 1209 | |||
| 1210 | group = vfio_group_get_from_minor(iminor(inode)); | ||
| 1211 | if (!group) | ||
| 1212 | return -ENODEV; | ||
| 1213 | |||
| 1214 | if (group->container) { | ||
| 1215 | vfio_group_put(group); | ||
| 1216 | return -EBUSY; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | filep->private_data = group; | ||
| 1220 | |||
| 1221 | return 0; | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | static int vfio_group_fops_release(struct inode *inode, struct file *filep) | ||
| 1225 | { | ||
| 1226 | struct vfio_group *group = filep->private_data; | ||
| 1227 | |||
| 1228 | filep->private_data = NULL; | ||
| 1229 | |||
| 1230 | vfio_group_try_dissolve_container(group); | ||
| 1231 | |||
| 1232 | vfio_group_put(group); | ||
| 1233 | |||
| 1234 | return 0; | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | static const struct file_operations vfio_group_fops = { | ||
| 1238 | .owner = THIS_MODULE, | ||
| 1239 | .unlocked_ioctl = vfio_group_fops_unl_ioctl, | ||
| 1240 | #ifdef CONFIG_COMPAT | ||
| 1241 | .compat_ioctl = vfio_group_fops_compat_ioctl, | ||
| 1242 | #endif | ||
| 1243 | .open = vfio_group_fops_open, | ||
| 1244 | .release = vfio_group_fops_release, | ||
| 1245 | }; | ||
| 1246 | |||
| 1247 | /** | ||
| 1248 | * VFIO Device fd | ||
| 1249 | */ | ||
| 1250 | static int vfio_device_fops_release(struct inode *inode, struct file *filep) | ||
| 1251 | { | ||
| 1252 | struct vfio_device *device = filep->private_data; | ||
| 1253 | |||
| 1254 | device->ops->release(device->device_data); | ||
| 1255 | |||
| 1256 | vfio_group_try_dissolve_container(device->group); | ||
| 1257 | |||
| 1258 | vfio_device_put(device); | ||
| 1259 | |||
| 1260 | return 0; | ||
| 1261 | } | ||
| 1262 | |||
| 1263 | static long vfio_device_fops_unl_ioctl(struct file *filep, | ||
| 1264 | unsigned int cmd, unsigned long arg) | ||
| 1265 | { | ||
| 1266 | struct vfio_device *device = filep->private_data; | ||
| 1267 | |||
| 1268 | if (unlikely(!device->ops->ioctl)) | ||
| 1269 | return -EINVAL; | ||
| 1270 | |||
| 1271 | return device->ops->ioctl(device->device_data, cmd, arg); | ||
| 1272 | } | ||
| 1273 | |||
| 1274 | static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, | ||
| 1275 | size_t count, loff_t *ppos) | ||
| 1276 | { | ||
| 1277 | struct vfio_device *device = filep->private_data; | ||
| 1278 | |||
| 1279 | if (unlikely(!device->ops->read)) | ||
| 1280 | return -EINVAL; | ||
| 1281 | |||
| 1282 | return device->ops->read(device->device_data, buf, count, ppos); | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | static ssize_t vfio_device_fops_write(struct file *filep, | ||
| 1286 | const char __user *buf, | ||
| 1287 | size_t count, loff_t *ppos) | ||
| 1288 | { | ||
| 1289 | struct vfio_device *device = filep->private_data; | ||
| 1290 | |||
| 1291 | if (unlikely(!device->ops->write)) | ||
| 1292 | return -EINVAL; | ||
| 1293 | |||
| 1294 | return device->ops->write(device->device_data, buf, count, ppos); | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) | ||
| 1298 | { | ||
| 1299 | struct vfio_device *device = filep->private_data; | ||
| 1300 | |||
| 1301 | if (unlikely(!device->ops->mmap)) | ||
| 1302 | return -EINVAL; | ||
| 1303 | |||
| 1304 | return device->ops->mmap(device->device_data, vma); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | #ifdef CONFIG_COMPAT | ||
| 1308 | static long vfio_device_fops_compat_ioctl(struct file *filep, | ||
| 1309 | unsigned int cmd, unsigned long arg) | ||
| 1310 | { | ||
| 1311 | arg = (unsigned long)compat_ptr(arg); | ||
| 1312 | return vfio_device_fops_unl_ioctl(filep, cmd, arg); | ||
| 1313 | } | ||
| 1314 | #endif /* CONFIG_COMPAT */ | ||
| 1315 | |||
| 1316 | static const struct file_operations vfio_device_fops = { | ||
| 1317 | .owner = THIS_MODULE, | ||
| 1318 | .release = vfio_device_fops_release, | ||
| 1319 | .read = vfio_device_fops_read, | ||
| 1320 | .write = vfio_device_fops_write, | ||
| 1321 | .unlocked_ioctl = vfio_device_fops_unl_ioctl, | ||
| 1322 | #ifdef CONFIG_COMPAT | ||
| 1323 | .compat_ioctl = vfio_device_fops_compat_ioctl, | ||
| 1324 | #endif | ||
| 1325 | .mmap = vfio_device_fops_mmap, | ||
| 1326 | }; | ||
| 1327 | |||
| 1328 | /** | ||
| 1329 | * Module/class support | ||
| 1330 | */ | ||
| 1331 | static char *vfio_devnode(struct device *dev, umode_t *mode) | ||
| 1332 | { | ||
| 1333 | return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); | ||
| 1334 | } | ||
| 1335 | |||
| 1336 | static int __init vfio_init(void) | ||
| 1337 | { | ||
| 1338 | int ret; | ||
| 1339 | |||
| 1340 | idr_init(&vfio.group_idr); | ||
| 1341 | mutex_init(&vfio.group_lock); | ||
| 1342 | mutex_init(&vfio.iommu_drivers_lock); | ||
| 1343 | INIT_LIST_HEAD(&vfio.group_list); | ||
| 1344 | INIT_LIST_HEAD(&vfio.iommu_drivers_list); | ||
| 1345 | init_waitqueue_head(&vfio.release_q); | ||
| 1346 | |||
| 1347 | vfio.class = class_create(THIS_MODULE, "vfio"); | ||
| 1348 | if (IS_ERR(vfio.class)) { | ||
| 1349 | ret = PTR_ERR(vfio.class); | ||
| 1350 | goto err_class; | ||
| 1351 | } | ||
| 1352 | |||
| 1353 | vfio.class->devnode = vfio_devnode; | ||
| 1354 | |||
| 1355 | ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio"); | ||
| 1356 | if (ret) | ||
| 1357 | goto err_base_chrdev; | ||
| 1358 | |||
| 1359 | cdev_init(&vfio.cdev, &vfio_fops); | ||
| 1360 | ret = cdev_add(&vfio.cdev, vfio.devt, 1); | ||
| 1361 | if (ret) | ||
| 1362 | goto err_base_cdev; | ||
| 1363 | |||
| 1364 | vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio"); | ||
| 1365 | if (IS_ERR(vfio.dev)) { | ||
| 1366 | ret = PTR_ERR(vfio.dev); | ||
| 1367 | goto err_base_dev; | ||
| 1368 | } | ||
| 1369 | |||
| 1370 | /* /dev/vfio/$GROUP */ | ||
| 1371 | cdev_init(&vfio.group_cdev, &vfio_group_fops); | ||
| 1372 | ret = cdev_add(&vfio.group_cdev, | ||
| 1373 | MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1); | ||
| 1374 | if (ret) | ||
| 1375 | goto err_groups_cdev; | ||
| 1376 | |||
| 1377 | pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); | ||
| 1378 | |||
| 1379 | /* | ||
| 1380 | * Attempt to load known iommu-drivers. This gives us a working | ||
| 1381 | * environment without the user needing to explicitly load iommu | ||
| 1382 | * drivers. | ||
| 1383 | */ | ||
| 1384 | request_module_nowait("vfio_iommu_type1"); | ||
| 1385 | |||
| 1386 | return 0; | ||
| 1387 | |||
| 1388 | err_groups_cdev: | ||
| 1389 | device_destroy(vfio.class, vfio.devt); | ||
| 1390 | err_base_dev: | ||
| 1391 | cdev_del(&vfio.cdev); | ||
| 1392 | err_base_cdev: | ||
| 1393 | unregister_chrdev_region(vfio.devt, MINORMASK); | ||
| 1394 | err_base_chrdev: | ||
| 1395 | class_destroy(vfio.class); | ||
| 1396 | vfio.class = NULL; | ||
| 1397 | err_class: | ||
| 1398 | return ret; | ||
| 1399 | } | ||
| 1400 | |||
| 1401 | static void __exit vfio_cleanup(void) | ||
| 1402 | { | ||
| 1403 | WARN_ON(!list_empty(&vfio.group_list)); | ||
| 1404 | |||
| 1405 | idr_destroy(&vfio.group_idr); | ||
| 1406 | cdev_del(&vfio.group_cdev); | ||
| 1407 | device_destroy(vfio.class, vfio.devt); | ||
| 1408 | cdev_del(&vfio.cdev); | ||
| 1409 | unregister_chrdev_region(vfio.devt, MINORMASK); | ||
| 1410 | class_destroy(vfio.class); | ||
| 1411 | vfio.class = NULL; | ||
| 1412 | } | ||
| 1413 | |||
| 1414 | module_init(vfio_init); | ||
| 1415 | module_exit(vfio_cleanup); | ||
| 1416 | |||
| 1417 | MODULE_VERSION(DRIVER_VERSION); | ||
| 1418 | MODULE_LICENSE("GPL v2"); | ||
| 1419 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
| 1420 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
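Taken together, the file operations above give userspace a three-step bring-up: open the container, bind one or more groups to it, then enable an IOMMU model. The sketch below is a minimal illustration of that sequence using only the ioctls declared in include/linux/vfio.h later in this patch; error handling is mostly omitted and the group number "26" is a placeholder that would normally be discovered through sysfs. A device file descriptor is then requested with VFIO_GROUP_GET_DEVICE_FD, illustrated after its definition in the header below.

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int setup_container_and_group(void)
	{
		int container, group;
		struct vfio_group_status status = { .argsz = sizeof(status) };

		container = open("/dev/vfio/vfio", O_RDWR);

		if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
			return -1;	/* kernel/userspace API mismatch */

		if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) <= 0)
			return -1;	/* no Type1 backend registered */

		group = open("/dev/vfio/26", O_RDWR);	/* placeholder group number */

		ioctl(group, VFIO_GROUP_GET_STATUS, &status);
		if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
			return -1;	/* some group members lack vfio bus drivers */

		/* Adding a group is what privileges the container ... */
		ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
		/* ... and only then may an IOMMU model be selected. */
		ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

		return container;
	}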
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c new file mode 100644 index 000000000000..6f3fbc48a6c7 --- /dev/null +++ b/drivers/vfio/vfio_iommu_type1.c | |||
| @@ -0,0 +1,753 @@ | |||
| 1 | /* | ||
| 2 | * VFIO: IOMMU DMA mapping support for Type1 IOMMU | ||
| 3 | * | ||
| 4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * Derived from original vfio: | ||
| 12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
| 13 | * Author: Tom Lyon, pugs@cisco.com | ||
| 14 | * | ||
| 15 | * We arbitrarily define a Type1 IOMMU as one matching the below code. | ||
| 16 | * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel | ||
| 17 | * VT-d, but that makes it harder to re-use as theoretically anyone | ||
| 18 | * implementing a similar IOMMU could make use of this. We expect the | ||
| 19 | * IOMMU to support the IOMMU API and have few to no restrictions around | ||
| 20 | * the IOVA range that can be mapped. The Type1 IOMMU is currently | ||
| 21 | * optimized for relatively static mappings of a userspace process with | ||
| 22 | * userspace pages pinned into memory. We also assume devices and IOMMU | ||
| 23 | * domains are PCI based as the IOMMU API is still centered around a | ||
| 24 | * device/bus interface rather than a group interface. | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include <linux/compat.h> | ||
| 28 | #include <linux/device.h> | ||
| 29 | #include <linux/fs.h> | ||
| 30 | #include <linux/iommu.h> | ||
| 31 | #include <linux/module.h> | ||
| 32 | #include <linux/mm.h> | ||
| 33 | #include <linux/pci.h> /* pci_bus_type */ | ||
| 34 | #include <linux/sched.h> | ||
| 35 | #include <linux/slab.h> | ||
| 36 | #include <linux/uaccess.h> | ||
| 37 | #include <linux/vfio.h> | ||
| 38 | #include <linux/workqueue.h> | ||
| 39 | |||
| 40 | #define DRIVER_VERSION "0.2" | ||
| 41 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
| 42 | #define DRIVER_DESC "Type1 IOMMU driver for VFIO" | ||
| 43 | |||
| 44 | static bool allow_unsafe_interrupts; | ||
| 45 | module_param_named(allow_unsafe_interrupts, | ||
| 46 | allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); | ||
| 47 | MODULE_PARM_DESC(allow_unsafe_interrupts, | ||
| 48 | "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); | ||
| 49 | |||
| 50 | struct vfio_iommu { | ||
| 51 | struct iommu_domain *domain; | ||
| 52 | struct mutex lock; | ||
| 53 | struct list_head dma_list; | ||
| 54 | struct list_head group_list; | ||
| 55 | bool cache; | ||
| 56 | }; | ||
| 57 | |||
| 58 | struct vfio_dma { | ||
| 59 | struct list_head next; | ||
| 60 | dma_addr_t iova; /* Device address */ | ||
| 61 | unsigned long vaddr; /* Process virtual addr */ | ||
| 62 | long npage; /* Number of pages */ | ||
| 63 | int prot; /* IOMMU_READ/WRITE */ | ||
| 64 | }; | ||
| 65 | |||
| 66 | struct vfio_group { | ||
| 67 | struct iommu_group *iommu_group; | ||
| 68 | struct list_head next; | ||
| 69 | }; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * This code handles mapping and unmapping of user data buffers | ||
| 73 | * into DMA'ble space using the IOMMU | ||
| 74 | */ | ||
| 75 | |||
| 76 | #define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) | ||
| 77 | |||
| 78 | struct vwork { | ||
| 79 | struct mm_struct *mm; | ||
| 80 | long npage; | ||
| 81 | struct work_struct work; | ||
| 82 | }; | ||
| 83 | |||
| 84 | /* delayed decrement/increment for locked_vm */ | ||
| 85 | static void vfio_lock_acct_bg(struct work_struct *work) | ||
| 86 | { | ||
| 87 | struct vwork *vwork = container_of(work, struct vwork, work); | ||
| 88 | struct mm_struct *mm; | ||
| 89 | |||
| 90 | mm = vwork->mm; | ||
| 91 | down_write(&mm->mmap_sem); | ||
| 92 | mm->locked_vm += vwork->npage; | ||
| 93 | up_write(&mm->mmap_sem); | ||
| 94 | mmput(mm); | ||
| 95 | kfree(vwork); | ||
| 96 | } | ||
| 97 | |||
| 98 | static void vfio_lock_acct(long npage) | ||
| 99 | { | ||
| 100 | struct vwork *vwork; | ||
| 101 | struct mm_struct *mm; | ||
| 102 | |||
| 103 | if (!current->mm) | ||
| 104 | return; /* process exited */ | ||
| 105 | |||
| 106 | if (down_write_trylock(¤t->mm->mmap_sem)) { | ||
| 107 | current->mm->locked_vm += npage; | ||
| 108 | up_write(¤t->mm->mmap_sem); | ||
| 109 | return; | ||
| 110 | } | ||
| 111 | |||
| 112 | /* | ||
| 113 | * Couldn't get mmap_sem lock, so must setup to update | ||
| 114 | * mm->locked_vm later. If locked_vm were atomic, we | ||
| 115 | * wouldn't need this silliness | ||
| 116 | */ | ||
| 117 | vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); | ||
| 118 | if (!vwork) | ||
| 119 | return; | ||
| 120 | mm = get_task_mm(current); | ||
| 121 | if (!mm) { | ||
| 122 | kfree(vwork); | ||
| 123 | return; | ||
| 124 | } | ||
| 125 | INIT_WORK(&vwork->work, vfio_lock_acct_bg); | ||
| 126 | vwork->mm = mm; | ||
| 127 | vwork->npage = npage; | ||
| 128 | schedule_work(&vwork->work); | ||
| 129 | } | ||
| 130 | |||
| 131 | /* | ||
| 132 | * Some mappings aren't backed by a struct page, for example an mmap'd | ||
| 133 | * MMIO range for our own or another device. These use a different | ||
| 134 | * pfn conversion and shouldn't be tracked as locked pages. | ||
| 135 | */ | ||
| 136 | static bool is_invalid_reserved_pfn(unsigned long pfn) | ||
| 137 | { | ||
| 138 | if (pfn_valid(pfn)) { | ||
| 139 | bool reserved; | ||
| 140 | struct page *tail = pfn_to_page(pfn); | ||
| 141 | struct page *head = compound_trans_head(tail); | ||
| 142 | reserved = !!(PageReserved(head)); | ||
| 143 | if (head != tail) { | ||
| 144 | /* | ||
| 145 | * "head" is not a dangling pointer | ||
| 146 | * (compound_trans_head takes care of that) | ||
| 147 | * but the hugepage may have been split | ||
| 148 | * from under us (and we may not hold a | ||
| 149 | * reference count on the head page so it can | ||
| 150 | * be reused before we run PageReferenced), so | ||
| 151 | * we have to check PageTail before returning | ||
| 152 | * what we just read. | ||
| 153 | */ | ||
| 154 | smp_rmb(); | ||
| 155 | if (PageTail(tail)) | ||
| 156 | return reserved; | ||
| 157 | } | ||
| 158 | return PageReserved(tail); | ||
| 159 | } | ||
| 160 | |||
| 161 | return true; | ||
| 162 | } | ||
| 163 | |||
| 164 | static int put_pfn(unsigned long pfn, int prot) | ||
| 165 | { | ||
| 166 | if (!is_invalid_reserved_pfn(pfn)) { | ||
| 167 | struct page *page = pfn_to_page(pfn); | ||
| 168 | if (prot & IOMMU_WRITE) | ||
| 169 | SetPageDirty(page); | ||
| 170 | put_page(page); | ||
| 171 | return 1; | ||
| 172 | } | ||
| 173 | return 0; | ||
| 174 | } | ||
| 175 | |||
| 176 | /* Unmap DMA region */ | ||
| 177 | static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, | ||
| 178 | long npage, int prot) | ||
| 179 | { | ||
| 180 | long i, unlocked = 0; | ||
| 181 | |||
| 182 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) { | ||
| 183 | unsigned long pfn; | ||
| 184 | |||
| 185 | pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; | ||
| 186 | if (pfn) { | ||
| 187 | iommu_unmap(iommu->domain, iova, PAGE_SIZE); | ||
| 188 | unlocked += put_pfn(pfn, prot); | ||
| 189 | } | ||
| 190 | } | ||
| 191 | return unlocked; | ||
| 192 | } | ||
| 193 | |||
| 194 | static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, | ||
| 195 | long npage, int prot) | ||
| 196 | { | ||
| 197 | long unlocked; | ||
| 198 | |||
| 199 | unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); | ||
| 200 | vfio_lock_acct(-unlocked); | ||
| 201 | } | ||
| 202 | |||
| 203 | static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) | ||
| 204 | { | ||
| 205 | struct page *page[1]; | ||
| 206 | struct vm_area_struct *vma; | ||
| 207 | int ret = -EFAULT; | ||
| 208 | |||
| 209 | if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { | ||
| 210 | *pfn = page_to_pfn(page[0]); | ||
| 211 | return 0; | ||
| 212 | } | ||
| 213 | |||
| 214 | down_read(¤t->mm->mmap_sem); | ||
| 215 | |||
| 216 | vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); | ||
| 217 | |||
| 218 | if (vma && vma->vm_flags & VM_PFNMAP) { | ||
| 219 | *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 220 | if (is_invalid_reserved_pfn(*pfn)) | ||
| 221 | ret = 0; | ||
| 222 | } | ||
| 223 | |||
| 224 | up_read(¤t->mm->mmap_sem); | ||
| 225 | |||
| 226 | return ret; | ||
| 227 | } | ||
| 228 | |||
| 229 | /* Map DMA region */ | ||
| 230 | static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, | ||
| 231 | unsigned long vaddr, long npage, int prot) | ||
| 232 | { | ||
| 233 | dma_addr_t start = iova; | ||
| 234 | long i, locked = 0; | ||
| 235 | int ret; | ||
| 236 | |||
| 237 | /* Verify that pages are not already mapped */ | ||
| 238 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) | ||
| 239 | if (iommu_iova_to_phys(iommu->domain, iova)) | ||
| 240 | return -EBUSY; | ||
| 241 | |||
| 242 | iova = start; | ||
| 243 | |||
| 244 | if (iommu->cache) | ||
| 245 | prot |= IOMMU_CACHE; | ||
| 246 | |||
| 247 | /* | ||
| 248 | * XXX We break mappings into pages and use get_user_pages_fast to | ||
| 249 | * pin the pages in memory. It's been suggested that mlock might | ||
| 250 | * provide a more efficient mechanism, but nothing prevents the | ||
| 251 | * user from munlocking the pages, which could then allow the user | ||
| 252 | * access to random host memory. We also have no guarantee from the | ||
| 253 | * IOMMU API that the iommu driver can unmap sub-pages of previous | ||
| 254 | * mappings. This means we might lose an entire range if a single | ||
| 255 | * page within it is unmapped. Single page mappings are inefficient, | ||
| 256 | * but provide the most flexibility for now. | ||
| 257 | */ | ||
| 258 | for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { | ||
| 259 | unsigned long pfn = 0; | ||
| 260 | |||
| 261 | ret = vaddr_get_pfn(vaddr, prot, &pfn); | ||
| 262 | if (ret) { | ||
| 263 | __vfio_dma_do_unmap(iommu, start, i, prot); | ||
| 264 | return ret; | ||
| 265 | } | ||
| 266 | |||
| 267 | /* | ||
| 268 | * Only add actual locked pages to accounting | ||
| 269 | * XXX We're effectively marking a page locked for every | ||
| 270 | * IOVA page even though it's possible the user could be | ||
| 271 | * backing multiple IOVAs with the same vaddr. This over- | ||
| 272 | * penalizes the user process, but we currently have no | ||
| 273 | * easy way to do this properly. | ||
| 274 | */ | ||
| 275 | if (!is_invalid_reserved_pfn(pfn)) | ||
| 276 | locked++; | ||
| 277 | |||
| 278 | ret = iommu_map(iommu->domain, iova, | ||
| 279 | (phys_addr_t)pfn << PAGE_SHIFT, | ||
| 280 | PAGE_SIZE, prot); | ||
| 281 | if (ret) { | ||
| 282 | /* Back out mappings on error */ | ||
| 283 | put_pfn(pfn, prot); | ||
| 284 | __vfio_dma_do_unmap(iommu, start, i, prot); | ||
| 285 | return ret; | ||
| 286 | } | ||
| 287 | } | ||
| 288 | vfio_lock_acct(locked); | ||
| 289 | return 0; | ||
| 290 | } | ||
| 291 | |||
| 292 | static inline bool ranges_overlap(dma_addr_t start1, size_t size1, | ||
| 293 | dma_addr_t start2, size_t size2) | ||
| 294 | { | ||
| 295 | if (start1 < start2) | ||
| 296 | return (start2 - start1 < size1); | ||
| 297 | else if (start2 < start1) | ||
| 298 | return (start1 - start2 < size2); | ||
| 299 | return (size1 > 0 && size2 > 0); | ||
| 300 | } | ||
| 301 | |||
| 302 | static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, | ||
| 303 | dma_addr_t start, size_t size) | ||
| 304 | { | ||
| 305 | struct vfio_dma *dma; | ||
| 306 | |||
| 307 | list_for_each_entry(dma, &iommu->dma_list, next) { | ||
| 308 | if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), | ||
| 309 | start, size)) | ||
| 310 | return dma; | ||
| 311 | } | ||
| 312 | return NULL; | ||
| 313 | } | ||
| 314 | |||
| 315 | static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, | ||
| 316 | size_t size, struct vfio_dma *dma) | ||
| 317 | { | ||
| 318 | struct vfio_dma *split; | ||
| 319 | long npage_lo, npage_hi; | ||
| 320 | |||
| 321 | /* Existing dma region is completely covered, unmap all */ | ||
| 322 | if (start <= dma->iova && | ||
| 323 | start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { | ||
| 324 | vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); | ||
| 325 | list_del(&dma->next); | ||
| 326 | npage_lo = dma->npage; | ||
| 327 | kfree(dma); | ||
| 328 | return npage_lo; | ||
| 329 | } | ||
| 330 | |||
| 331 | /* Overlap low address of existing range */ | ||
| 332 | if (start <= dma->iova) { | ||
| 333 | size_t overlap; | ||
| 334 | |||
| 335 | overlap = start + size - dma->iova; | ||
| 336 | npage_lo = overlap >> PAGE_SHIFT; | ||
| 337 | |||
| 338 | vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); | ||
| 339 | dma->iova += overlap; | ||
| 340 | dma->vaddr += overlap; | ||
| 341 | dma->npage -= npage_lo; | ||
| 342 | return npage_lo; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* Overlap high address of existing range */ | ||
| 346 | if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { | ||
| 347 | size_t overlap; | ||
| 348 | |||
| 349 | overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; | ||
| 350 | npage_hi = overlap >> PAGE_SHIFT; | ||
| 351 | |||
| 352 | vfio_dma_unmap(iommu, start, npage_hi, dma->prot); | ||
| 353 | dma->npage -= npage_hi; | ||
| 354 | return npage_hi; | ||
| 355 | } | ||
| 356 | |||
| 357 | /* Split existing */ | ||
| 358 | npage_lo = (start - dma->iova) >> PAGE_SHIFT; | ||
| 359 | npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; | ||
| 360 | |||
| 361 | split = kzalloc(sizeof *split, GFP_KERNEL); | ||
| 362 | if (!split) | ||
| 363 | return -ENOMEM; | ||
| 364 | |||
| 365 | vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); | ||
| 366 | |||
| 367 | dma->npage = npage_lo; | ||
| 368 | |||
| 369 | split->npage = npage_hi; | ||
| 370 | split->iova = start + size; | ||
| 371 | split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; | ||
| 372 | split->prot = dma->prot; | ||
| 373 | list_add(&split->next, &iommu->dma_list); | ||
| 374 | return size >> PAGE_SHIFT; | ||
| 375 | } | ||
| 376 | |||
| 377 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | ||
| 378 | struct vfio_iommu_type1_dma_unmap *unmap) | ||
| 379 | { | ||
| 380 | long ret = 0, npage = unmap->size >> PAGE_SHIFT; | ||
| 381 | struct vfio_dma *dma, *tmp; | ||
| 382 | uint64_t mask; | ||
| 383 | |||
| 384 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | ||
| 385 | |||
| 386 | if (unmap->iova & mask) | ||
| 387 | return -EINVAL; | ||
| 388 | if (unmap->size & mask) | ||
| 389 | return -EINVAL; | ||
| 390 | |||
| 391 | /* XXX We still break these down into PAGE_SIZE */ | ||
| 392 | WARN_ON(mask & PAGE_MASK); | ||
| 393 | |||
| 394 | mutex_lock(&iommu->lock); | ||
| 395 | |||
| 396 | list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) { | ||
| 397 | if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), | ||
| 398 | unmap->iova, unmap->size)) { | ||
| 399 | ret = vfio_remove_dma_overlap(iommu, unmap->iova, | ||
| 400 | unmap->size, dma); | ||
| 401 | if (ret > 0) | ||
| 402 | npage -= ret; | ||
| 403 | if (ret < 0 || npage == 0) | ||
| 404 | break; | ||
| 405 | } | ||
| 406 | } | ||
| 407 | mutex_unlock(&iommu->lock); | ||
| 408 | return ret > 0 ? 0 : (int)ret; | ||
| 409 | } | ||
| 410 | |||
| 411 | static int vfio_dma_do_map(struct vfio_iommu *iommu, | ||
| 412 | struct vfio_iommu_type1_dma_map *map) | ||
| 413 | { | ||
| 414 | struct vfio_dma *dma, *pdma = NULL; | ||
| 415 | dma_addr_t iova = map->iova; | ||
| 416 | unsigned long locked, lock_limit, vaddr = map->vaddr; | ||
| 417 | size_t size = map->size; | ||
| 418 | int ret = 0, prot = 0; | ||
| 419 | uint64_t mask; | ||
| 420 | long npage; | ||
| 421 | |||
| 422 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | ||
| 423 | |||
| 424 | /* READ/WRITE from device perspective */ | ||
| 425 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) | ||
| 426 | prot |= IOMMU_WRITE; | ||
| 427 | if (map->flags & VFIO_DMA_MAP_FLAG_READ) | ||
| 428 | prot |= IOMMU_READ; | ||
| 429 | |||
| 430 | if (!prot) | ||
| 431 | return -EINVAL; /* No READ/WRITE? */ | ||
| 432 | |||
| 433 | if (vaddr & mask) | ||
| 434 | return -EINVAL; | ||
| 435 | if (iova & mask) | ||
| 436 | return -EINVAL; | ||
| 437 | if (size & mask) | ||
| 438 | return -EINVAL; | ||
| 439 | |||
| 440 | /* XXX We still break these down into PAGE_SIZE */ | ||
| 441 | WARN_ON(mask & PAGE_MASK); | ||
| 442 | |||
| 443 | /* Don't allow IOVA wrap */ | ||
| 444 | if (iova + size && iova + size < iova) | ||
| 445 | return -EINVAL; | ||
| 446 | |||
| 447 | /* Don't allow virtual address wrap */ | ||
| 448 | if (vaddr + size && vaddr + size < vaddr) | ||
| 449 | return -EINVAL; | ||
| 450 | |||
| 451 | npage = size >> PAGE_SHIFT; | ||
| 452 | if (!npage) | ||
| 453 | return -EINVAL; | ||
| 454 | |||
| 455 | mutex_lock(&iommu->lock); | ||
| 456 | |||
| 457 | if (vfio_find_dma(iommu, iova, size)) { | ||
| 458 | ret = -EBUSY; | ||
| 459 | goto out_lock; | ||
| 460 | } | ||
| 461 | |||
| 462 | /* account for locked pages */ | ||
| 463 | locked = current->mm->locked_vm + npage; | ||
| 464 | lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
| 465 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { | ||
| 466 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", | ||
| 467 | __func__, rlimit(RLIMIT_MEMLOCK)); | ||
| 468 | ret = -ENOMEM; | ||
| 469 | goto out_lock; | ||
| 470 | } | ||
| 471 | |||
| 472 | ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); | ||
| 473 | if (ret) | ||
| 474 | goto out_lock; | ||
| 475 | |||
| 476 | /* Check if we abut a region below - nothing below 0 */ | ||
| 477 | if (iova) { | ||
| 478 | dma = vfio_find_dma(iommu, iova - 1, 1); | ||
| 479 | if (dma && dma->prot == prot && | ||
| 480 | dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) { | ||
| 481 | |||
| 482 | dma->npage += npage; | ||
| 483 | iova = dma->iova; | ||
| 484 | vaddr = dma->vaddr; | ||
| 485 | npage = dma->npage; | ||
| 486 | size = NPAGE_TO_SIZE(npage); | ||
| 487 | |||
| 488 | pdma = dma; | ||
| 489 | } | ||
| 490 | } | ||
| 491 | |||
| 492 | /* Check if we abut a region above - nothing above ~0 + 1 */ | ||
| 493 | if (iova + size) { | ||
| 494 | dma = vfio_find_dma(iommu, iova + size, 1); | ||
| 495 | if (dma && dma->prot == prot && | ||
| 496 | dma->vaddr == vaddr + size) { | ||
| 497 | |||
| 498 | dma->npage += npage; | ||
| 499 | dma->iova = iova; | ||
| 500 | dma->vaddr = vaddr; | ||
| 501 | |||
| 502 | /* | ||
| 503 | * If merged above and below, remove previously | ||
| 504 | * merged entry. New entry covers it. | ||
| 505 | */ | ||
| 506 | if (pdma) { | ||
| 507 | list_del(&pdma->next); | ||
| 508 | kfree(pdma); | ||
| 509 | } | ||
| 510 | pdma = dma; | ||
| 511 | } | ||
| 512 | } | ||
| 513 | |||
| 514 | /* Isolated, new region */ | ||
| 515 | if (!pdma) { | ||
| 516 | dma = kzalloc(sizeof *dma, GFP_KERNEL); | ||
| 517 | if (!dma) { | ||
| 518 | ret = -ENOMEM; | ||
| 519 | vfio_dma_unmap(iommu, iova, npage, prot); | ||
| 520 | goto out_lock; | ||
| 521 | } | ||
| 522 | |||
| 523 | dma->npage = npage; | ||
| 524 | dma->iova = iova; | ||
| 525 | dma->vaddr = vaddr; | ||
| 526 | dma->prot = prot; | ||
| 527 | list_add(&dma->next, &iommu->dma_list); | ||
| 528 | } | ||
| 529 | |||
| 530 | out_lock: | ||
| 531 | mutex_unlock(&iommu->lock); | ||
| 532 | return ret; | ||
| 533 | } | ||
| 534 | |||
| 535 | static int vfio_iommu_type1_attach_group(void *iommu_data, | ||
| 536 | struct iommu_group *iommu_group) | ||
| 537 | { | ||
| 538 | struct vfio_iommu *iommu = iommu_data; | ||
| 539 | struct vfio_group *group, *tmp; | ||
| 540 | int ret; | ||
| 541 | |||
| 542 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
| 543 | if (!group) | ||
| 544 | return -ENOMEM; | ||
| 545 | |||
| 546 | mutex_lock(&iommu->lock); | ||
| 547 | |||
| 548 | list_for_each_entry(tmp, &iommu->group_list, next) { | ||
| 549 | if (tmp->iommu_group == iommu_group) { | ||
| 550 | mutex_unlock(&iommu->lock); | ||
| 551 | kfree(group); | ||
| 552 | return -EINVAL; | ||
| 553 | } | ||
| 554 | } | ||
| 555 | |||
| 556 | /* | ||
| 557 | * TODO: Domains have capabilities that might change as we add | ||
| 558 | * groups (see iommu->cache, currently never set). Check for | ||
| 559 | * them and potentially disallow groups to be attached when it | ||
| 560 | * would change capabilities (ugh). | ||
| 561 | */ | ||
| 562 | ret = iommu_attach_group(iommu->domain, iommu_group); | ||
| 563 | if (ret) { | ||
| 564 | mutex_unlock(&iommu->lock); | ||
| 565 | kfree(group); | ||
| 566 | return ret; | ||
| 567 | } | ||
| 568 | |||
| 569 | group->iommu_group = iommu_group; | ||
| 570 | list_add(&group->next, &iommu->group_list); | ||
| 571 | |||
| 572 | mutex_unlock(&iommu->lock); | ||
| 573 | |||
| 574 | return 0; | ||
| 575 | } | ||
| 576 | |||
| 577 | static void vfio_iommu_type1_detach_group(void *iommu_data, | ||
| 578 | struct iommu_group *iommu_group) | ||
| 579 | { | ||
| 580 | struct vfio_iommu *iommu = iommu_data; | ||
| 581 | struct vfio_group *group; | ||
| 582 | |||
| 583 | mutex_lock(&iommu->lock); | ||
| 584 | |||
| 585 | list_for_each_entry(group, &iommu->group_list, next) { | ||
| 586 | if (group->iommu_group == iommu_group) { | ||
| 587 | iommu_detach_group(iommu->domain, iommu_group); | ||
| 588 | list_del(&group->next); | ||
| 589 | kfree(group); | ||
| 590 | break; | ||
| 591 | } | ||
| 592 | } | ||
| 593 | |||
| 594 | mutex_unlock(&iommu->lock); | ||
| 595 | } | ||
| 596 | |||
| 597 | static void *vfio_iommu_type1_open(unsigned long arg) | ||
| 598 | { | ||
| 599 | struct vfio_iommu *iommu; | ||
| 600 | |||
| 601 | if (arg != VFIO_TYPE1_IOMMU) | ||
| 602 | return ERR_PTR(-EINVAL); | ||
| 603 | |||
| 604 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); | ||
| 605 | if (!iommu) | ||
| 606 | return ERR_PTR(-ENOMEM); | ||
| 607 | |||
| 608 | INIT_LIST_HEAD(&iommu->group_list); | ||
| 609 | INIT_LIST_HEAD(&iommu->dma_list); | ||
| 610 | mutex_init(&iommu->lock); | ||
| 611 | |||
| 612 | /* | ||
| 613 | * Wish we didn't have to know about bus_type here. | ||
| 614 | */ | ||
| 615 | iommu->domain = iommu_domain_alloc(&pci_bus_type); | ||
| 616 | if (!iommu->domain) { | ||
| 617 | kfree(iommu); | ||
| 618 | return ERR_PTR(-EIO); | ||
| 619 | } | ||
| 620 | |||
| 621 | /* | ||
| 622 | * Wish we could specify required capabilities rather than create | ||
| 623 | * a domain, see what comes out and hope it doesn't change along | ||
| 624 | * the way. Fortunately we know interrupt remapping is global for | ||
| 625 | * our iommus. | ||
| 626 | */ | ||
| 627 | if (!allow_unsafe_interrupts && | ||
| 628 | !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { | ||
| 629 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | ||
| 630 | __func__); | ||
| 631 | iommu_domain_free(iommu->domain); | ||
| 632 | kfree(iommu); | ||
| 633 | return ERR_PTR(-EPERM); | ||
| 634 | } | ||
| 635 | |||
| 636 | return iommu; | ||
| 637 | } | ||
| 638 | |||
| 639 | static void vfio_iommu_type1_release(void *iommu_data) | ||
| 640 | { | ||
| 641 | struct vfio_iommu *iommu = iommu_data; | ||
| 642 | struct vfio_group *group, *group_tmp; | ||
| 643 | struct vfio_dma *dma, *dma_tmp; | ||
| 644 | |||
| 645 | list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { | ||
| 646 | iommu_detach_group(iommu->domain, group->iommu_group); | ||
| 647 | list_del(&group->next); | ||
| 648 | kfree(group); | ||
| 649 | } | ||
| 650 | |||
| 651 | list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) { | ||
| 652 | vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); | ||
| 653 | list_del(&dma->next); | ||
| 654 | kfree(dma); | ||
| 655 | } | ||
| 656 | |||
| 657 | iommu_domain_free(iommu->domain); | ||
| 658 | iommu->domain = NULL; | ||
| 659 | kfree(iommu); | ||
| 660 | } | ||
| 661 | |||
| 662 | static long vfio_iommu_type1_ioctl(void *iommu_data, | ||
| 663 | unsigned int cmd, unsigned long arg) | ||
| 664 | { | ||
| 665 | struct vfio_iommu *iommu = iommu_data; | ||
| 666 | unsigned long minsz; | ||
| 667 | |||
| 668 | if (cmd == VFIO_CHECK_EXTENSION) { | ||
| 669 | switch (arg) { | ||
| 670 | case VFIO_TYPE1_IOMMU: | ||
| 671 | return 1; | ||
| 672 | default: | ||
| 673 | return 0; | ||
| 674 | } | ||
| 675 | } else if (cmd == VFIO_IOMMU_GET_INFO) { | ||
| 676 | struct vfio_iommu_type1_info info; | ||
| 677 | |||
| 678 | minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); | ||
| 679 | |||
| 680 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
| 681 | return -EFAULT; | ||
| 682 | |||
| 683 | if (info.argsz < minsz) | ||
| 684 | return -EINVAL; | ||
| 685 | |||
| 686 | info.flags = 0; | ||
| 687 | |||
| 688 | info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; | ||
| 689 | |||
| 690 | return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; | ||
| 691 | |||
| 692 | } else if (cmd == VFIO_IOMMU_MAP_DMA) { | ||
| 693 | struct vfio_iommu_type1_dma_map map; | ||
| 694 | uint32_t mask = VFIO_DMA_MAP_FLAG_READ | | ||
| 695 | VFIO_DMA_MAP_FLAG_WRITE; | ||
| 696 | |||
| 697 | minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); | ||
| 698 | |||
| 699 | if (copy_from_user(&map, (void __user *)arg, minsz)) | ||
| 700 | return -EFAULT; | ||
| 701 | |||
| 702 | if (map.argsz < minsz || map.flags & ~mask) | ||
| 703 | return -EINVAL; | ||
| 704 | |||
| 705 | return vfio_dma_do_map(iommu, &map); | ||
| 706 | |||
| 707 | } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { | ||
| 708 | struct vfio_iommu_type1_dma_unmap unmap; | ||
| 709 | |||
| 710 | minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); | ||
| 711 | |||
| 712 | if (copy_from_user(&unmap, (void __user *)arg, minsz)) | ||
| 713 | return -EFAULT; | ||
| 714 | |||
| 715 | if (unmap.argsz < minsz || unmap.flags) | ||
| 716 | return -EINVAL; | ||
| 717 | |||
| 718 | return vfio_dma_do_unmap(iommu, &unmap); | ||
| 719 | } | ||
| 720 | |||
| 721 | return -ENOTTY; | ||
| 722 | } | ||
| 723 | |||
| 724 | static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { | ||
| 725 | .name = "vfio-iommu-type1", | ||
| 726 | .owner = THIS_MODULE, | ||
| 727 | .open = vfio_iommu_type1_open, | ||
| 728 | .release = vfio_iommu_type1_release, | ||
| 729 | .ioctl = vfio_iommu_type1_ioctl, | ||
| 730 | .attach_group = vfio_iommu_type1_attach_group, | ||
| 731 | .detach_group = vfio_iommu_type1_detach_group, | ||
| 732 | }; | ||
| 733 | |||
| 734 | static int __init vfio_iommu_type1_init(void) | ||
| 735 | { | ||
| 736 | if (!iommu_present(&pci_bus_type)) | ||
| 737 | return -ENODEV; | ||
| 738 | |||
| 739 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); | ||
| 740 | } | ||
| 741 | |||
| 742 | static void __exit vfio_iommu_type1_cleanup(void) | ||
| 743 | { | ||
| 744 | vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); | ||
| 745 | } | ||
| 746 | |||
| 747 | module_init(vfio_iommu_type1_init); | ||
| 748 | module_exit(vfio_iommu_type1_cleanup); | ||
| 749 | |||
| 750 | MODULE_VERSION(DRIVER_VERSION); | ||
| 751 | MODULE_LICENSE("GPL v2"); | ||
| 752 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
| 753 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
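With the Type1 model selected, the container fd accepts VFIO_IOMMU_GET_INFO, VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA, as handled by vfio_iommu_type1_ioctl() above. The sketch below maps a page-aligned anonymous buffer at IOVA 0 and unmaps it again; the 1 MB size is an arbitrary example, designated initializers avoid depending on the exact field order of the vfio_iommu_type1_* structures, and error handling is minimal.

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/vfio.h>

	static int dma_map_example(int container)
	{
		struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
		struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap) };
		size_t size = 1024 * 1024;	/* arbitrary example size */
		void *buf;

		buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return -1;

		map.vaddr = (unsigned long)buf;	/* process virtual address */
		map.iova  = 0;			/* device (IOVA) address */
		map.size  = size;
		map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

		/* Pins the pages; accounted against RLIMIT_MEMLOCK as shown above */
		if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
			return -1;

		/* ... let the device DMA to and from IOVA 0..size here ... */

		unmap.iova = 0;
		unmap.size = size;
		return ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
	}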
diff --git a/include/linux/vfio.h b/include/linux/vfio.h new file mode 100644 index 000000000000..0a4f180a11d8 --- /dev/null +++ b/include/linux/vfio.h | |||
| @@ -0,0 +1,445 @@ | |||
| 1 | /* | ||
| 2 | * VFIO API definition | ||
| 3 | * | ||
| 4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
| 5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | */ | ||
| 11 | #ifndef VFIO_H | ||
| 12 | #define VFIO_H | ||
| 13 | |||
| 14 | #include <linux/types.h> | ||
| 15 | #include <linux/ioctl.h> | ||
| 16 | |||
| 17 | #define VFIO_API_VERSION 0 | ||
| 18 | |||
| 19 | #ifdef __KERNEL__ /* Internal VFIO-core/bus driver API */ | ||
| 20 | |||
| 21 | #include <linux/iommu.h> | ||
| 22 | #include <linux/mm.h> | ||
| 23 | |||
| 24 | /** | ||
| 25 | * struct vfio_device_ops - VFIO bus driver device callbacks | ||
| 26 | * | ||
| 27 | * @open: Called when userspace creates new file descriptor for device | ||
| 28 | * @release: Called when userspace releases file descriptor for device | ||
| 29 | * @read: Perform read(2) on device file descriptor | ||
| 30 | * @write: Perform write(2) on device file descriptor | ||
| 31 | * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* | ||
| 32 | * operations documented below | ||
| 33 | * @mmap: Perform mmap(2) on a region of the device file descriptor | ||
| 34 | */ | ||
| 35 | struct vfio_device_ops { | ||
| 36 | char *name; | ||
| 37 | int (*open)(void *device_data); | ||
| 38 | void (*release)(void *device_data); | ||
| 39 | ssize_t (*read)(void *device_data, char __user *buf, | ||
| 40 | size_t count, loff_t *ppos); | ||
| 41 | ssize_t (*write)(void *device_data, const char __user *buf, | ||
| 42 | size_t count, loff_t *size); | ||
| 43 | long (*ioctl)(void *device_data, unsigned int cmd, | ||
| 44 | unsigned long arg); | ||
| 45 | int (*mmap)(void *device_data, struct vm_area_struct *vma); | ||
| 46 | }; | ||
| 47 | |||
| 48 | extern int vfio_add_group_dev(struct device *dev, | ||
| 49 | const struct vfio_device_ops *ops, | ||
| 50 | void *device_data); | ||
| 51 | |||
| 52 | extern void *vfio_del_group_dev(struct device *dev); | ||
| 53 | |||
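A VFIO bus driver (vfio-pci, added earlier in this series, is the first user) wires these two calls into its probe and remove paths: each exposed device is registered together with the driver's vfio_device_ops, and unregistration hands back the device_data pointer so the driver can free it. The skeleton below is only an illustration; the my_* names are invented and a real driver would also implement read/write/ioctl/mmap for the VFIO_DEVICE_* ABI.

	#include <linux/device.h>
	#include <linux/slab.h>
	#include <linux/vfio.h>

	struct my_state { struct device *dev; };	/* driver-private data */

	static int my_open(void *device_data)     { return 0; }
	static void my_release(void *device_data) { }

	static const struct vfio_device_ops my_vfio_ops = {
		.name    = "my-vfio-driver",	/* hypothetical */
		.open    = my_open,
		.release = my_release,
	};

	static int my_probe(struct device *dev)
	{
		struct my_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

		if (!state)
			return -ENOMEM;
		state->dev = dev;

		/* The device must already belong to an iommu_group. */
		return vfio_add_group_dev(dev, &my_vfio_ops, state);
	}

	static void my_remove(struct device *dev)
	{
		/* Waits until all device fds are released, then returns the
		 * device_data registered above so it can be freed. */
		kfree(vfio_del_group_dev(dev));
	}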
| 54 | /** | ||
| 55 | * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks | ||
| 56 | */ | ||
| 57 | struct vfio_iommu_driver_ops { | ||
| 58 | char *name; | ||
| 59 | struct module *owner; | ||
| 60 | void *(*open)(unsigned long arg); | ||
| 61 | void (*release)(void *iommu_data); | ||
| 62 | ssize_t (*read)(void *iommu_data, char __user *buf, | ||
| 63 | size_t count, loff_t *ppos); | ||
| 64 | ssize_t (*write)(void *iommu_data, const char __user *buf, | ||
| 65 | size_t count, loff_t *size); | ||
| 66 | long (*ioctl)(void *iommu_data, unsigned int cmd, | ||
| 67 | unsigned long arg); | ||
| 68 | int (*mmap)(void *iommu_data, struct vm_area_struct *vma); | ||
| 69 | int (*attach_group)(void *iommu_data, | ||
| 70 | struct iommu_group *group); | ||
| 71 | void (*detach_group)(void *iommu_data, | ||
| 72 | struct iommu_group *group); | ||
| 73 | |||
| 74 | }; | ||
| 75 | |||
| 76 | extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); | ||
| 77 | |||
| 78 | extern void vfio_unregister_iommu_driver( | ||
| 79 | const struct vfio_iommu_driver_ops *ops); | ||
| 80 | |||
| 81 | /** | ||
| 82 | * offsetofend(TYPE, MEMBER) | ||
| 83 | * | ||
| 84 | * @TYPE: The type of the structure | ||
| 85 | * @MEMBER: The member within the structure to get the end offset of | ||
| 86 | * | ||
| 87 | * Simple helper macro for dealing with variable sized structures passed | ||
| 88 | * from user space. This allows us to easily determine if the provided | ||
| 89 | * structure is sized to include various fields. | ||
| 90 | */ | ||
| 91 | #define offsetofend(TYPE, MEMBER) ({ \ | ||
| 92 | TYPE tmp; \ | ||
| 93 | offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \ | ||
| 94 | |||
| 95 | #endif /* __KERNEL__ */ | ||
| 96 | |||
| 97 | /* Kernel & User level defines for VFIO IOCTLs. */ | ||
| 98 | |||
| 99 | /* Extensions */ | ||
| 100 | |||
| 101 | #define VFIO_TYPE1_IOMMU 1 | ||
| 102 | |||
| 103 | /* | ||
| 104 | * The IOCTL interface is designed for extensibility by embedding the | ||
| 105 | * structure length (argsz) and flags into structures passed between | ||
| 106 | * kernel and userspace. We therefore use the _IO() macro for these | ||
| 107 | * defines to avoid implicitly embedding a size into the ioctl request. | ||
| 108 | * As structure fields are added, argsz will increase to match and flag | ||
| 109 | * bits will be defined to indicate additional fields with valid data. | ||
| 110 | * It's *always* the caller's responsibility to indicate the size of | ||
| 111 | * the structure passed by setting argsz appropriately. | ||
| 112 | */ | ||
| 113 | |||
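From the caller's side the convention is simply to set argsz to the size of the structure it was compiled against; the kernel compares that against its own minimum (the minsz checks seen in vfio.c and the Type1 driver above) and uses only the fields it understands. A short sketch, using the group status ioctl defined below:

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int group_is_viable(int group_fd)
	{
		struct vfio_group_status status = {
			.argsz = sizeof(status),	/* caller states how much it provides */
		};

		/* argsz smaller than the kernel's minsz yields -EINVAL; a larger
		 * structure from newer userspace is accepted, known fields only. */
		if (ioctl(group_fd, VFIO_GROUP_GET_STATUS, &status))
			return 0;

		return !!(status.flags & VFIO_GROUP_FLAGS_VIABLE);
	}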
| 114 | #define VFIO_TYPE (';') | ||
| 115 | #define VFIO_BASE 100 | ||
| 116 | |||
| 117 | /* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */ | ||
| 118 | |||
| 119 | /** | ||
| 120 | * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0) | ||
| 121 | * | ||
| 122 | * Report the version of the VFIO API. This allows us to bump the entire | ||
| 123 | * API version should we later need to add or change features in incompatible | ||
| 124 | * ways. | ||
| 125 | * Return: VFIO_API_VERSION | ||
| 126 | * Availability: Always | ||
| 127 | */ | ||
| 128 | #define VFIO_GET_API_VERSION _IO(VFIO_TYPE, VFIO_BASE + 0) | ||
| 129 | |||
| 130 | /** | ||
| 131 | * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32) | ||
| 132 | * | ||
| 133 | * Check whether an extension is supported. | ||
| 134 | * Return: 0 if not supported, 1 (or some other positive integer) if supported. | ||
| 135 | * Availability: Always | ||
| 136 | */ | ||
| 137 | #define VFIO_CHECK_EXTENSION _IO(VFIO_TYPE, VFIO_BASE + 1) | ||
| 138 | |||
| 139 | /** | ||
| 140 | * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32) | ||
| 141 | * | ||
| 142 | * Set the iommu to the given type. The type must be supported by an | ||
| 143 | * iommu driver as verified by calling CHECK_EXTENSION using the same | ||
| 144 | * type. A group must be set to this file descriptor before this | ||
| 145 | * ioctl is available. The IOMMU interfaces enabled by this call are | ||
| 146 | * specific to the value set. | ||
| 147 | * Return: 0 on success, -errno on failure | ||
| 148 | * Availability: When VFIO group attached | ||
| 149 | */ | ||
| 150 | #define VFIO_SET_IOMMU _IO(VFIO_TYPE, VFIO_BASE + 2) | ||
| 151 | |||
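Putting the three container-level ioctls together, a userspace driver's start-up might look like the following minimal sketch (error handling trimmed; note that VFIO_SET_IOMMU only becomes available once a group has been attached, as described above):

        #include <fcntl.h>
        #include <sys/ioctl.h>
        #include <linux/vfio.h>

        /* Sketch: open the container and verify the Type1 backend exists. */
        static int open_container(void)
        {
                int container = open("/dev/vfio/vfio", O_RDWR);

                if (container < 0)
                        return -1;

                /* This copy of vfio.h must match the running kernel's API. */
                if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
                        return -1;

                /* Is the Type1 IOMMU backend supported? */
                if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
                        return -1;

                /*
                 * VFIO_SET_IOMMU is deferred until a group has been attached
                 * with VFIO_GROUP_SET_CONTAINER (see the group ioctls below).
                 */
                return container;
        }
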
| 152 | /* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */ | ||
| 153 | |||
| 154 | /** | ||
| 155 | * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3, | ||
| 156 | * struct vfio_group_status) | ||
| 157 | * | ||
| 158 | * Retrieve information about the group. Fills in provided | ||
| 159 | * struct vfio_group_status. Caller sets argsz. | ||
| 160 | * Return: 0 on success, -errno on failure. | ||
| 161 | * Availability: Always | ||
| 162 | */ | ||
| 163 | struct vfio_group_status { | ||
| 164 | __u32 argsz; | ||
| 165 | __u32 flags; | ||
| 166 | #define VFIO_GROUP_FLAGS_VIABLE (1 << 0) | ||
| 167 | #define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1) | ||
| 168 | }; | ||
| 169 | #define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3) | ||
| 170 | |||
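Continuing the argsz sketch shown earlier, a caller would normally refuse to proceed unless the group is viable, i.e. every device in the group is bound to a VFIO driver or to no driver at all:

        if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
                return -1;      /* some device in the group is claimed by another driver */
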
| 171 | /** | ||
| 172 | * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32) | ||
| 173 | * | ||
| 174 | * Set the container for the VFIO group to the open VFIO file | ||
| 175 | * descriptor provided. Groups may only belong to a single | ||
| 176 | * container. Containers may, at their discretion, support multiple | ||
| 177 | * groups. Only when a container is set are all of the interfaces | ||
| 178 | * of the VFIO file descriptor and the VFIO group file descriptor | ||
| 179 | * available to the user. | ||
| 180 | * Return: 0 on success, -errno on failure. | ||
| 181 | * Availability: Always | ||
| 182 | */ | ||
| 183 | #define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4) | ||
| 184 | |||
| 185 | /** | ||
| 186 | * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5) | ||
| 187 | * | ||
| 188 | * Remove the group from the attached container. This is the | ||
| 189 | * opposite of the SET_CONTAINER call and returns the group to | ||
| 190 | * an initial state. All device file descriptors must be released | ||
| 191 | * prior to calling this interface. When removing the last group | ||
| 192 | * from a container, the IOMMU will be disabled and all state lost, | ||
| 193 | * effectively also returning the VFIO file descriptor to an initial | ||
| 194 | * state. | ||
| 195 | * Return: 0 on success, -errno on failure. | ||
| 196 | * Availability: When attached to container | ||
| 197 | */ | ||
| 198 | #define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5) | ||
| 199 | |||
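Tying the group to the container from the earlier sketches might then look as follows; the group number 26 is only an example, a real application derives it from the device's iommu_group link in sysfs:

        int group_fd = open("/dev/vfio/26", O_RDWR);

        /* Attach the group to the container opened earlier. */
        if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container))
                return -1;

        /* Only now can the container's IOMMU backend be selected. */
        if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
                return -1;
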
| 200 | /** | ||
| 201 | * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char) | ||
| 202 | * | ||
| 203 | * Return a new file descriptor for the device object described by | ||
| 204 | * the provided string. The string should match a device listed in | ||
| 205 | * the devices subdirectory of the IOMMU group sysfs entry. The | ||
| 206 | * group containing the device must already be added to this context. | ||
| 207 | * Return: new file descriptor on success, -errno on failure. | ||
| 208 | * Availability: When attached to container | ||
| 209 | */ | ||
| 210 | #define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6) | ||
| 211 | |||
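Once attached, a device file descriptor is requested by name; a minimal sketch, where the PCI address "0000:06:0d.0" is just an example of a string found in the group's sysfs devices directory:

        int device_fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

        if (device_fd < 0)
                return -1;      /* not in this group, or group not attached to a container */
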
| 212 | /* --------------- IOCTLs for DEVICE file descriptors --------------- */ | ||
| 213 | |||
| 214 | /** | ||
| 215 | * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7, | ||
| 216 | * struct vfio_device_info) | ||
| 217 | * | ||
| 218 | * Retrieve information about the device. Fills in provided | ||
| 219 | * struct vfio_device_info. Caller sets argsz. | ||
| 220 | * Return: 0 on success, -errno on failure. | ||
| 221 | */ | ||
| 222 | struct vfio_device_info { | ||
| 223 | __u32 argsz; | ||
| 224 | __u32 flags; | ||
| 225 | #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ | ||
| 226 | #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ | ||
| 227 | __u32 num_regions; /* Max region index + 1 */ | ||
| 228 | __u32 num_irqs; /* Max IRQ index + 1 */ | ||
| 229 | }; | ||
| 230 | #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) | ||
| 231 | |||
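The returned counts bound the index spaces probed by the two GET_*_INFO ioctls that follow; a minimal sketch using the device fd from the previous step:

        struct vfio_device_info device_info = { .argsz = sizeof(device_info) };

        if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, &device_info))
                return -1;

        /*
         * device_info.num_regions and device_info.num_irqs now bound the
         * region and IRQ indexes that may be queried below.
         */
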
| 232 | /** | ||
| 233 | * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, | ||
| 234 | * struct vfio_region_info) | ||
| 235 | * | ||
| 236 | * Retrieve information about a device region. Caller provides | ||
| 237 | * struct vfio_region_info with index value set. Caller sets argsz. | ||
| 238 | * Implementation of region mapping is bus driver specific. This is | ||
| 239 | * intended to describe MMIO, I/O port, as well as bus specific | ||
| 240 | * regions (ex. PCI config space). Zero sized regions may be used | ||
| 241 | * to describe unimplemented regions (ex. unimplemented PCI BARs). | ||
| 242 | * Return: 0 on success, -errno on failure. | ||
| 243 | */ | ||
| 244 | struct vfio_region_info { | ||
| 245 | __u32 argsz; | ||
| 246 | __u32 flags; | ||
| 247 | #define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ | ||
| 248 | #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ | ||
| 249 | #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ | ||
| 250 | __u32 index; /* Region index */ | ||
| 251 | __u32 resv; /* Reserved for alignment */ | ||
| 252 | __u64 size; /* Region size (bytes) */ | ||
| 253 | __u64 offset; /* Region offset from start of device fd */ | ||
| 254 | }; | ||
| 255 | #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) | ||
| 256 | |||
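For example, a userspace driver might walk the reported regions and map those that allow it straight into its address space; a minimal sketch, assuming <sys/mman.h> and the device_fd/device_info from the sketches above:

        struct vfio_region_info reg = { .argsz = sizeof(reg) };

        for (reg.index = 0; reg.index < device_info.num_regions; reg.index++) {
                if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg))
                        continue;

                if (!reg.size)
                        continue;       /* unimplemented region, e.g. an unused BAR */

                if (reg.flags & VFIO_REGION_INFO_FLAG_MMAP) {
                        void *map = mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
                                         MAP_SHARED, device_fd, reg.offset);
                        /* on success, 'map' gives direct access to the region */
                }
        }
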
| 257 | /** | ||
| 258 | * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, | ||
| 259 | * struct vfio_irq_info) | ||
| 260 | * | ||
| 261 | * Retrieve information about a device IRQ. Caller provides | ||
| 262 | * struct vfio_irq_info with index value set. Caller sets argsz. | ||
| 263 | * Implementation of IRQ mapping is bus driver specific. Indexes | ||
| 264 | * using multiple IRQs are primarily intended to support MSI-like | ||
| 265 | * interrupt blocks. Zero count irq blocks may be used to describe | ||
| 266 | * unimplemented interrupt types. | ||
| 267 | * | ||
| 268 | * The EVENTFD flag indicates the interrupt index supports eventfd based | ||
| 269 | * signaling. | ||
| 270 | * | ||
| 271 | * The MASKABLE flag indicates the index supports MASK and UNMASK | ||
| 272 | * actions described below. | ||
| 273 | * | ||
| 274 | * AUTOMASKED indicates that after signaling, the interrupt line is | ||
| 275 | * automatically masked by VFIO and the user needs to unmask the line | ||
| 276 | * to receive new interrupts. This is primarily intended to distinguish | ||
| 277 | * level triggered interrupts. | ||
| 278 | * | ||
| 279 | * The NORESIZE flag indicates that the interrupt lines within the index | ||
| 280 | * are set up as a set and new subindexes cannot be enabled without first | ||
| 281 | * disabling the entire index. This is used for interrupts like PCI MSI | ||
| 282 | * and MSI-X where the driver may only use a subset of the available | ||
| 283 | * indexes, but VFIO needs to enable a specific number of vectors | ||
| 284 | * upfront. In the case of MSI-X, where the user can enable MSI-X and | ||
| 285 | * then add and unmask vectors, it's up to userspace to decide whether | ||
| 286 | * to allocate the maximum supported number of vectors or to tear down | ||
| 287 | * the setup and incrementally increase the vector count as each is enabled. | ||
| 288 | */ | ||
| 289 | struct vfio_irq_info { | ||
| 290 | __u32 argsz; | ||
| 291 | __u32 flags; | ||
| 292 | #define VFIO_IRQ_INFO_EVENTFD (1 << 0) | ||
| 293 | #define VFIO_IRQ_INFO_MASKABLE (1 << 1) | ||
| 294 | #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) | ||
| 295 | #define VFIO_IRQ_INFO_NORESIZE (1 << 3) | ||
| 296 | __u32 index; /* IRQ index */ | ||
| 297 | __u32 count; /* Number of IRQs within this index */ | ||
| 298 | }; | ||
| 299 | #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) | ||
| 300 | |||
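The corresponding interrupt query might look like this sketch (index 0 is used purely as an example; the fixed vfio-pci index assignments appear further below):

        struct vfio_irq_info irq = { .argsz = sizeof(irq), .index = 0 };

        if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq))
                return -1;

        if (!irq.count)
                return 0;       /* this interrupt type is unimplemented */

        /*
         * With VFIO_IRQ_INFO_EVENTFD set in irq.flags, the index can be
         * routed to eventfds via VFIO_DEVICE_SET_IRQS, shown below.
         */
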
| 301 | /** | ||
| 302 | * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) | ||
| 303 | * | ||
| 304 | * Set signaling, masking, and unmasking of interrupts. Caller provides | ||
| 305 | * struct vfio_irq_set with all fields set. 'start' and 'count' indicate | ||
| 306 | * the range of subindexes being specified. | ||
| 307 | * | ||
| 308 | * The DATA flags specify the type of data provided. If DATA_NONE, the | ||
| 309 | * operation performs the specified action immediately on the specified | ||
| 310 | * interrupt(s). For example, to unmask AUTOMASKED interrupt [0,0]: | ||
| 311 | * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1. | ||
| 312 | * | ||
| 313 | * DATA_BOOL allows the same actions to be applied sparsely across arrays of interrupts. | ||
| 314 | * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]): | ||
| 315 | * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3, | ||
| 316 | * data = {1,0,1} | ||
| 317 | * | ||
| 318 | * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd. | ||
| 319 | * A value of -1 can be used to either de-assign interrupts if already | ||
| 320 | * assigned or skip un-assigned interrupts. For example, to set an eventfd | ||
| 321 | * to be triggered for interrupts [0,0] and [0,2]: | ||
| 322 | * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3, | ||
| 323 | * data = {fd1, -1, fd2} | ||
| 324 | * If index [0,1] was previously set, two count = 1 ioctl calls would be | ||
| 325 | * required to set [0,0] and [0,2] without changing [0,1]. | ||
| 326 | * | ||
| 327 | * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used | ||
| 328 | * with ACTION_TRIGGER to perform kernel level interrupt loopback testing | ||
| 329 | * from userspace (i.e. simulate hardware triggering). | ||
| 330 | * | ||
| 331 | * Setting an event triggering mechanism toward userspace with ACTION_TRIGGER | ||
| 332 | * enables the interrupt index for the device. Individual subindex interrupts | ||
| 333 | * can be disabled using the -1 value for DATA_EVENTFD or the index can be | ||
| 334 | * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0. | ||
| 335 | * | ||
| 336 | * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while | ||
| 337 | * ACTION_TRIGGER specifies kernel->user signaling. | ||
| 338 | */ | ||
| 339 | struct vfio_irq_set { | ||
| 340 | __u32 argsz; | ||
| 341 | __u32 flags; | ||
| 342 | #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */ | ||
| 343 | #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */ | ||
| 344 | #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */ | ||
| 345 | #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */ | ||
| 346 | #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */ | ||
| 347 | #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */ | ||
| 348 | __u32 index; | ||
| 349 | __u32 start; | ||
| 350 | __u32 count; | ||
| 351 | __u8 data[]; | ||
| 352 | }; | ||
| 353 | #define VFIO_DEVICE_SET_IRQS _IO(VFIO_TYPE, VFIO_BASE + 10) | ||
| 354 | |||
| 355 | #define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \ | ||
| 356 | VFIO_IRQ_SET_DATA_BOOL | \ | ||
| 357 | VFIO_IRQ_SET_DATA_EVENTFD) | ||
| 358 | #define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \ | ||
| 359 | VFIO_IRQ_SET_ACTION_UNMASK | \ | ||
| 360 | VFIO_IRQ_SET_ACTION_TRIGGER) | ||
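
Because the eventfds travel in the variable-length data[] array, the argument is assembled in a buffer sized for the fixed header plus the payload. A minimal sketch wiring one eventfd to subindex 0 of an index (assuming <sys/eventfd.h>, <stdlib.h>, <string.h>, <stdio.h> and the device fd from the earlier sketches; index 0 is only an example):

        struct vfio_irq_set *set;
        size_t sz = sizeof(*set) + sizeof(__s32);       /* header + one eventfd */
        int efd = eventfd(0, EFD_CLOEXEC);

        set = calloc(1, sz);
        set->argsz = sz;
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = 0;
        set->start = 0;
        set->count = 1;
        memcpy(set->data, &efd, sizeof(__s32));

        if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set))
                perror("VFIO_DEVICE_SET_IRQS");

        free(set);

Once enabled this way, each interrupt shows up to userspace as a counter increment readable from efd.
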
| 361 | /** | ||
| 362 | * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11) | ||
| 363 | * | ||
| 364 | * Reset a device. | ||
| 365 | */ | ||
| 366 | #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) | ||
| 367 | |||
| 368 | /* | ||
| 369 | * The VFIO-PCI bus driver makes use of the following fixed region and | ||
| 370 | * IRQ index mapping. Unimplemented regions return a size of zero. | ||
| 371 | * Unimplemented IRQ types return a count of zero. | ||
| 372 | */ | ||
| 373 | |||
| 374 | enum { | ||
| 375 | VFIO_PCI_BAR0_REGION_INDEX, | ||
| 376 | VFIO_PCI_BAR1_REGION_INDEX, | ||
| 377 | VFIO_PCI_BAR2_REGION_INDEX, | ||
| 378 | VFIO_PCI_BAR3_REGION_INDEX, | ||
| 379 | VFIO_PCI_BAR4_REGION_INDEX, | ||
| 380 | VFIO_PCI_BAR5_REGION_INDEX, | ||
| 381 | VFIO_PCI_ROM_REGION_INDEX, | ||
| 382 | VFIO_PCI_CONFIG_REGION_INDEX, | ||
| 383 | VFIO_PCI_NUM_REGIONS | ||
| 384 | }; | ||
| 385 | |||
| 386 | enum { | ||
| 387 | VFIO_PCI_INTX_IRQ_INDEX, | ||
| 388 | VFIO_PCI_MSI_IRQ_INDEX, | ||
| 389 | VFIO_PCI_MSIX_IRQ_INDEX, | ||
| 390 | VFIO_PCI_NUM_IRQS | ||
| 391 | }; | ||
| 392 | |||
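Given these fixed indexes, PCI config space is reachable with plain pread()/pwrite() on the device fd at the offset reported for the config region; a minimal sketch, assuming <unistd.h> and the device fd from above:

        struct vfio_region_info cfg = {
                .argsz = sizeof(cfg),
                .index = VFIO_PCI_CONFIG_REGION_INDEX,
        };
        __u16 vendor;

        if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &cfg))
                return -1;

        /* Read the 16-bit vendor ID at config space offset 0. */
        if (pread(device_fd, &vendor, sizeof(vendor), cfg.offset) != sizeof(vendor))
                return -1;
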
| 393 | /* -------- API for Type1 VFIO IOMMU -------- */ | ||
| 394 | |||
| 395 | /** | ||
| 396 | * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_type1_info) | ||
| 397 | * | ||
| 398 | * Retrieve information about the IOMMU object. Fills in provided | ||
| 399 | * struct vfio_iommu_type1_info. Caller sets argsz. | ||
| 400 | * | ||
| 401 | * XXX Should we do these by CHECK_EXTENSION too? | ||
| 402 | */ | ||
| 403 | struct vfio_iommu_type1_info { | ||
| 404 | __u32 argsz; | ||
| 405 | __u32 flags; | ||
| 406 | #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ | ||
| 407 | __u64 iova_pgsizes; /* Bitmap of supported page sizes */ | ||
| 408 | }; | ||
| 409 | |||
| 410 | #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) | ||
| 411 | |||
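A sketch of querying the backend once VFIO_SET_IOMMU has succeeded, using the container fd from the earlier sketches (printf assumes <stdio.h>):

        struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };

        if (ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info))
                return -1;

        if (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES)
                printf("supported IOMMU page sizes: 0x%llx\n",
                       (unsigned long long)iommu_info.iova_pgsizes);
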
| 412 | /** | ||
| 413 | * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_iommu_type1_dma_map) | ||
| 414 | * | ||
| 415 | * Map process virtual addresses to IO virtual addresses using the | ||
| 416 | * provided struct vfio_iommu_type1_dma_map. Caller sets argsz. READ and/or WRITE required. | ||
| 417 | */ | ||
| 418 | struct vfio_iommu_type1_dma_map { | ||
| 419 | __u32 argsz; | ||
| 420 | __u32 flags; | ||
| 421 | #define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ | ||
| 422 | #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ | ||
| 423 | __u64 vaddr; /* Process virtual address */ | ||
| 424 | __u64 iova; /* IO virtual address */ | ||
| 425 | __u64 size; /* Size of mapping (bytes) */ | ||
| 426 | }; | ||
| 427 | |||
| 428 | #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) | ||
| 429 | |||
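For example, to expose a page-aligned buffer to the device at IOVA 0, a userspace driver might do something like the following sketch (assuming <sys/mman.h> and the container fd from above; the 1 MB size and the IOVA are arbitrary examples):

        struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
        void *buf = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return -1;

        dma_map.vaddr = (__u64)(unsigned long)buf;
        dma_map.iova  = 0;
        dma_map.size  = 1024 * 1024;
        dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

        if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map))
                return -1;      /* e.g. overlap with an existing mapping */
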
| 430 | /** | ||
| 431 | * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_iommu_type1_dma_unmap) | ||
| 432 | * | ||
| 433 | * Unmap IO virtual addresses using the provided struct vfio_iommu_type1_dma_unmap. | ||
| 434 | * Caller sets argsz. | ||
| 435 | */ | ||
| 436 | struct vfio_iommu_type1_dma_unmap { | ||
| 437 | __u32 argsz; | ||
| 438 | __u32 flags; | ||
| 439 | __u64 iova; /* IO virtual address */ | ||
| 440 | __u64 size; /* Size of mapping (bytes) */ | ||
| 441 | }; | ||
| 442 | |||
| 443 | #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) | ||
| 444 | |||
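And the matching teardown for the mapping sketched above:

        struct vfio_iommu_type1_dma_unmap dma_unmap = {
                .argsz = sizeof(dma_unmap),
                .iova  = 0,
                .size  = 1024 * 1024,
        };

        if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap))
                return -1;
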
| 445 | #endif /* VFIO_H */ | ||
