 drivers/vfio/Kconfig                |    2 +
 drivers/vfio/pci/Kconfig            |    8 +
 drivers/vfio/pci/Makefile           |    4 +
 drivers/vfio/pci/vfio_pci.c         |  579 ++++++
 drivers/vfio/pci/vfio_pci_config.c  | 1540 +++++++++++++++
 drivers/vfio/pci/vfio_pci_intrs.c   |  740 ++++++++
 drivers/vfio/pci/vfio_pci_private.h |   91 +
 drivers/vfio/pci/vfio_pci_rdwr.c    |  269 +++
 include/linux/vfio.h                |   26 +
 9 files changed, 3259 insertions(+), 0 deletions(-)
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 128b97910b8e..7cd5dec0abd1 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -12,3 +12,5 @@ menuconfig VFIO
 	  See Documentation/vfio.txt for more details.
 
 	  If you don't know what to do here, say N.
+
+source "drivers/vfio/pci/Kconfig"
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 000000000000..5980758563eb
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,8 @@
+config VFIO_PCI
+	tristate "VFIO support for PCI devices"
+	depends on VFIO && PCI && EVENTFD
+	help
+	  Support for the PCI VFIO bus driver. This is required to make
+	  use of PCI drivers using the VFIO framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 000000000000..131079255fd9
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,4 @@
+
+vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+
+obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
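vfio-pci registers no static PCI ID table (vfio_pci_driver below sets id_table = NULL, matching only dynamic IDs), so userspace binds a device to the driver explicitly. A minimal sketch of that binding, assuming the standard PCI dynamic-ID sysfs interface; the 8086 10ca vendor/device pair is purely illustrative, and the device must be unbound from its host driver first:

#include <stdio.h>

/* Bind a PCI device to vfio-pci via the dynamic ID interface. */
int main(void)
{
	FILE *f = fopen("/sys/bus/pci/drivers/vfio-pci/new_id", "w");
	if (!f) {
		perror("new_id");
		return 1;
	}
	/* Illustrative IDs; substitute the real vendor/device pair. */
	fprintf(f, "8086 10ca\n");
	fclose(f);
	return 0;
}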
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_pci_private.h"
+
+#define DRIVER_VERSION	"0.2"
+#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC	"VFIO PCI - User Level meta-driver"
+
+static bool nointxmask;
+module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nointxmask,
+		 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
+
+static int vfio_pci_enable(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+	u16 cmd;
+	u8 msix_pos;
+
+	vdev->reset_works = (pci_reset_function(pdev) == 0);
+	pci_save_state(pdev);
+	vdev->pci_saved_state = pci_store_saved_state(pdev);
+	if (!vdev->pci_saved_state)
+		pr_debug("%s: Couldn't store %s saved state\n",
+			 __func__, dev_name(&pdev->dev));
+
+	ret = vfio_config_init(vdev);
+	if (ret)
+		goto out;
+
+	if (likely(!nointxmask))
+		vdev->pci_2_3 = pci_intx_mask_supported(pdev);
+
+	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
+		cmd &= ~PCI_COMMAND_INTX_DISABLE;
+		pci_write_config_word(pdev, PCI_COMMAND, cmd);
+	}
+
+	msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
+	if (msix_pos) {
+		u16 flags;
+		u32 table;
+
+		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
+		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
+
+		vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
+		vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
+		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
+	} else
+		vdev->msix_bar = 0xFF;
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		goto out;
+
+	return ret;
+
+out:
+	kfree(vdev->pci_saved_state);
+	vdev->pci_saved_state = NULL;
+	vfio_config_free(vdev);
+	return ret;
+}
+
+static void vfio_pci_disable(struct vfio_pci_device *vdev)
+{
+	int bar;
+
+	pci_disable_device(vdev->pdev);
+
+	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+				VFIO_IRQ_SET_ACTION_TRIGGER,
+				vdev->irq_type, 0, 0, NULL);
+
+	vdev->virq_disabled = false;
+
+	vfio_config_free(vdev);
+
+	pci_reset_function(vdev->pdev);
+
+	if (pci_load_and_free_saved_state(vdev->pdev,
+					  &vdev->pci_saved_state) == 0)
+		pci_restore_state(vdev->pdev);
+	else
+		pr_info("%s: Couldn't reload %s saved state\n",
+			__func__, dev_name(&vdev->pdev->dev));
+
+	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+		if (!vdev->barmap[bar])
+			continue;
+		pci_iounmap(vdev->pdev, vdev->barmap[bar]);
+		pci_release_selected_regions(vdev->pdev, 1 << bar);
+		vdev->barmap[bar] = NULL;
+	}
+}
+
+static void vfio_pci_release(void *device_data)
+{
+	struct vfio_pci_device *vdev = device_data;
+
+	if (atomic_dec_and_test(&vdev->refcnt))
+		vfio_pci_disable(vdev);
+
+	module_put(THIS_MODULE);
+}
+
+static int vfio_pci_open(void *device_data)
+{
+	struct vfio_pci_device *vdev = device_data;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	if (atomic_inc_return(&vdev->refcnt) == 1) {
+		int ret = vfio_pci_enable(vdev);
+		if (ret) {
+			module_put(THIS_MODULE);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
+{
+	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
+		u8 pin;
+		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
+		if (pin)
+			return 1;
+
+	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
+		u8 pos;
+		u16 flags;
+
+		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
+		if (pos) {
+			pci_read_config_word(vdev->pdev,
+					     pos + PCI_MSI_FLAGS, &flags);
+
+			return 1 << (flags & PCI_MSI_FLAGS_QMASK);
+		}
+	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
+		u8 pos;
+		u16 flags;
+
+		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
+		if (pos) {
+			pci_read_config_word(vdev->pdev,
+					     pos + PCI_MSIX_FLAGS, &flags);
+
+			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
+		}
+	}
+
+	return 0;
+}
+
+static long vfio_pci_ioctl(void *device_data,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct vfio_pci_device *vdev = device_data;
+	unsigned long minsz;
+
+	if (cmd == VFIO_DEVICE_GET_INFO) {
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.flags = VFIO_DEVICE_FLAGS_PCI;
+
+		if (vdev->reset_works)
+			info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+		info.num_regions = VFIO_PCI_NUM_REGIONS;
+		info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+
+	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+		struct pci_dev *pdev = vdev->pdev;
+		struct vfio_region_info info;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		switch (info.index) {
+		case VFIO_PCI_CONFIG_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = pdev->cfg_size;
+			info.flags = VFIO_REGION_INFO_FLAG_READ |
+				     VFIO_REGION_INFO_FLAG_WRITE;
+			break;
+		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = pci_resource_len(pdev, info.index);
+			if (!info.size) {
+				info.flags = 0;
+				break;
+			}
+
+			info.flags = VFIO_REGION_INFO_FLAG_READ |
+				     VFIO_REGION_INFO_FLAG_WRITE;
+			if (pci_resource_flags(pdev, info.index) &
+			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
+				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
+			break;
+		case VFIO_PCI_ROM_REGION_INDEX:
+		{
+			void __iomem *io;
+			size_t size;
+
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.flags = 0;
+
+			/* Report the BAR size, not the ROM size */
+			info.size = pci_resource_len(pdev, info.index);
+			if (!info.size)
+				break;
+
+			/* Is it really there? */
+			io = pci_map_rom(pdev, &size);
+			if (!io || !size) {
+				info.size = 0;
+				break;
+			}
+			pci_unmap_rom(pdev, io);
+
+			info.flags = VFIO_REGION_INFO_FLAG_READ;
+			break;
+		}
+		default:
+			return -EINVAL;
+		}
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+
+	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+			return -EINVAL;
+
+		info.flags = VFIO_IRQ_INFO_EVENTFD;
+
+		info.count = vfio_pci_get_irq_count(vdev, info.index);
+
+		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
+				       VFIO_IRQ_INFO_AUTOMASKED);
+		else
+			info.flags |= VFIO_IRQ_INFO_NORESIZE;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+
+	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
+		struct vfio_irq_set hdr;
+		u8 *data = NULL;
+		int ret = 0;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
+		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
+			return -EINVAL;
+
+		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+			size_t size;
+
+			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
+				size = sizeof(uint8_t);
+			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
+				size = sizeof(int32_t);
+			else
+				return -EINVAL;
+
+			if (hdr.argsz - minsz < hdr.count * size ||
+			    hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
+				return -EINVAL;
+
+			data = kmalloc(hdr.count * size, GFP_KERNEL);
+			if (!data)
+				return -ENOMEM;
+
+			if (copy_from_user(data, (void __user *)(arg + minsz),
+					   hdr.count * size)) {
+				kfree(data);
+				return -EFAULT;
+			}
+		}
+
+		mutex_lock(&vdev->igate);
+
+		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+					      hdr.start, hdr.count, data);
+
+		mutex_unlock(&vdev->igate);
+		kfree(data);
+
+		return ret;
+
+	} else if (cmd == VFIO_DEVICE_RESET)
+		return vdev->reset_works ?
+			pci_reset_function(vdev->pdev) : -EINVAL;
+
+	return -ENOTTY;
+}
+
+static ssize_t vfio_pci_read(void *device_data, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	struct vfio_pci_device *vdev = device_data;
+	struct pci_dev *pdev = vdev->pdev;
+
+	if (index >= VFIO_PCI_NUM_REGIONS)
+		return -EINVAL;
+
+	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+		return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
+	else if (index == VFIO_PCI_ROM_REGION_INDEX)
+		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
+	else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
+		return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
+	else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
+		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
+
+	return -EINVAL;
+}
+
+static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	struct vfio_pci_device *vdev = device_data;
+	struct pci_dev *pdev = vdev->pdev;
+
+	if (index >= VFIO_PCI_NUM_REGIONS)
+		return -EINVAL;
+
+	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+		return vfio_pci_config_readwrite(vdev, (char __user *)buf,
+						 count, ppos, true);
+	else if (index == VFIO_PCI_ROM_REGION_INDEX)
+		return -EINVAL;
+	else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
+		return vfio_pci_io_readwrite(vdev, (char __user *)buf,
+					     count, ppos, true);
+	else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
+		return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
+					      count, ppos, true);
+	}
+
+	return -EINVAL;
+}
+
+static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
+{
+	struct vfio_pci_device *vdev = device_data;
+	struct pci_dev *pdev = vdev->pdev;
+	unsigned int index;
+	u64 phys_len, req_len, pgoff, req_start, phys;
+	int ret;
+
+	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+	if (index >= VFIO_PCI_ROM_REGION_INDEX)
+		return -EINVAL;
+	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
+		return -EINVAL;
+
+	phys_len = pci_resource_len(pdev, index);
+	req_len = vma->vm_end - vma->vm_start;
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	req_start = pgoff << PAGE_SHIFT;
+
+	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
+		return -EINVAL;
+
+	if (index == vdev->msix_bar) {
+		/*
+		 * Disallow mmaps overlapping the MSI-X table; users don't
+		 * get to touch this directly.  We could find somewhere
+		 * else to map the overlap, but page granularity is only
+		 * a recommendation, not a requirement, so the user needs
+		 * to know which bits are real.  Requiring them to mmap
+		 * around the table makes that clear.
+		 */
+
+		/* If neither entirely above nor below, then it overlaps */
+		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
+		      req_start + req_len <= vdev->msix_offset))
+			return -EINVAL;
+	}
+
+	/*
+	 * Even though we don't make use of the barmap for the mmap,
+	 * we need to request the region and the barmap tracks that.
+	 */
+	if (!vdev->barmap[index]) {
+		ret = pci_request_selected_regions(pdev,
+						   1 << index, "vfio-pci");
+		if (ret)
+			return ret;
+
+		vdev->barmap[index] = pci_iomap(pdev, index, 0);
+	}
+
+	vma->vm_private_data = vdev;
+	vma->vm_flags |= (VM_IO | VM_RESERVED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+
+	return remap_pfn_range(vma, vma->vm_start, phys,
+			       req_len, vma->vm_page_prot);
+}
+
+static const struct vfio_device_ops vfio_pci_ops = {
+	.name		= "vfio-pci",
+	.open		= vfio_pci_open,
+	.release	= vfio_pci_release,
+	.ioctl		= vfio_pci_ioctl,
+	.read		= vfio_pci_read,
+	.write		= vfio_pci_write,
+	.mmap		= vfio_pci_mmap,
+};
+
+static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	u8 type;
+	struct vfio_pci_device *vdev;
+	struct iommu_group *group;
+	int ret;
+
+	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
+	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
+		return -EINVAL;
+
+	group = iommu_group_get(&pdev->dev);
+	if (!group)
+		return -EINVAL;
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev) {
+		iommu_group_put(group);
+		return -ENOMEM;
+	}
+
+	vdev->pdev = pdev;
+	vdev->irq_type = VFIO_PCI_NUM_IRQS;
+	mutex_init(&vdev->igate);
+	spin_lock_init(&vdev->irqlock);
+	atomic_set(&vdev->refcnt, 0);
+
+	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+	if (ret) {
+		iommu_group_put(group);
+		kfree(vdev);
+	}
+
+	return ret;
+}
+
+static void vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct vfio_pci_device *vdev;
+
+	vdev = vfio_del_group_dev(&pdev->dev);
+	if (!vdev)
+		return;
+
+	iommu_group_put(pdev->dev.iommu_group);
+	kfree(vdev);
+}
+
+static struct pci_driver vfio_pci_driver = {
+	.name		= "vfio-pci",
+	.id_table	= NULL, /* only dynamic ids */
+	.probe		= vfio_pci_probe,
+	.remove		= vfio_pci_remove,
+};
+
+static void __exit vfio_pci_cleanup(void)
+{
+	pci_unregister_driver(&vfio_pci_driver);
+	vfio_pci_virqfd_exit();
+	vfio_pci_uninit_perm_bits();
+}
+
+static int __init vfio_pci_init(void)
+{
+	int ret;
+
+	/* Allocate shared config space permission data used by all devices */
+	ret = vfio_pci_init_perm_bits();
+	if (ret)
+		return ret;
+
+	/* Start the virqfd cleanup handler */
+	ret = vfio_pci_virqfd_init();
+	if (ret)
+		goto out_virqfd;
+
+	/* Register and scan for devices */
+	ret = pci_register_driver(&vfio_pci_driver);
+	if (ret)
+		goto out_driver;
+
+	return 0;
+
+out_driver:
+	vfio_pci_virqfd_exit();
+out_virqfd:
+	vfio_pci_uninit_perm_bits();
+	return ret;
+}
+
+module_init(vfio_pci_init);
+module_exit(vfio_pci_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
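With the region layout above, VFIO_PCI_INDEX_TO_OFFSET() gives each region a fixed slot in the device file's offset space, so userspace discovers a region with VFIO_DEVICE_GET_REGION_INFO and then simply pread()s/pwrite()s (or mmap()s) at the reported offset. A sketch of reading the vendor and device IDs through the config region; it assumes a device fd already obtained via the VFIO group interface (e.g. VFIO_GROUP_GET_DEVICE_FD), which is outside this patch:

#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Dump the vendor/device ID of a vfio-pci device, given its fd. */
int dump_ids(int device_fd)
{
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	unsigned char id[4];

	reg.index = VFIO_PCI_CONFIG_REGION_INDEX;
	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg))
		return -1;

	/* Config reads funnel through vfio_pci_config_readwrite() above */
	if (pread(device_fd, id, sizeof(id), reg.offset) != sizeof(id))
		return -1;

	/* PCI config space is little-endian */
	printf("vendor %02x%02x device %02x%02x\n",
	       id[1], id[0], id[3], id[2]);
	return 0;
}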
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 000000000000..8b8f7d11e102
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1540 @@
+/*
+ * VFIO PCI config space virtualization
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+/*
+ * This code handles reading and writing of PCI configuration registers.
+ * This is hairy because we want to allow a lot of flexibility to the
+ * user driver, but cannot trust it with all of the config fields.
+ * Tables determine which fields can be read and written, as well as
+ * which fields are 'virtualized' - special actions and translations to
+ * make it appear to the user that he has control, when in fact things
+ * must be negotiated with the underlying OS.
+ */
+
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_pci_private.h"
+
+#define PCI_CFG_SPACE_SIZE	256
+
+/* Useful "pseudo" capabilities */
+#define PCI_CAP_ID_BASIC	0
+#define PCI_CAP_ID_INVALID	0xFF
+
+#define is_bar(offset)	\
+	((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \
+	 (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4))
+
+/*
+ * Lengths of PCI Config Capabilities
+ *   0: Removed from the user visible capability list
+ *   FF: Variable length
+ */
+static u8 pci_cap_length[] = {
+	[PCI_CAP_ID_BASIC]	= PCI_STD_HEADER_SIZEOF, /* pci config header */
+	[PCI_CAP_ID_PM]		= PCI_PM_SIZEOF,
+	[PCI_CAP_ID_AGP]	= PCI_AGP_SIZEOF,
+	[PCI_CAP_ID_VPD]	= PCI_CAP_VPD_SIZEOF,
+	[PCI_CAP_ID_SLOTID]	= 0,		/* bridge - don't care */
+	[PCI_CAP_ID_MSI]	= 0xFF,		/* 10, 14, 20, or 24 */
+	[PCI_CAP_ID_CHSWP]	= 0,		/* cpci - not yet */
+	[PCI_CAP_ID_PCIX]	= 0xFF,		/* 8 or 24 */
+	[PCI_CAP_ID_HT]		= 0xFF,		/* hypertransport */
+	[PCI_CAP_ID_VNDR]	= 0xFF,		/* variable */
+	[PCI_CAP_ID_DBG]	= 0,		/* debug - don't care */
+	[PCI_CAP_ID_CCRC]	= 0,		/* cpci - not yet */
+	[PCI_CAP_ID_SHPC]	= 0,		/* hotswap - not yet */
+	[PCI_CAP_ID_SSVID]	= 0,		/* bridge - don't care */
+	[PCI_CAP_ID_AGP3]	= 0,		/* AGP8x - not yet */
+	[PCI_CAP_ID_SECDEV]	= 0,		/* secure device not yet */
+	[PCI_CAP_ID_EXP]	= 0xFF,		/* 20 or 44 */
+	[PCI_CAP_ID_MSIX]	= PCI_CAP_MSIX_SIZEOF,
+	[PCI_CAP_ID_SATA]	= 0xFF,
+	[PCI_CAP_ID_AF]		= PCI_CAP_AF_SIZEOF,
+};
+
+/*
+ * Lengths of PCIe/PCI-X Extended Config Capabilities
+ *   0: Removed or masked from the user visible capability list
+ *   FF: Variable length
+ */
+static u16 pci_ext_cap_length[] = {
+	[PCI_EXT_CAP_ID_ERR]	= PCI_ERR_ROOT_COMMAND,
+	[PCI_EXT_CAP_ID_VC]	= 0xFF,
+	[PCI_EXT_CAP_ID_DSN]	= PCI_EXT_CAP_DSN_SIZEOF,
+	[PCI_EXT_CAP_ID_PWR]	= PCI_EXT_CAP_PWR_SIZEOF,
+	[PCI_EXT_CAP_ID_RCLD]	= 0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_RCILC]	= 0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_RCEC]	= 0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_MFVC]	= 0xFF,
+	[PCI_EXT_CAP_ID_VC9]	= 0xFF,	/* same as CAP_ID_VC */
+	[PCI_EXT_CAP_ID_RCRB]	= 0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_VNDR]	= 0xFF,
+	[PCI_EXT_CAP_ID_CAC]	= 0,	/* obsolete */
+	[PCI_EXT_CAP_ID_ACS]	= 0xFF,
+	[PCI_EXT_CAP_ID_ARI]	= PCI_EXT_CAP_ARI_SIZEOF,
+	[PCI_EXT_CAP_ID_ATS]	= PCI_EXT_CAP_ATS_SIZEOF,
+	[PCI_EXT_CAP_ID_SRIOV]	= PCI_EXT_CAP_SRIOV_SIZEOF,
+	[PCI_EXT_CAP_ID_MRIOV]	= 0,	/* not yet */
+	[PCI_EXT_CAP_ID_MCAST]	= PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
+	[PCI_EXT_CAP_ID_PRI]	= PCI_EXT_CAP_PRI_SIZEOF,
+	[PCI_EXT_CAP_ID_AMD_XXX] = 0,	/* not yet */
+	[PCI_EXT_CAP_ID_REBAR]	= 0xFF,
+	[PCI_EXT_CAP_ID_DPA]	= 0xFF,
+	[PCI_EXT_CAP_ID_TPH]	= 0xFF,
+	[PCI_EXT_CAP_ID_LTR]	= PCI_EXT_CAP_LTR_SIZEOF,
+	[PCI_EXT_CAP_ID_SECPCI]	= 0,	/* not yet */
+	[PCI_EXT_CAP_ID_PMUX]	= 0,	/* not yet */
+	[PCI_EXT_CAP_ID_PASID]	= 0,	/* not yet */
+};
+
+/*
+ * Read/Write Permission Bits - one bit for each bit in capability
+ * Any field can be read if it exists, but what is read depends on
+ * whether the field is 'virtualized', or just pass thru to the
+ * hardware.  Any virtualized field is also virtualized for writes.
+ * Writes are only permitted if they have a 1 bit here.
+ */
+struct perm_bits {
+	u8	*virt;		/* read/write virtual data, not hw */
+	u8	*write;		/* writeable bits */
+	int	(*readfn)(struct vfio_pci_device *vdev, int pos, int count,
+			  struct perm_bits *perm, int offset, __le32 *val);
+	int	(*writefn)(struct vfio_pci_device *vdev, int pos, int count,
+			   struct perm_bits *perm, int offset, __le32 val);
+};
+
+#define NO_VIRT		0
+#define ALL_VIRT	0xFFFFFFFFU
+#define NO_WRITE	0
+#define ALL_WRITE	0xFFFFFFFFU
+
+static int vfio_user_config_read(struct pci_dev *pdev, int offset,
+				 __le32 *val, int count)
+{
+	int ret = -EINVAL;
+	u32 tmp_val = 0;
+
+	switch (count) {
+	case 1:
+	{
+		u8 tmp;
+		ret = pci_user_read_config_byte(pdev, offset, &tmp);
+		tmp_val = tmp;
+		break;
+	}
+	case 2:
+	{
+		u16 tmp;
+		ret = pci_user_read_config_word(pdev, offset, &tmp);
+		tmp_val = tmp;
+		break;
+	}
+	case 4:
+		ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
+		break;
+	}
+
+	*val = cpu_to_le32(tmp_val);
+
+	return pcibios_err_to_errno(ret);
+}
+
+static int vfio_user_config_write(struct pci_dev *pdev, int offset,
+				  __le32 val, int count)
+{
+	int ret = -EINVAL;
+	u32 tmp_val = le32_to_cpu(val);
+
+	switch (count) {
+	case 1:
+		ret = pci_user_write_config_byte(pdev, offset, tmp_val);
+		break;
+	case 2:
+		ret = pci_user_write_config_word(pdev, offset, tmp_val);
+		break;
+	case 4:
+		ret = pci_user_write_config_dword(pdev, offset, tmp_val);
+		break;
+	}
+
+	return pcibios_err_to_errno(ret);
+}
+
+static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
+				    int count, struct perm_bits *perm,
+				    int offset, __le32 *val)
+{
+	__le32 virt = 0;
+
+	memcpy(val, vdev->vconfig + pos, count);
+
+	memcpy(&virt, perm->virt + offset, count);
+
+	/* Any non-virtualized bits? */
+	if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
+		struct pci_dev *pdev = vdev->pdev;
+		__le32 phys_val = 0;
+		int ret;
+
+		ret = vfio_user_config_read(pdev, pos, &phys_val, count);
+		if (ret)
+			return ret;
+
+		*val = (phys_val & ~virt) | (*val & virt);
+	}
+
+	return count;
+}
+
+static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos,
+				     int count, struct perm_bits *perm,
+				     int offset, __le32 val)
+{
+	__le32 virt = 0, write = 0;
+
+	memcpy(&write, perm->write + offset, count);
+
+	if (!write)
+		return count; /* drop, no writable bits */
+
+	memcpy(&virt, perm->virt + offset, count);
+
+	/* Virtualized and writable bits go to vconfig */
+	if (write & virt) {
+		__le32 virt_val = 0;
+
+		memcpy(&virt_val, vdev->vconfig + pos, count);
+
+		virt_val &= ~(write & virt);
+		virt_val |= (val & (write & virt));
+
+		memcpy(vdev->vconfig + pos, &virt_val, count);
+	}
+
+	/* Non-virtualized and writable bits go to hardware */
+	if (write & ~virt) {
+		struct pci_dev *pdev = vdev->pdev;
+		__le32 phys_val = 0;
+		int ret;
+
+		ret = vfio_user_config_read(pdev, pos, &phys_val, count);
+		if (ret)
+			return ret;
+
+		phys_val &= ~(write & ~virt);
+		phys_val |= (val & (write & ~virt));
+
+		ret = vfio_user_config_write(pdev, pos, phys_val, count);
+		if (ret)
+			return ret;
+	}
+
+	return count;
+}
+
+/* Allow direct read from hardware, except for capability next pointer */
+static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
+				   int count, struct perm_bits *perm,
+				   int offset, __le32 *val)
+{
+	int ret;
+
+	ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
+		if (offset < 4)
+			memcpy(val, vdev->vconfig + pos, count);
+	} else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
+		if (offset == PCI_CAP_LIST_ID && count > 1)
+			memcpy(val, vdev->vconfig + pos,
+			       min(PCI_CAP_FLAGS, count));
+		else if (offset == PCI_CAP_LIST_NEXT)
+			memcpy(val, vdev->vconfig + pos, 1);
+	}
+
+	return count;
+}
+
+static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
+				    int count, struct perm_bits *perm,
+				    int offset, __le32 val)
+{
+	int ret;
+
+	ret = vfio_user_config_write(vdev->pdev, pos, val, count);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/* Default all regions to read-only, no-virtualization */
+static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
+	[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
+};
+static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
+	[0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
+};
+
+static void free_perm_bits(struct perm_bits *perm)
+{
+	kfree(perm->virt);
+	kfree(perm->write);
+	perm->virt = NULL;
+	perm->write = NULL;
+}
+
+static int alloc_perm_bits(struct perm_bits *perm, int size)
+{
+	/*
+	 * Round up all permission bits to the next dword, this lets us
+	 * ignore whether a read/write exceeds the defined capability
+	 * structure.  We can do this because:
+	 * - Standard config space is already dword aligned
+	 * - Capabilities are all dword aligned (bits 0:1 of next reserved)
+	 * - Express capabilities defined as dword aligned
+	 */
+	size = round_up(size, 4);
+
+	/*
+	 * Zero state is
+	 * - All Readable, None Writeable, None Virtualized
+	 */
+	perm->virt = kzalloc(size, GFP_KERNEL);
+	perm->write = kzalloc(size, GFP_KERNEL);
+	if (!perm->virt || !perm->write) {
+		free_perm_bits(perm);
+		return -ENOMEM;
+	}
+
+	perm->readfn = vfio_default_config_read;
+	perm->writefn = vfio_default_config_write;
+
+	return 0;
+}
+
+/*
+ * Helper functions for filling in permission tables
+ */
+static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
+{
+	p->virt[off] = virt;
+	p->write[off] = write;
+}
+
+/* Handle endian-ness - pci and tables are little-endian */
+static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
+{
+	*(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
+	*(__le16 *)(&p->write[off]) = cpu_to_le16(write);
+}
+
+/* Handle endian-ness - pci and tables are little-endian */
+static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
+{
+	*(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
+	*(__le32 *)(&p->write[off]) = cpu_to_le32(write);
+}
+
+/*
+ * Restore the *real* BARs after we detect a FLR or backdoor reset.
+ * (backdoor = some device specific technique that we didn't catch)
+ */
+static void vfio_bar_restore(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u32 *rbar = vdev->rbar;
+	int i;
+
+	if (pdev->is_virtfn)
+		return;
+
+	pr_info("%s: %s reset recovery - restoring bars\n",
+		__func__, dev_name(&pdev->dev));
+
+	for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
+		pci_user_write_config_dword(pdev, i, *rbar);
+
+	pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
+}
+
+static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
+{
+	unsigned long flags = pci_resource_flags(pdev, bar);
+	u32 val;
+
+	if (flags & IORESOURCE_IO)
+		return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
+
+	val = PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+	if (flags & IORESOURCE_PREFETCH)
+		val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+	if (flags & IORESOURCE_MEM_64)
+		val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+
+	return cpu_to_le32(val);
+}
+
+/*
+ * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
+ * to reflect the hardware capabilities.  This implements BAR sizing.
+ */
+static void vfio_bar_fixup(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int i;
+	__le32 *bar;
+	u64 mask;
+
+	bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+
+	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
+		if (!pci_resource_start(pdev, i)) {
+			*bar = 0; /* Unmapped by host = unimplemented to user */
+			continue;
+		}
+
+		mask = ~(pci_resource_len(pdev, i) - 1);
+
+		*bar &= cpu_to_le32((u32)mask);
+		*bar |= vfio_generate_bar_flags(pdev, i);
+
+		if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
+			bar++;
+			*bar &= cpu_to_le32((u32)(mask >> 32));
+			i++;
+		}
+	}
+
+	bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
+
+	/*
+	 * NB. we expose the actual BAR size here, regardless of whether
+	 * we can read it.  When we report the REGION_INFO for the ROM
+	 * we report what PCI tells us is the actual ROM size.
+	 */
+	if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
+		mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
+		mask |= PCI_ROM_ADDRESS_ENABLE;
+		*bar &= cpu_to_le32((u32)mask);
+	} else
+		*bar = 0;
+
+	vdev->bardirty = false;
+}
+
+static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
+				  int count, struct perm_bits *perm,
+				  int offset, __le32 *val)
+{
+	if (is_bar(offset)) /* pos == offset for basic config */
+		vfio_bar_fixup(vdev);
+
+	count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
+
+	/* Mask in virtual memory enable for SR-IOV devices */
+	if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
+		u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
+		u32 tmp_val = le32_to_cpu(*val);
+
+		tmp_val |= cmd & PCI_COMMAND_MEMORY;
+		*val = cpu_to_le32(tmp_val);
+	}
+
+	return count;
+}
+
+static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
+				   int count, struct perm_bits *perm,
+				   int offset, __le32 val)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	__le16 *virt_cmd;
+	u16 new_cmd = 0;
+	int ret;
+
+	virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND];
+
+	if (offset == PCI_COMMAND) {
+		bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io;
+		u16 phys_cmd;
+
+		ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd);
+		if (ret)
+			return ret;
+
+		new_cmd = le32_to_cpu(val);
+
+		phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY);
+		virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
+		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
+
+		phys_io = !!(phys_cmd & PCI_COMMAND_IO);
+		virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO);
+		new_io = !!(new_cmd & PCI_COMMAND_IO);
+
+		/*
+		 * If the user is writing mem/io enable (new_mem/io) and we
+		 * think it's already enabled (virt_mem/io), but the hardware
+		 * shows it disabled (phys_mem/io), then the device has
+		 * undergone some kind of backdoor reset and needs to be
+		 * restored before we allow it to enable the bars.
+		 * SR-IOV devices will trigger this, but we catch them later
+		 */
+		if ((new_mem && virt_mem && !phys_mem) ||
+		    (new_io && virt_io && !phys_io))
+			vfio_bar_restore(vdev);
+	}
+
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0)
+		return count;
+
+	/*
+	 * Save current memory/io enable bits in vconfig to allow for
+	 * the test above next time.
+	 */
+	if (offset == PCI_COMMAND) {
+		u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO;
+
+		*virt_cmd &= cpu_to_le16(~mask);
+		*virt_cmd |= cpu_to_le16(new_cmd & mask);
+	}
+
+	/* Emulate INTx disable */
+	if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) {
+		bool virt_intx_disable;
+
+		virt_intx_disable = !!(le16_to_cpu(*virt_cmd) &
+				       PCI_COMMAND_INTX_DISABLE);
+
+		if (virt_intx_disable && !vdev->virq_disabled) {
+			vdev->virq_disabled = true;
+			vfio_pci_intx_mask(vdev);
+		} else if (!virt_intx_disable && vdev->virq_disabled) {
+			vdev->virq_disabled = false;
+			vfio_pci_intx_unmask(vdev);
+		}
+	}
+
+	if (is_bar(offset))
+		vdev->bardirty = true;
+
+	return count;
+}
+
+/* Permissions for the Basic PCI Header */
+static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF))
+		return -ENOMEM;
+
+	perm->readfn = vfio_basic_config_read;
+	perm->writefn = vfio_basic_config_write;
+
+	/* Virtualized for SR-IOV functions, which just have FFFF */
+	p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE);
+	p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * Virtualize INTx disable, we use it internally for interrupt
+	 * control and can emulate it for non-PCI 2.3 devices.
+	 */
+	p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE);
+
+	/* Virtualize capability list, we might want to skip/disable */
+	p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE);
+
+	/* No harm to write */
+	p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE);
+	p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE);
+	p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE);
+
+	/* Virtualize all bars, can't touch the real ones */
+	p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE);
+
+	/* Allow us to adjust capability chain */
+	p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE);
+
+	/* Sometimes used by sw, just virtualize */
+	p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE);
+	return 0;
+}
+
+/* Permissions for the Power Management capability */
+static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
+		return -ENOMEM;
+
+	/*
+	 * We always virtualize the next field so we can remove
+	 * capabilities from the chain if we want to.
+	 */
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * Power management is defined *per function*,
+	 * so we let the user write this
+	 */
+	p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE);
+	return 0;
+}
+
+/* Permissions for PCI-X capability */
+static int __init init_pci_cap_pcix_perm(struct perm_bits *perm)
+{
+	/* Alloc 24, but only 8 are used in v0 */
+	if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2))
+		return -ENOMEM;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE);
+	p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE);
+	return 0;
+}
+
+/* Permissions for PCI Express capability */
+static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
+{
+	/* Alloc larger of two possible sizes */
+	if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
+		return -ENOMEM;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * Allow writes to device control fields (includes FLR!)
+	 * but not to devctl_phantom which could confuse IOMMU
+	 * or to the ARI bit in devctl2 which is set at probe time
+	 */
+	p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM);
+	p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
+	return 0;
+}
+
+/* Permissions for Advanced Function capability */
+static int __init init_pci_cap_af_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
+		return -ENOMEM;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+	p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR);
+	return 0;
+}
+
+/* Permissions for Advanced Error Reporting extended capability */
+static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm)
+{
+	u32 mask;
+
+	if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR]))
+		return -ENOMEM;
+
+	/*
+	 * Virtualize the first dword of all express capabilities
+	 * because it includes the next pointer.  This lets us later
+	 * remove capabilities from the chain if we need to.
+	 */
+	p_setd(perm, 0, ALL_VIRT, NO_WRITE);
+
+	/* Writable bits mask */
+	mask =	PCI_ERR_UNC_TRAIN |		/* Training */
+		PCI_ERR_UNC_DLP |		/* Data Link Protocol */
+		PCI_ERR_UNC_SURPDN |		/* Surprise Down */
+		PCI_ERR_UNC_POISON_TLP |	/* Poisoned TLP */
+		PCI_ERR_UNC_FCP |		/* Flow Control Protocol */
+		PCI_ERR_UNC_COMP_TIME |		/* Completion Timeout */
+		PCI_ERR_UNC_COMP_ABORT |	/* Completer Abort */
+		PCI_ERR_UNC_UNX_COMP |		/* Unexpected Completion */
+		PCI_ERR_UNC_RX_OVER |		/* Receiver Overflow */
+		PCI_ERR_UNC_MALF_TLP |		/* Malformed TLP */
+		PCI_ERR_UNC_ECRC |		/* ECRC Error Status */
+		PCI_ERR_UNC_UNSUP |		/* Unsupported Request */
+		PCI_ERR_UNC_ACSV |		/* ACS Violation */
+		PCI_ERR_UNC_INTN |		/* internal error */
+		PCI_ERR_UNC_MCBTLP |		/* MC blocked TLP */
+		PCI_ERR_UNC_ATOMEG |		/* Atomic egress blocked */
+		PCI_ERR_UNC_TLPPRE;		/* TLP prefix blocked */
+	p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask);
+	p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask);
+	p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask);
+
+	mask =	PCI_ERR_COR_RCVR |		/* Receiver Error Status */
+		PCI_ERR_COR_BAD_TLP |		/* Bad TLP Status */
+		PCI_ERR_COR_BAD_DLLP |		/* Bad DLLP Status */
+		PCI_ERR_COR_REP_ROLL |		/* REPLAY_NUM Rollover */
+		PCI_ERR_COR_REP_TIMER |		/* Replay Timer Timeout */
+		PCI_ERR_COR_ADV_NFAT |		/* Advisory Non-Fatal */
+		PCI_ERR_COR_INTERNAL |		/* Corrected Internal */
+		PCI_ERR_COR_LOG_OVER;		/* Header Log Overflow */
+	p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask);
+	p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask);
+
+	mask =	PCI_ERR_CAP_ECRC_GENE |		/* ECRC Generation Enable */
+		PCI_ERR_CAP_ECRC_CHKE;		/* ECRC Check Enable */
+	p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask);
+	return 0;
+}
+
+/* Permissions for Power Budgeting extended capability */
+static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR]))
+		return -ENOMEM;
+
+	p_setd(perm, 0, ALL_VIRT, NO_WRITE);
+
+	/* Writing the data selector is OK, the info is still read-only */
+	p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE);
+	return 0;
+}
+
+/*
+ * Initialize the shared permission tables
+ */
+void vfio_pci_uninit_perm_bits(void)
+{
+	free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]);
+
+	free_perm_bits(&cap_perms[PCI_CAP_ID_PM]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_AF]);
+
+	free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
+	free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
+}
+
+int __init vfio_pci_init_perm_bits(void)
+{
+	int ret;
+
+	/* Basic config space */
+	ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]);
+
+	/* Capabilities */
+	ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
+	cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
+	ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
+	cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+	ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
+	ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
+
+	/* Extended capabilities */
+	ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
+	ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
+	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+
+	if (ret)
+		vfio_pci_uninit_perm_bits();
+
+	return ret;
+}
+
+static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
+{
+	u8 cap;
+	int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
+						 PCI_STD_HEADER_SIZEOF;
+	base /= 4;
+	pos /= 4;
+
+	cap = vdev->pci_config_map[pos];
+
+	if (cap == PCI_CAP_ID_BASIC)
+		return 0;
+
+	/* XXX Can we have two abutting capabilities of the same type? */
+	while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
+		pos--;
+
+	return pos * 4;
+}
+
+static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 *val)
+{
+	/* Update max available queue size from msi_qmax */
+	if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
+		__le16 *flags;
+		int start;
+
+		start = vfio_find_cap_start(vdev, pos);
+
+		flags = (__le16 *)&vdev->vconfig[start];
+
+		*flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK);
+		*flags |= cpu_to_le16(vdev->msi_qmax << 1);
+	}
+
+	return vfio_default_config_read(vdev, pos, count, perm, offset, val);
+}
+
+static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
+{
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0)
+		return count;
+
+	/* Fixup and write configured queue size and enable to hardware */
+	if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
+		__le16 *pflags;
+		u16 flags;
+		int start, ret;
+
+		start = vfio_find_cap_start(vdev, pos);
+
+		pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
+
+		flags = le16_to_cpu(*pflags);
+
+		/* MSI is enabled via ioctl */
+		if (!is_msi(vdev))
+			flags &= ~PCI_MSI_FLAGS_ENABLE;
+
+		/* Check queue size */
+		if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) {
+			flags &= ~PCI_MSI_FLAGS_QSIZE;
+			flags |= vdev->msi_qmax << 4;
+		}
+
+		/* Write back to virt and to hardware */
+		*pflags = cpu_to_le16(flags);
+		ret = pci_user_write_config_word(vdev->pdev,
+						 start + PCI_MSI_FLAGS,
+						 flags);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+	}
+
+	return count;
+}
+
+/*
+ * MSI determination is per-device, so this routine gets used beyond
+ * initialization time.  Don't add __init
+ */
+static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags)
+{
+	if (alloc_perm_bits(perm, len))
+		return -ENOMEM;
+
+	perm->readfn = vfio_msi_config_read;
+	perm->writefn = vfio_msi_config_write;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * The upper byte of the control register is reserved,
+	 * just setup the lower byte.
+	 */
+	p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE);
+	p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE);
+	if (flags & PCI_MSI_FLAGS_64BIT) {
+		p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE);
+		p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE);
+		if (flags & PCI_MSI_FLAGS_MASKBIT) {
+			p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE);
+			p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE);
+		}
+	} else {
+		p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE);
+		if (flags & PCI_MSI_FLAGS_MASKBIT) {
+			p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE);
+			p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE);
+		}
+	}
+	return 0;
+}
+
+/* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */
+static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int len, ret;
+	u16 flags;
+
+	ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	len = 10; /* Minimum size */
+	if (flags & PCI_MSI_FLAGS_64BIT)
+		len += 4;
+	if (flags & PCI_MSI_FLAGS_MASKBIT)
+		len += 10;
+
+	if (vdev->msi_perm)
+		return len;
+
+	vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL);
+	if (!vdev->msi_perm)
+		return -ENOMEM;
+
+	ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
+	if (ret)
+		return ret;
+
+	return len;
+}
912 | |||
913 | /* Determine extended capability length for VC (2 & 9) and MFVC */ | ||
914 | static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) | ||
915 | { | ||
916 | struct pci_dev *pdev = vdev->pdev; | ||
917 | u32 tmp; | ||
918 | int ret, evcc, phases, vc_arb; | ||
919 | int len = PCI_CAP_VC_BASE_SIZEOF; | ||
920 | |||
921 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp); | ||
922 | if (ret) | ||
923 | return pcibios_err_to_errno(ret); | ||
924 | |||
925 | evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */ | ||
926 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp); | ||
927 | if (ret) | ||
928 | return pcibios_err_to_errno(ret); | ||
929 | |||
930 | if (tmp & PCI_VC_REG2_128_PHASE) | ||
931 | phases = 128; | ||
932 | else if (tmp & PCI_VC_REG2_64_PHASE) | ||
933 | phases = 64; | ||
934 | else if (tmp & PCI_VC_REG2_32_PHASE) | ||
935 | phases = 32; | ||
936 | else | ||
937 | phases = 0; | ||
938 | |||
939 | vc_arb = phases * 4; | ||
940 | |||
941 | /* | ||
942 | * Port arbitration tables are root & switch only; | ||
943 | * function arbitration tables are function 0 only. | ||
944 | * In either case, we'll never let user write them so | ||
945 | * we don't care how big they are | ||
946 | */ | ||
947 | len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF; | ||
948 | if (vc_arb) { | ||
949 | len = round_up(len, 16); | ||
950 | len += vc_arb / 8; | ||
951 | } | ||
952 | return len; | ||
953 | } | ||
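
For reference, the same arithmetic as a standalone sketch. The 16-byte base and 12 bytes per VC mirror PCI_CAP_VC_BASE_SIZEOF and PCI_CAP_VC_PER_VC_SIZEOF, under the assumption stated in the comment above that the arbitration tables themselves are never user-writable:

#include <stdio.h>

static int vc_cap_len(int evcc, int phases)
{
	int len = 16;				/* PCI_CAP_VC_BASE_SIZEOF */
	int vc_arb = phases * 4;		/* table size in bits, 4 per phase */

	len += (1 + evcc) * 12;			/* VC0 plus extended VCs */
	if (vc_arb) {
		len = (len + 15) & ~15;		/* round_up(len, 16) */
		len += vc_arb / 8;		/* bits to bytes */
	}
	return len;
}

int main(void)
{
	printf("evcc=1, 32 phases: %d bytes\n", vc_cap_len(1, 32));	/* 64 */
	return 0;
}
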
954 | |||
955 | static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) | ||
956 | { | ||
957 | struct pci_dev *pdev = vdev->pdev; | ||
958 | u16 word; | ||
959 | u8 byte; | ||
960 | int ret; | ||
961 | |||
962 | switch (cap) { | ||
963 | case PCI_CAP_ID_MSI: | ||
964 | return vfio_msi_cap_len(vdev, pos); | ||
965 | case PCI_CAP_ID_PCIX: | ||
966 | ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word); | ||
967 | if (ret) | ||
968 | return pcibios_err_to_errno(ret); | ||
969 | |||
970 | if (PCI_X_CMD_VERSION(word)) { | ||
971 | vdev->extended_caps = true; | ||
972 | return PCI_CAP_PCIX_SIZEOF_V2; | ||
973 | } else | ||
974 | return PCI_CAP_PCIX_SIZEOF_V0; | ||
975 | case PCI_CAP_ID_VNDR: | ||
976 | /* the length byte follows the next-pointer field */ | ||
977 | ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte); | ||
978 | if (ret) | ||
979 | return pcibios_err_to_errno(ret); | ||
980 | |||
981 | return byte; | ||
982 | case PCI_CAP_ID_EXP: | ||
983 | /* length based on version */ | ||
984 | ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word); | ||
985 | if (ret) | ||
986 | return pcibios_err_to_errno(ret); | ||
987 | |||
988 | if ((word & PCI_EXP_FLAGS_VERS) == 1) | ||
989 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; | ||
990 | else { | ||
991 | vdev->extended_caps = true; | ||
992 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; | ||
993 | } | ||
994 | case PCI_CAP_ID_HT: | ||
995 | ret = pci_read_config_byte(pdev, pos + 3, &byte); | ||
996 | if (ret) | ||
997 | return pcibios_err_to_errno(ret); | ||
998 | |||
999 | return (byte & HT_3BIT_CAP_MASK) ? | ||
1000 | HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG; | ||
1001 | case PCI_CAP_ID_SATA: | ||
1002 | ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte); | ||
1003 | if (ret) | ||
1004 | return pcibios_err_to_errno(ret); | ||
1005 | |||
1006 | byte &= PCI_SATA_REGS_MASK; | ||
1007 | if (byte == PCI_SATA_REGS_INLINE) | ||
1008 | return PCI_SATA_SIZEOF_LONG; | ||
1009 | else | ||
1010 | return PCI_SATA_SIZEOF_SHORT; | ||
1011 | default: | ||
1012 | pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n", | ||
1013 | dev_name(&pdev->dev), __func__, cap, pos); | ||
1014 | } | ||
1015 | |||
1016 | return 0; | ||
1017 | } | ||
1018 | |||
1019 | static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) | ||
1020 | { | ||
1021 | struct pci_dev *pdev = vdev->pdev; | ||
1022 | u8 byte; | ||
1023 | u32 dword; | ||
1024 | int ret; | ||
1025 | |||
1026 | switch (ecap) { | ||
1027 | case PCI_EXT_CAP_ID_VNDR: | ||
1028 | ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); | ||
1029 | if (ret) | ||
1030 | return pcibios_err_to_errno(ret); | ||
1031 | |||
1032 | return dword >> PCI_VSEC_HDR_LEN_SHIFT; | ||
1033 | case PCI_EXT_CAP_ID_VC: | ||
1034 | case PCI_EXT_CAP_ID_VC9: | ||
1035 | case PCI_EXT_CAP_ID_MFVC: | ||
1036 | return vfio_vc_cap_len(vdev, epos); | ||
1037 | case PCI_EXT_CAP_ID_ACS: | ||
1038 | ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte); | ||
1039 | if (ret) | ||
1040 | return pcibios_err_to_errno(ret); | ||
1041 | |||
1042 | if (byte & PCI_ACS_EC) { | ||
1043 | int bits; | ||
1044 | |||
1045 | ret = pci_read_config_byte(pdev, | ||
1046 | epos + PCI_ACS_EGRESS_BITS, | ||
1047 | &byte); | ||
1048 | if (ret) | ||
1049 | return pcibios_err_to_errno(ret); | ||
1050 | |||
1051 | bits = byte ? round_up(byte, 32) : 256; | ||
1052 | return 8 + (bits / 8); | ||
1053 | } | ||
1054 | return 8; | ||
1055 | |||
1056 | case PCI_EXT_CAP_ID_REBAR: | ||
1057 | ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte); | ||
1058 | if (ret) | ||
1059 | return pcibios_err_to_errno(ret); | ||
1060 | |||
1061 | byte &= PCI_REBAR_CTRL_NBAR_MASK; | ||
1062 | byte >>= PCI_REBAR_CTRL_NBAR_SHIFT; | ||
1063 | |||
1064 | return 4 + (byte * 8); | ||
1065 | case PCI_EXT_CAP_ID_DPA: | ||
1066 | ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte); | ||
1067 | if (ret) | ||
1068 | return pcibios_err_to_errno(ret); | ||
1069 | |||
1070 | byte &= PCI_DPA_CAP_SUBSTATE_MASK; | ||
1071 | byte = round_up(byte + 1, 4); | ||
1072 | return PCI_DPA_BASE_SIZEOF + byte; | ||
1073 | case PCI_EXT_CAP_ID_TPH: | ||
1074 | ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword); | ||
1075 | if (ret) | ||
1076 | return pcibios_err_to_errno(ret); | ||
1077 | |||
1078 | if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) { | ||
1079 | int sts; | ||
1080 | |||
1081 | sts = dword & PCI_TPH_CAP_ST_MASK; /* was 'byte', which is never set on this path */ | ||
1082 | sts >>= PCI_TPH_CAP_ST_SHIFT; | ||
1083 | return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4); | ||
1084 | } | ||
1085 | return PCI_TPH_BASE_SIZEOF; | ||
1086 | default: | ||
1087 | pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n", | ||
1088 | dev_name(&pdev->dev), __func__, ecap, epos); | ||
1089 | } | ||
1090 | |||
1091 | return 0; | ||
1092 | } | ||
1093 | |||
1094 | static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, | ||
1095 | int offset, int size) | ||
1096 | { | ||
1097 | struct pci_dev *pdev = vdev->pdev; | ||
1098 | int ret = 0; | ||
1099 | |||
1100 | /* | ||
1101 | * We try to read physical config space in the largest chunks | ||
1102 | * we can, assuming that all of the fields support dword access. | ||
1103 | * pci_save_state() makes this same assumption and seems to do ok. | ||
1104 | */ | ||
1105 | while (size) { | ||
1106 | int filled; | ||
1107 | |||
1108 | if (size >= 4 && !(offset % 4)) { | ||
1109 | __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; | ||
1110 | u32 dword; | ||
1111 | |||
1112 | ret = pci_read_config_dword(pdev, offset, &dword); | ||
1113 | if (ret) | ||
1114 | return ret; | ||
1115 | *dwordp = cpu_to_le32(dword); | ||
1116 | filled = 4; | ||
1117 | } else if (size >= 2 && !(offset % 2)) { | ||
1118 | __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; | ||
1119 | u16 word; | ||
1120 | |||
1121 | ret = pci_read_config_word(pdev, offset, &word); | ||
1122 | if (ret) | ||
1123 | return ret; | ||
1124 | *wordp = cpu_to_le16(word); | ||
1125 | filled = 2; | ||
1126 | } else { | ||
1127 | u8 *byte = &vdev->vconfig[offset]; | ||
1128 | ret = pci_read_config_byte(pdev, offset, byte); | ||
1129 | if (ret) | ||
1130 | return ret; | ||
1131 | filled = 1; | ||
1132 | } | ||
1133 | |||
1134 | offset += filled; | ||
1135 | size -= filled; | ||
1136 | } | ||
1137 | |||
1138 | return ret; | ||
1139 | } | ||
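
The access-size selection is worth seeing in isolation. A minimal sketch of the same policy with hypothetical offsets and no PCI involved — only the size/alignment decisions are kept:

#include <stdio.h>

int main(void)
{
	int offset = 1, size = 9;	/* deliberately misaligned start */

	while (size) {
		int filled;

		if (size >= 4 && !(offset % 4))
			filled = 4;
		else if (size >= 2 && !(offset % 2))
			filled = 2;
		else
			filled = 1;

		printf("access %d byte(s) at offset %d\n", filled, offset);
		offset += filled;
		size -= filled;
	}
	return 0;			/* prints sizes 1, 2, 4, 2 */
}
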
1140 | |||
1141 | static int vfio_cap_init(struct vfio_pci_device *vdev) | ||
1142 | { | ||
1143 | struct pci_dev *pdev = vdev->pdev; | ||
1144 | u8 *map = vdev->pci_config_map; | ||
1145 | u16 status; | ||
1146 | u8 pos, *prev, cap; | ||
1147 | int loops, ret, caps = 0; | ||
1148 | |||
1149 | /* Any capabilities? */ | ||
1150 | ret = pci_read_config_word(pdev, PCI_STATUS, &status); | ||
1151 | if (ret) | ||
1152 | return ret; | ||
1153 | |||
1154 | if (!(status & PCI_STATUS_CAP_LIST)) | ||
1155 | return 0; /* Done */ | ||
1156 | |||
1157 | ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); | ||
1158 | if (ret) | ||
1159 | return ret; | ||
1160 | |||
1161 | /* Mark the previous position in case we want to skip a capability */ | ||
1162 | prev = &vdev->vconfig[PCI_CAPABILITY_LIST]; | ||
1163 | |||
1164 | /* We can bound our loop; capabilities are dword aligned */ | ||
1165 | loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF; | ||
1166 | while (pos && loops--) { | ||
1167 | u8 next; | ||
1168 | int i, len = 0; | ||
1169 | |||
1170 | ret = pci_read_config_byte(pdev, pos, &cap); | ||
1171 | if (ret) | ||
1172 | return ret; | ||
1173 | |||
1174 | ret = pci_read_config_byte(pdev, | ||
1175 | pos + PCI_CAP_LIST_NEXT, &next); | ||
1176 | if (ret) | ||
1177 | return ret; | ||
1178 | |||
1179 | if (cap <= PCI_CAP_ID_MAX) { | ||
1180 | len = pci_cap_length[cap]; | ||
1181 | if (len == 0xFF) { /* Variable length */ | ||
1182 | len = vfio_cap_len(vdev, cap, pos); | ||
1183 | if (len < 0) | ||
1184 | return len; | ||
1185 | } | ||
1186 | } | ||
1187 | |||
1188 | if (!len) { | ||
1189 | pr_info("%s: %s hiding cap 0x%x\n", | ||
1190 | __func__, dev_name(&pdev->dev), cap); | ||
1191 | *prev = next; | ||
1192 | pos = next; | ||
1193 | continue; | ||
1194 | } | ||
1195 | |||
1196 | /* Sanity check, do we overlap other capabilities? */ | ||
1197 | for (i = 0; i < len; i += 4) { | ||
1198 | if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) | ||
1199 | continue; | ||
1200 | |||
1201 | pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", | ||
1202 | __func__, dev_name(&pdev->dev), | ||
1203 | pos + i, map[(pos + i) / 4], cap); | ||
1204 | } | ||
1205 | |||
1206 | memset(map + (pos / 4), cap, len / 4); | ||
1207 | ret = vfio_fill_vconfig_bytes(vdev, pos, len); | ||
1208 | if (ret) | ||
1209 | return ret; | ||
1210 | |||
1211 | prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT]; | ||
1212 | pos = next; | ||
1213 | caps++; | ||
1214 | } | ||
1215 | |||
1216 | /* If we didn't fill any capabilities, clear the status flag */ | ||
1217 | if (!caps) { | ||
1218 | __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS]; | ||
1219 | *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST); | ||
1220 | } | ||
1221 | |||
1222 | return 0; | ||
1223 | } | ||
1224 | |||
1225 | static int vfio_ecap_init(struct vfio_pci_device *vdev) | ||
1226 | { | ||
1227 | struct pci_dev *pdev = vdev->pdev; | ||
1228 | u8 *map = vdev->pci_config_map; | ||
1229 | u16 epos; | ||
1230 | __le32 *prev = NULL; | ||
1231 | int loops, ret, ecaps = 0; | ||
1232 | |||
1233 | if (!vdev->extended_caps) | ||
1234 | return 0; | ||
1235 | |||
1236 | epos = PCI_CFG_SPACE_SIZE; | ||
1237 | |||
1238 | loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; | ||
1239 | |||
1240 | while (loops-- && epos >= PCI_CFG_SPACE_SIZE) { | ||
1241 | u32 header; | ||
1242 | u16 ecap; | ||
1243 | int i, len = 0; | ||
1244 | bool hidden = false; | ||
1245 | |||
1246 | ret = pci_read_config_dword(pdev, epos, &header); | ||
1247 | if (ret) | ||
1248 | return ret; | ||
1249 | |||
1250 | ecap = PCI_EXT_CAP_ID(header); | ||
1251 | |||
1252 | if (ecap <= PCI_EXT_CAP_ID_MAX) { | ||
1253 | len = pci_ext_cap_length[ecap]; | ||
1254 | if (len == 0xFF) { | ||
1255 | len = vfio_ext_cap_len(vdev, ecap, epos); | ||
1256 | if (len < 0) | ||
1257 | return len; /* 'ret' is 0 here; return the real error */ | ||
1258 | } | ||
1259 | } | ||
1260 | |||
1261 | if (!len) { | ||
1262 | pr_info("%s: %s hiding ecap 0x%x@0x%x\n", | ||
1263 | __func__, dev_name(&pdev->dev), ecap, epos); | ||
1264 | |||
1265 | /* If not the first in the chain, we can skip over it */ | ||
1266 | if (prev) { | ||
1267 | u32 val = epos = PCI_EXT_CAP_NEXT(header); | ||
1268 | *prev &= cpu_to_le32(~(0xffcU << 20)); | ||
1269 | *prev |= cpu_to_le32(val << 20); | ||
1270 | continue; | ||
1271 | } | ||
1272 | |||
1273 | /* | ||
1274 | * Otherwise, fill in a placeholder; the direct | ||
1275 | * readfn will virtualize this automatically. | ||
1276 | */ | ||
1277 | len = PCI_CAP_SIZEOF; | ||
1278 | hidden = true; | ||
1279 | } | ||
1280 | |||
1281 | for (i = 0; i < len; i += 4) { | ||
1282 | if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) | ||
1283 | continue; | ||
1284 | |||
1285 | pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", | ||
1286 | __func__, dev_name(&pdev->dev), | ||
1287 | epos + i, map[(epos + i) / 4], ecap); | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * Even though ecap IDs are 2 bytes, we're currently a long way | ||
1292 | * from exceeding 1-byte capability IDs. If we ever make it | ||
1293 | * up to 0xFF we'll need to widen this map to two bytes per entry. | ||
1294 | */ | ||
1295 | BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); | ||
1296 | |||
1297 | memset(map + (epos / 4), ecap, len / 4); | ||
1298 | ret = vfio_fill_vconfig_bytes(vdev, epos, len); | ||
1299 | if (ret) | ||
1300 | return ret; | ||
1301 | |||
1302 | /* | ||
1303 | * If we're just using this capability to anchor the list, | ||
1304 | * hide the real ID. Only count real ecaps. XXX PCI spec | ||
1305 | * indicates to use cap id = 0, version = 0, next = 0 if | ||
1306 | * ecaps are absent, hope users check all the way to next. | ||
1307 | */ | ||
1308 | if (hidden) | ||
1309 | *(__le32 *)&vdev->vconfig[epos] &= | ||
1310 | cpu_to_le32((0xffcU << 20)); | ||
1311 | else | ||
1312 | ecaps++; | ||
1313 | |||
1314 | prev = (__le32 *)&vdev->vconfig[epos]; | ||
1315 | epos = PCI_EXT_CAP_NEXT(header); | ||
1316 | } | ||
1317 | |||
1318 | if (!ecaps) | ||
1319 | *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; | ||
1320 | |||
1321 | return 0; | ||
1322 | } | ||
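
The next-pointer splice in the hiding path above relies on the extended capability header layout: bits 15:0 hold the ID, 19:16 the version, and 31:20 the next offset, which is always dword aligned, so masking with 0xffc loses nothing. A standalone sketch of the same bit surgery (the IDs and offsets are made up):

#include <stdio.h>
#include <stdint.h>

#define ECAP_ID(h)	((h) & 0xffff)
#define ECAP_NEXT(h)	(((h) >> 20) & 0xffc)	/* as PCI_EXT_CAP_NEXT() */

int main(void)
{
	/* header: ID 0x1, version 1, next capability at 0x140 */
	uint32_t prev = (0x140u << 20) | (1u << 16) | 0x0001;
	uint32_t skip_to = 0x180;	/* splice the chain past 0x140 */

	prev &= ~(0xffcu << 20);	/* clear old next; bits 21:20 are always 0 */
	prev |= skip_to << 20;

	printf("id 0x%x next 0x%x\n", ECAP_ID(prev), ECAP_NEXT(prev));
	return 0;			/* id 0x1 next 0x180 */
}
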
1323 | |||
1324 | /* | ||
1325 | * For each device we allocate a pci_config_map that indicates the | ||
1326 | * capability occupying each dword and thus the struct perm_bits we | ||
1327 | * use for read and write. We also allocate a virtualized config | ||
1328 | * space which tracks reads and writes to bits that we emulate for | ||
1329 | * the user. Initial values filled from device. | ||
1330 | * | ||
1331 | * Using shared struct perm_bits between all vfio-pci devices saves | ||
1332 | * us from allocating cfg_size buffers for virt and write for every | ||
1333 | * device. We could remove vconfig and allocate individual buffers | ||
1334 | * for each area requiring emulated bits, but the array of pointers | ||
1335 | * would be comparable in size (at least for standard config space). | ||
1336 | */ | ||
1337 | int vfio_config_init(struct vfio_pci_device *vdev) | ||
1338 | { | ||
1339 | struct pci_dev *pdev = vdev->pdev; | ||
1340 | u8 *map, *vconfig; | ||
1341 | int ret; | ||
1342 | |||
1343 | /* | ||
1344 | * Config space, caps and ecaps are all dword aligned, so we can | ||
1345 | * use one byte per dword to record the type. | ||
1346 | */ | ||
1347 | map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); | ||
1348 | if (!map) | ||
1349 | return -ENOMEM; | ||
1350 | |||
1351 | vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL); | ||
1352 | if (!vconfig) { | ||
1353 | kfree(map); | ||
1354 | return -ENOMEM; | ||
1355 | } | ||
1356 | |||
1357 | vdev->pci_config_map = map; | ||
1358 | vdev->vconfig = vconfig; | ||
1359 | |||
1360 | memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); | ||
1361 | memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, | ||
1362 | (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); | ||
1363 | |||
1364 | ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); | ||
1365 | if (ret) | ||
1366 | goto out; | ||
1367 | |||
1368 | vdev->bardirty = true; | ||
1369 | |||
1370 | /* | ||
1371 | * XXX can we just pci_load_saved_state/pci_restore_state? | ||
1372 | * may need to rebuild vconfig after that | ||
1373 | */ | ||
1374 | |||
1375 | /* For restore after reset */ | ||
1376 | vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]); | ||
1377 | vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]); | ||
1378 | vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]); | ||
1379 | vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]); | ||
1380 | vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]); | ||
1381 | vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]); | ||
1382 | vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]); | ||
1383 | |||
1384 | if (pdev->is_virtfn) { | ||
1385 | *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor); | ||
1386 | *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device); | ||
1387 | } | ||
1388 | |||
1389 | ret = vfio_cap_init(vdev); | ||
1390 | if (ret) | ||
1391 | goto out; | ||
1392 | |||
1393 | ret = vfio_ecap_init(vdev); | ||
1394 | if (ret) | ||
1395 | goto out; | ||
1396 | |||
1397 | return 0; | ||
1398 | |||
1399 | out: | ||
1400 | kfree(map); | ||
1401 | vdev->pci_config_map = NULL; | ||
1402 | kfree(vconfig); | ||
1403 | vdev->vconfig = NULL; | ||
1404 | return pcibios_err_to_errno(ret); | ||
1405 | } | ||
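
As a concrete illustration of the byte-per-dword map this function builds — the capability placement below is made up, and the constants stand in for PCI_CAP_ID_BASIC, PCI_CAP_ID_MSI and PCI_CAP_ID_INVALID:

#include <stdio.h>
#include <string.h>

#define CAP_BASIC   0x00	/* stands in for PCI_CAP_ID_BASIC */
#define CAP_MSI     0x05	/* PCI_CAP_ID_MSI */
#define CAP_INVALID 0xff	/* PCI_CAP_ID_INVALID */

int main(void)
{
	unsigned char map[256 / 4];	/* one byte per config dword */

	memset(map, CAP_INVALID, sizeof(map));
	memset(map, CAP_BASIC, 64 / 4);			/* standard header */
	memset(map + 0x50 / 4, CAP_MSI, 24 / 4);	/* 24-byte MSI cap at 0x50 */

	/* any access in 0x50-0x67 resolves to the MSI perm_bits */
	printf("cap at 0x54: 0x%02x\n", map[0x54 / 4]);	/* 0x05 */
	return 0;
}
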
1406 | |||
1407 | void vfio_config_free(struct vfio_pci_device *vdev) | ||
1408 | { | ||
1409 | kfree(vdev->vconfig); | ||
1410 | vdev->vconfig = NULL; | ||
1411 | kfree(vdev->pci_config_map); | ||
1412 | vdev->pci_config_map = NULL; | ||
1413 | kfree(vdev->msi_perm); | ||
1414 | vdev->msi_perm = NULL; | ||
1415 | } | ||
1416 | |||
1417 | static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, | ||
1418 | size_t count, loff_t *ppos, bool iswrite) | ||
1419 | { | ||
1420 | struct pci_dev *pdev = vdev->pdev; | ||
1421 | struct perm_bits *perm; | ||
1422 | __le32 val = 0; | ||
1423 | int cap_start = 0, offset; | ||
1424 | u8 cap_id; | ||
1425 | ssize_t ret = count; | ||
1426 | |||
1427 | if (*ppos < 0 || *ppos + count > pdev->cfg_size) | ||
1428 | return -EFAULT; | ||
1429 | |||
1430 | /* | ||
1431 | * gcc can't seem to figure out that this is a static function only | ||
1432 | * called with count of 1/2/4, and trips copy_from_user_overflow without this. | ||
1433 | */ | ||
1434 | if (count > sizeof(val)) | ||
1435 | return -EINVAL; | ||
1436 | |||
1437 | cap_id = vdev->pci_config_map[*ppos / 4]; | ||
1438 | |||
1439 | if (cap_id == PCI_CAP_ID_INVALID) { | ||
1440 | if (iswrite) | ||
1441 | return ret; /* drop */ | ||
1442 | |||
1443 | /* | ||
1444 | * Per PCI spec 3.0, section 6.1, reads from reserved and | ||
1445 | * unimplemented registers return 0 | ||
1446 | */ | ||
1447 | if (copy_to_user(buf, &val, count)) | ||
1448 | return -EFAULT; | ||
1449 | |||
1450 | return ret; | ||
1451 | } | ||
1452 | |||
1453 | /* | ||
1454 | * All capabilities are minimum 4 bytes and aligned on dword | ||
1455 | * boundaries. Since we don't support unaligned accesses, we're | ||
1456 | * only ever accessing a single capability. | ||
1457 | */ | ||
1458 | if (*ppos >= PCI_CFG_SPACE_SIZE) { | ||
1459 | WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); | ||
1460 | |||
1461 | perm = &ecap_perms[cap_id]; | ||
1462 | cap_start = vfio_find_cap_start(vdev, *ppos); | ||
1463 | |||
1464 | } else { | ||
1465 | WARN_ON(cap_id > PCI_CAP_ID_MAX); | ||
1466 | |||
1467 | perm = &cap_perms[cap_id]; | ||
1468 | |||
1469 | if (cap_id == PCI_CAP_ID_MSI) | ||
1470 | perm = vdev->msi_perm; | ||
1471 | |||
1472 | if (cap_id > PCI_CAP_ID_BASIC) | ||
1473 | cap_start = vfio_find_cap_start(vdev, *ppos); | ||
1474 | } | ||
1475 | |||
1476 | WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); | ||
1477 | WARN_ON(cap_start > *ppos); | ||
1478 | |||
1479 | offset = *ppos - cap_start; | ||
1480 | |||
1481 | if (iswrite) { | ||
1482 | if (!perm->writefn) | ||
1483 | return ret; | ||
1484 | |||
1485 | if (copy_from_user(&val, buf, count)) | ||
1486 | return -EFAULT; | ||
1487 | |||
1488 | ret = perm->writefn(vdev, *ppos, count, perm, offset, val); | ||
1489 | } else { | ||
1490 | if (perm->readfn) { | ||
1491 | ret = perm->readfn(vdev, *ppos, count, | ||
1492 | perm, offset, &val); | ||
1493 | if (ret < 0) | ||
1494 | return ret; | ||
1495 | } | ||
1496 | |||
1497 | if (copy_to_user(buf, &val, count)) | ||
1498 | return -EFAULT; | ||
1499 | } | ||
1500 | |||
1501 | return ret; | ||
1502 | } | ||
1503 | |||
1504 | ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, | ||
1505 | char __user *buf, size_t count, | ||
1506 | loff_t *ppos, bool iswrite) | ||
1507 | { | ||
1508 | size_t done = 0; | ||
1509 | int ret = 0; | ||
1510 | loff_t pos = *ppos; | ||
1511 | |||
1512 | pos &= VFIO_PCI_OFFSET_MASK; | ||
1513 | |||
1514 | /* | ||
1515 | * We want to both keep the access size the caller uses and | ||
1516 | * support reading large chunks of config space in a single call. | ||
1517 | * PCI doesn't support unaligned accesses, so we can safely break | ||
1518 | * those apart. | ||
1519 | */ | ||
1520 | while (count) { | ||
1521 | if (count >= 4 && !(pos % 4)) | ||
1522 | ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite); | ||
1523 | else if (count >= 2 && !(pos % 2)) | ||
1524 | ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite); | ||
1525 | else | ||
1526 | ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite); | ||
1527 | |||
1528 | if (ret < 0) | ||
1529 | return ret; | ||
1530 | |||
1531 | count -= ret; | ||
1532 | done += ret; | ||
1533 | buf += ret; | ||
1534 | pos += ret; | ||
1535 | } | ||
1536 | |||
1537 | *ppos += done; | ||
1538 | |||
1539 | return done; | ||
1540 | } | ||
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c new file mode 100644 index 000000000000..211a4920b88a --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_intrs.c | |||
@@ -0,0 +1,740 @@ | |||
1 | /* | ||
2 | * VFIO PCI interrupt handling | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | */ | ||
15 | |||
16 | #include <linux/device.h> | ||
17 | #include <linux/interrupt.h> | ||
18 | #include <linux/eventfd.h> | ||
19 | #include <linux/pci.h> | ||
20 | #include <linux/file.h> | ||
21 | #include <linux/poll.h> | ||
22 | #include <linux/vfio.h> | ||
23 | #include <linux/wait.h> | ||
24 | #include <linux/workqueue.h> | ||
25 | |||
26 | #include "vfio_pci_private.h" | ||
27 | |||
28 | /* | ||
29 | * IRQfd - generic | ||
30 | */ | ||
31 | struct virqfd { | ||
32 | struct vfio_pci_device *vdev; | ||
33 | struct eventfd_ctx *eventfd; | ||
34 | int (*handler)(struct vfio_pci_device *, void *); | ||
35 | void (*thread)(struct vfio_pci_device *, void *); | ||
36 | void *data; | ||
37 | struct work_struct inject; | ||
38 | wait_queue_t wait; | ||
39 | poll_table pt; | ||
40 | struct work_struct shutdown; | ||
41 | struct virqfd **pvirqfd; | ||
42 | }; | ||
43 | |||
44 | static struct workqueue_struct *vfio_irqfd_cleanup_wq; | ||
45 | |||
46 | int __init vfio_pci_virqfd_init(void) | ||
47 | { | ||
48 | vfio_irqfd_cleanup_wq = | ||
49 | create_singlethread_workqueue("vfio-irqfd-cleanup"); | ||
50 | if (!vfio_irqfd_cleanup_wq) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | void vfio_pci_virqfd_exit(void) | ||
57 | { | ||
58 | destroy_workqueue(vfio_irqfd_cleanup_wq); | ||
59 | } | ||
60 | |||
61 | static void virqfd_deactivate(struct virqfd *virqfd) | ||
62 | { | ||
63 | queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); | ||
64 | } | ||
65 | |||
66 | static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
67 | { | ||
68 | struct virqfd *virqfd = container_of(wait, struct virqfd, wait); | ||
69 | unsigned long flags = (unsigned long)key; | ||
70 | |||
71 | if (flags & POLLIN) { | ||
72 | /* An event has been signaled, call function */ | ||
73 | if ((!virqfd->handler || | ||
74 | virqfd->handler(virqfd->vdev, virqfd->data)) && | ||
75 | virqfd->thread) | ||
76 | schedule_work(&virqfd->inject); | ||
77 | } | ||
78 | |||
79 | if (flags & POLLHUP) | ||
80 | /* The eventfd is closing, detach from VFIO */ | ||
81 | virqfd_deactivate(virqfd); | ||
82 | |||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static void virqfd_ptable_queue_proc(struct file *file, | ||
87 | wait_queue_head_t *wqh, poll_table *pt) | ||
88 | { | ||
89 | struct virqfd *virqfd = container_of(pt, struct virqfd, pt); | ||
90 | add_wait_queue(wqh, &virqfd->wait); | ||
91 | } | ||
92 | |||
93 | static void virqfd_shutdown(struct work_struct *work) | ||
94 | { | ||
95 | struct virqfd *virqfd = container_of(work, struct virqfd, shutdown); | ||
96 | struct virqfd **pvirqfd = virqfd->pvirqfd; | ||
97 | u64 cnt; | ||
98 | |||
99 | eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt); | ||
100 | flush_work(&virqfd->inject); | ||
101 | eventfd_ctx_put(virqfd->eventfd); | ||
102 | |||
103 | kfree(virqfd); | ||
104 | *pvirqfd = NULL; | ||
105 | } | ||
106 | |||
107 | static void virqfd_inject(struct work_struct *work) | ||
108 | { | ||
109 | struct virqfd *virqfd = container_of(work, struct virqfd, inject); | ||
110 | if (virqfd->thread) | ||
111 | virqfd->thread(virqfd->vdev, virqfd->data); | ||
112 | } | ||
113 | |||
114 | static int virqfd_enable(struct vfio_pci_device *vdev, | ||
115 | int (*handler)(struct vfio_pci_device *, void *), | ||
116 | void (*thread)(struct vfio_pci_device *, void *), | ||
117 | void *data, struct virqfd **pvirqfd, int fd) | ||
118 | { | ||
119 | struct file *file = NULL; | ||
120 | struct eventfd_ctx *ctx = NULL; | ||
121 | struct virqfd *virqfd; | ||
122 | int ret = 0; | ||
123 | unsigned int events; | ||
124 | |||
125 | if (*pvirqfd) | ||
126 | return -EBUSY; | ||
127 | |||
128 | virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); | ||
129 | if (!virqfd) | ||
130 | return -ENOMEM; | ||
131 | |||
132 | virqfd->pvirqfd = pvirqfd; | ||
133 | *pvirqfd = virqfd; | ||
134 | virqfd->vdev = vdev; | ||
135 | virqfd->handler = handler; | ||
136 | virqfd->thread = thread; | ||
137 | virqfd->data = data; | ||
138 | |||
139 | INIT_WORK(&virqfd->shutdown, virqfd_shutdown); | ||
140 | INIT_WORK(&virqfd->inject, virqfd_inject); | ||
141 | |||
142 | file = eventfd_fget(fd); | ||
143 | if (IS_ERR(file)) { | ||
144 | ret = PTR_ERR(file); | ||
145 | goto fail; | ||
146 | } | ||
147 | |||
148 | ctx = eventfd_ctx_fileget(file); | ||
149 | if (IS_ERR(ctx)) { | ||
150 | ret = PTR_ERR(ctx); | ||
151 | goto fail; | ||
152 | } | ||
153 | |||
154 | virqfd->eventfd = ctx; | ||
155 | |||
156 | /* | ||
157 | * Install our own custom wake-up handling so we are notified via | ||
158 | * a callback whenever someone signals the underlying eventfd. | ||
159 | */ | ||
160 | init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); | ||
161 | init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); | ||
162 | |||
163 | events = file->f_op->poll(file, &virqfd->pt); | ||
164 | |||
165 | /* | ||
166 | * Check if there was an event already pending on the eventfd | ||
167 | * before we registered and trigger it as if we didn't miss it. | ||
168 | */ | ||
169 | if (events & POLLIN) { | ||
170 | if ((!handler || handler(vdev, data)) && thread) | ||
171 | schedule_work(&virqfd->inject); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Do not drop the file until the irqfd is fully initialized, | ||
176 | * otherwise we might race against the POLLHUP. | ||
177 | */ | ||
178 | fput(file); | ||
179 | |||
180 | return 0; | ||
181 | |||
182 | fail: | ||
183 | if (ctx && !IS_ERR(ctx)) | ||
184 | eventfd_ctx_put(ctx); | ||
185 | |||
186 | if (file && !IS_ERR(file)) | ||
187 | fput(file); | ||
188 | |||
189 | kfree(virqfd); | ||
190 | *pvirqfd = NULL; | ||
191 | |||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | static void virqfd_disable(struct virqfd *virqfd) | ||
196 | { | ||
197 | if (!virqfd) | ||
198 | return; | ||
199 | |||
200 | virqfd_deactivate(virqfd); | ||
201 | |||
202 | /* Block until we know all outstanding shutdown jobs have completed. */ | ||
203 | flush_workqueue(vfio_irqfd_cleanup_wq); | ||
204 | } | ||
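
The fd plumbed into virqfd_enable() is an ordinary eventfd. A hedged userspace sketch of the other end of this contract — signal, then consume — using glibc's sys/eventfd.h helpers:

#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	eventfd_t cnt;

	if (efd < 0)
		return 1;

	eventfd_write(efd, 1);		/* what eventfd_signal() does kernel-side */
	eventfd_read(efd, &cnt);	/* consume the pending count */
	printf("events: %llu\n", (unsigned long long)cnt);
	close(efd);
	return 0;
}
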
205 | |||
206 | /* | ||
207 | * INTx | ||
208 | */ | ||
209 | static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused) | ||
210 | { | ||
211 | if (likely(is_intx(vdev) && !vdev->virq_disabled)) | ||
212 | eventfd_signal(vdev->ctx[0].trigger, 1); | ||
213 | } | ||
214 | |||
215 | void vfio_pci_intx_mask(struct vfio_pci_device *vdev) | ||
216 | { | ||
217 | struct pci_dev *pdev = vdev->pdev; | ||
218 | unsigned long flags; | ||
219 | |||
220 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
221 | |||
222 | /* | ||
223 | * Masking can come from interrupt, ioctl, or config space | ||
224 | * via INTx disable. The latter means this can get called | ||
225 | * even when not using intx delivery. In this case, just | ||
226 | * try to have the physical bit follow the virtual bit. | ||
227 | */ | ||
228 | if (unlikely(!is_intx(vdev))) { | ||
229 | if (vdev->pci_2_3) | ||
230 | pci_intx(pdev, 0); | ||
231 | } else if (!vdev->ctx[0].masked) { | ||
232 | /* | ||
233 | * Can't use check_and_mask here because we always want to | ||
234 | * mask, not just when something is pending. | ||
235 | */ | ||
236 | if (vdev->pci_2_3) | ||
237 | pci_intx(pdev, 0); | ||
238 | else | ||
239 | disable_irq_nosync(pdev->irq); | ||
240 | |||
241 | vdev->ctx[0].masked = true; | ||
242 | } | ||
243 | |||
244 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * If this is triggered by an eventfd, we can't call eventfd_signal | ||
249 | * or else we'll deadlock on the eventfd wait queue. Return >0 when | ||
250 | * a signal is necessary, which can then be handled via a work queue | ||
251 | * or directly depending on the caller. | ||
252 | */ | ||
253 | int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) | ||
254 | { | ||
255 | struct pci_dev *pdev = vdev->pdev; | ||
256 | unsigned long flags; | ||
257 | int ret = 0; | ||
258 | |||
259 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
260 | |||
261 | /* | ||
262 | * Unmasking comes from ioctl or config, so again, have the | ||
263 | * physical bit follow the virtual even when not using INTx. | ||
264 | */ | ||
265 | if (unlikely(!is_intx(vdev))) { | ||
266 | if (vdev->pci_2_3) | ||
267 | pci_intx(pdev, 1); | ||
268 | } else if (vdev->ctx[0].masked && !vdev->virq_disabled) { | ||
269 | /* | ||
270 | * A pending interrupt here would immediately trigger, | ||
271 | * but we can avoid that overhead by just re-sending | ||
272 | * the interrupt to the user. | ||
273 | */ | ||
274 | if (vdev->pci_2_3) { | ||
275 | if (!pci_check_and_unmask_intx(pdev)) | ||
276 | ret = 1; | ||
277 | } else | ||
278 | enable_irq(pdev->irq); | ||
279 | |||
280 | vdev->ctx[0].masked = (ret > 0); | ||
281 | } | ||
282 | |||
283 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
284 | |||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) | ||
289 | { | ||
290 | if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) | ||
291 | vfio_send_intx_eventfd(vdev, NULL); | ||
292 | } | ||
293 | |||
294 | static irqreturn_t vfio_intx_handler(int irq, void *dev_id) | ||
295 | { | ||
296 | struct vfio_pci_device *vdev = dev_id; | ||
297 | unsigned long flags; | ||
298 | int ret = IRQ_NONE; | ||
299 | |||
300 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
301 | |||
302 | if (!vdev->pci_2_3) { | ||
303 | disable_irq_nosync(vdev->pdev->irq); | ||
304 | vdev->ctx[0].masked = true; | ||
305 | ret = IRQ_HANDLED; | ||
306 | } else if (!vdev->ctx[0].masked && /* may be shared */ | ||
307 | pci_check_and_mask_intx(vdev->pdev)) { | ||
308 | vdev->ctx[0].masked = true; | ||
309 | ret = IRQ_HANDLED; | ||
310 | } | ||
311 | |||
312 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
313 | |||
314 | if (ret == IRQ_HANDLED) | ||
315 | vfio_send_intx_eventfd(vdev, NULL); | ||
316 | |||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static int vfio_intx_enable(struct vfio_pci_device *vdev) | ||
321 | { | ||
322 | if (!is_irq_none(vdev)) | ||
323 | return -EINVAL; | ||
324 | |||
325 | if (!vdev->pdev->irq) | ||
326 | return -ENODEV; | ||
327 | |||
328 | vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); | ||
329 | if (!vdev->ctx) | ||
330 | return -ENOMEM; | ||
331 | |||
332 | vdev->num_ctx = 1; | ||
333 | vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; | ||
334 | |||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) | ||
339 | { | ||
340 | struct pci_dev *pdev = vdev->pdev; | ||
341 | unsigned long irqflags = IRQF_SHARED; | ||
342 | struct eventfd_ctx *trigger; | ||
343 | unsigned long flags; | ||
344 | int ret; | ||
345 | |||
346 | if (vdev->ctx[0].trigger) { | ||
347 | free_irq(pdev->irq, vdev); | ||
348 | kfree(vdev->ctx[0].name); | ||
349 | eventfd_ctx_put(vdev->ctx[0].trigger); | ||
350 | vdev->ctx[0].trigger = NULL; | ||
351 | } | ||
352 | |||
353 | if (fd < 0) /* Disable only */ | ||
354 | return 0; | ||
355 | |||
356 | vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)", | ||
357 | pci_name(pdev)); | ||
358 | if (!vdev->ctx[0].name) | ||
359 | return -ENOMEM; | ||
360 | |||
361 | trigger = eventfd_ctx_fdget(fd); | ||
362 | if (IS_ERR(trigger)) { | ||
363 | kfree(vdev->ctx[0].name); | ||
364 | return PTR_ERR(trigger); | ||
365 | } | ||
366 | |||
367 | if (!vdev->pci_2_3) | ||
368 | irqflags = 0; | ||
369 | |||
370 | ret = request_irq(pdev->irq, vfio_intx_handler, | ||
371 | irqflags, vdev->ctx[0].name, vdev); | ||
372 | if (ret) { | ||
373 | kfree(vdev->ctx[0].name); | ||
374 | eventfd_ctx_put(trigger); | ||
375 | return ret; | ||
376 | } | ||
377 | |||
378 | vdev->ctx[0].trigger = trigger; | ||
379 | |||
380 | /* | ||
381 | * INTx disable will stick across the new irq setup, | ||
382 | * disable_irq won't. | ||
383 | */ | ||
384 | spin_lock_irqsave(&vdev->irqlock, flags); | ||
385 | if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled)) | ||
386 | disable_irq_nosync(pdev->irq); | ||
387 | spin_unlock_irqrestore(&vdev->irqlock, flags); | ||
388 | |||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | static void vfio_intx_disable(struct vfio_pci_device *vdev) | ||
393 | { | ||
394 | vfio_intx_set_signal(vdev, -1); | ||
395 | virqfd_disable(vdev->ctx[0].unmask); | ||
396 | virqfd_disable(vdev->ctx[0].mask); | ||
397 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
398 | vdev->num_ctx = 0; | ||
399 | kfree(vdev->ctx); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * MSI/MSI-X | ||
404 | */ | ||
405 | static irqreturn_t vfio_msihandler(int irq, void *arg) | ||
406 | { | ||
407 | struct eventfd_ctx *trigger = arg; | ||
408 | |||
409 | eventfd_signal(trigger, 1); | ||
410 | return IRQ_HANDLED; | ||
411 | } | ||
412 | |||
413 | static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) | ||
414 | { | ||
415 | struct pci_dev *pdev = vdev->pdev; | ||
416 | int ret; | ||
417 | |||
418 | if (!is_irq_none(vdev)) | ||
419 | return -EINVAL; | ||
420 | |||
421 | vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); | ||
422 | if (!vdev->ctx) | ||
423 | return -ENOMEM; | ||
424 | |||
425 | if (msix) { | ||
426 | int i; | ||
427 | |||
428 | vdev->msix = kzalloc(nvec * sizeof(struct msix_entry), | ||
429 | GFP_KERNEL); | ||
430 | if (!vdev->msix) { | ||
431 | kfree(vdev->ctx); | ||
432 | return -ENOMEM; | ||
433 | } | ||
434 | |||
435 | for (i = 0; i < nvec; i++) | ||
436 | vdev->msix[i].entry = i; | ||
437 | |||
438 | ret = pci_enable_msix(pdev, vdev->msix, nvec); | ||
439 | if (ret) { | ||
440 | kfree(vdev->msix); | ||
441 | kfree(vdev->ctx); | ||
442 | return ret; | ||
443 | } | ||
444 | } else { | ||
445 | ret = pci_enable_msi_block(pdev, nvec); | ||
446 | if (ret) { | ||
447 | kfree(vdev->ctx); | ||
448 | return ret; | ||
449 | } | ||
450 | } | ||
451 | |||
452 | vdev->num_ctx = nvec; | ||
453 | vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX : | ||
454 | VFIO_PCI_MSI_IRQ_INDEX; | ||
455 | |||
456 | if (!msix) { | ||
457 | /* | ||
458 | * Compute the virtual hardware field for max msi vectors - | ||
459 | * it is the log base 2, rounded up, of the number of vectors. | ||
460 | */ | ||
461 | vdev->msi_qmax = fls(nvec * 2 - 1) - 1; | ||
462 | } | ||
463 | |||
464 | return 0; | ||
465 | } | ||
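
The fls() expression computes ceil(log2(nvec)), which is exactly how the MSI capability encodes vector counts. A quick standalone check; fls32() is a portable stand-in for the kernel's fls():

#include <stdio.h>

static int fls32(unsigned int v)	/* stand-in for the kernel's fls() */
{
	int n = 0;

	while (v) {
		n++;
		v >>= 1;
	}
	return n;
}

int main(void)
{
	int nvec;

	for (nvec = 1; nvec <= 8; nvec++)
		printf("nvec %d -> qmax %d\n", nvec, fls32(nvec * 2 - 1) - 1);
	return 0;	/* 1->0, 2->1, 3..4->2, 5..8->3 */
}
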
466 | |||
467 | static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, | ||
468 | int vector, int fd, bool msix) | ||
469 | { | ||
470 | struct pci_dev *pdev = vdev->pdev; | ||
471 | int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; | ||
472 | char *name = msix ? "vfio-msix" : "vfio-msi"; | ||
473 | struct eventfd_ctx *trigger; | ||
474 | int ret; | ||
475 | |||
476 | if (vector >= vdev->num_ctx) | ||
477 | return -EINVAL; | ||
478 | |||
479 | if (vdev->ctx[vector].trigger) { | ||
480 | free_irq(irq, vdev->ctx[vector].trigger); | ||
481 | kfree(vdev->ctx[vector].name); | ||
482 | eventfd_ctx_put(vdev->ctx[vector].trigger); | ||
483 | vdev->ctx[vector].trigger = NULL; | ||
484 | } | ||
485 | |||
486 | if (fd < 0) | ||
487 | return 0; | ||
488 | |||
489 | vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)", | ||
490 | name, vector, pci_name(pdev)); | ||
491 | if (!vdev->ctx[vector].name) | ||
492 | return -ENOMEM; | ||
493 | |||
494 | trigger = eventfd_ctx_fdget(fd); | ||
495 | if (IS_ERR(trigger)) { | ||
496 | kfree(vdev->ctx[vector].name); | ||
497 | return PTR_ERR(trigger); | ||
498 | } | ||
499 | |||
500 | ret = request_irq(irq, vfio_msihandler, 0, | ||
501 | vdev->ctx[vector].name, trigger); | ||
502 | if (ret) { | ||
503 | kfree(vdev->ctx[vector].name); | ||
504 | eventfd_ctx_put(trigger); | ||
505 | return ret; | ||
506 | } | ||
507 | |||
508 | vdev->ctx[vector].trigger = trigger; | ||
509 | |||
510 | return 0; | ||
511 | } | ||
512 | |||
513 | static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, | ||
514 | unsigned count, int32_t *fds, bool msix) | ||
515 | { | ||
516 | int i, j, ret = 0; | ||
517 | |||
518 | if (start + count > vdev->num_ctx) | ||
519 | return -EINVAL; | ||
520 | |||
521 | for (i = 0, j = start; i < count && !ret; i++, j++) { | ||
522 | int fd = fds ? fds[i] : -1; | ||
523 | ret = vfio_msi_set_vector_signal(vdev, j, fd, msix); | ||
524 | } | ||
525 | |||
526 | if (ret) { | ||
527 | for (i = start; i < j; i++) /* unwind; 'j >= start' would wrap when start == 0 */ | ||
528 | vfio_msi_set_vector_signal(vdev, i, -1, msix); | ||
529 | } | ||
530 | |||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) | ||
535 | { | ||
536 | struct pci_dev *pdev = vdev->pdev; | ||
537 | int i; | ||
538 | |||
539 | vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix); | ||
540 | |||
541 | for (i = 0; i < vdev->num_ctx; i++) { | ||
542 | virqfd_disable(vdev->ctx[i].unmask); | ||
543 | virqfd_disable(vdev->ctx[i].mask); | ||
544 | } | ||
545 | |||
546 | if (msix) { | ||
547 | pci_disable_msix(vdev->pdev); | ||
548 | kfree(vdev->msix); | ||
549 | } else | ||
550 | pci_disable_msi(pdev); | ||
551 | |||
552 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | ||
553 | vdev->num_ctx = 0; | ||
554 | kfree(vdev->ctx); | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | * IOCTL support | ||
559 | */ | ||
560 | static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, | ||
561 | unsigned index, unsigned start, | ||
562 | unsigned count, uint32_t flags, void *data) | ||
563 | { | ||
564 | if (!is_intx(vdev) || start != 0 || count != 1) | ||
565 | return -EINVAL; | ||
566 | |||
567 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
568 | vfio_pci_intx_unmask(vdev); | ||
569 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
570 | uint8_t unmask = *(uint8_t *)data; | ||
571 | if (unmask) | ||
572 | vfio_pci_intx_unmask(vdev); | ||
573 | } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
574 | int32_t fd = *(int32_t *)data; | ||
575 | if (fd >= 0) | ||
576 | return virqfd_enable(vdev, vfio_pci_intx_unmask_handler, | ||
577 | vfio_send_intx_eventfd, NULL, | ||
578 | &vdev->ctx[0].unmask, fd); | ||
579 | |||
580 | virqfd_disable(vdev->ctx[0].unmask); | ||
581 | } | ||
582 | |||
583 | return 0; | ||
584 | } | ||
585 | |||
586 | static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, | ||
587 | unsigned index, unsigned start, | ||
588 | unsigned count, uint32_t flags, void *data) | ||
589 | { | ||
590 | if (!is_intx(vdev) || start != 0 || count != 1) | ||
591 | return -EINVAL; | ||
592 | |||
593 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
594 | vfio_pci_intx_mask(vdev); | ||
595 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
596 | uint8_t mask = *(uint8_t *)data; | ||
597 | if (mask) | ||
598 | vfio_pci_intx_mask(vdev); | ||
599 | } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
600 | return -ENOTTY; /* XXX implement me */ | ||
601 | } | ||
602 | |||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, | ||
607 | unsigned index, unsigned start, | ||
608 | unsigned count, uint32_t flags, void *data) | ||
609 | { | ||
610 | if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
611 | vfio_intx_disable(vdev); | ||
612 | return 0; | ||
613 | } | ||
614 | |||
615 | if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1) | ||
616 | return -EINVAL; | ||
617 | |||
618 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
619 | int32_t fd = *(int32_t *)data; | ||
620 | int ret; | ||
621 | |||
622 | if (is_intx(vdev)) | ||
623 | return vfio_intx_set_signal(vdev, fd); | ||
624 | |||
625 | ret = vfio_intx_enable(vdev); | ||
626 | if (ret) | ||
627 | return ret; | ||
628 | |||
629 | ret = vfio_intx_set_signal(vdev, fd); | ||
630 | if (ret) | ||
631 | vfio_intx_disable(vdev); | ||
632 | |||
633 | return ret; | ||
634 | } | ||
635 | |||
636 | if (!is_intx(vdev)) | ||
637 | return -EINVAL; | ||
638 | |||
639 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
640 | vfio_send_intx_eventfd(vdev, NULL); | ||
641 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
642 | uint8_t trigger = *(uint8_t *)data; | ||
643 | if (trigger) | ||
644 | vfio_send_intx_eventfd(vdev, NULL); | ||
645 | } | ||
646 | return 0; | ||
647 | } | ||
648 | |||
649 | static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, | ||
650 | unsigned index, unsigned start, | ||
651 | unsigned count, uint32_t flags, void *data) | ||
652 | { | ||
653 | int i; | ||
654 | bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX); | ||
655 | |||
656 | if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { | ||
657 | vfio_msi_disable(vdev, msix); | ||
658 | return 0; | ||
659 | } | ||
660 | |||
661 | if (!(irq_is(vdev, index) || is_irq_none(vdev))) | ||
662 | return -EINVAL; | ||
663 | |||
664 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
665 | int32_t *fds = data; | ||
666 | int ret; | ||
667 | |||
668 | if (vdev->irq_type == index) | ||
669 | return vfio_msi_set_block(vdev, start, count, | ||
670 | fds, msix); | ||
671 | |||
672 | ret = vfio_msi_enable(vdev, start + count, msix); | ||
673 | if (ret) | ||
674 | return ret; | ||
675 | |||
676 | ret = vfio_msi_set_block(vdev, start, count, fds, msix); | ||
677 | if (ret) | ||
678 | vfio_msi_disable(vdev, msix); | ||
679 | |||
680 | return ret; | ||
681 | } | ||
682 | |||
683 | if (!irq_is(vdev, index) || start + count > vdev->num_ctx) | ||
684 | return -EINVAL; | ||
685 | |||
686 | for (i = start; i < start + count; i++) { | ||
687 | if (!vdev->ctx[i].trigger) | ||
688 | continue; | ||
689 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
690 | eventfd_signal(vdev->ctx[i].trigger, 1); | ||
691 | } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { | ||
692 | uint8_t *bools = data; | ||
693 | if (bools[i - start]) | ||
694 | eventfd_signal(vdev->ctx[i].trigger, 1); | ||
695 | } | ||
696 | } | ||
697 | return 0; | ||
698 | } | ||
699 | |||
700 | int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, | ||
701 | unsigned index, unsigned start, unsigned count, | ||
702 | void *data) | ||
703 | { | ||
704 | int (*func)(struct vfio_pci_device *vdev, unsigned index, | ||
705 | unsigned start, unsigned count, uint32_t flags, | ||
706 | void *data) = NULL; | ||
707 | |||
708 | switch (index) { | ||
709 | case VFIO_PCI_INTX_IRQ_INDEX: | ||
710 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
711 | case VFIO_IRQ_SET_ACTION_MASK: | ||
712 | func = vfio_pci_set_intx_mask; | ||
713 | break; | ||
714 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
715 | func = vfio_pci_set_intx_unmask; | ||
716 | break; | ||
717 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
718 | func = vfio_pci_set_intx_trigger; | ||
719 | break; | ||
720 | } | ||
721 | break; | ||
722 | case VFIO_PCI_MSI_IRQ_INDEX: | ||
723 | case VFIO_PCI_MSIX_IRQ_INDEX: | ||
724 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
725 | case VFIO_IRQ_SET_ACTION_MASK: | ||
726 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
727 | /* XXX Need masking support exported */ | ||
728 | break; | ||
729 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
730 | func = vfio_pci_set_msi_trigger; | ||
731 | break; | ||
732 | } | ||
733 | break; | ||
734 | } | ||
735 | |||
736 | if (!func) | ||
737 | return -ENOTTY; | ||
738 | |||
739 | return func(vdev, index, start, count, flags, data); | ||
740 | } | ||
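
From userspace this dispatcher is reached through VFIO_DEVICE_SET_IRQS. A hedged sketch of attaching an eventfd as the trigger for MSI vector 0; error handling is elided, device_fd is assumed to be an open VFIO device fd, and the vfio.h from this series is assumed installed:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <linux/vfio.h>

static int enable_msi_vector0(int device_fd)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
	struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;
	int32_t efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(buf, 0, sizeof(buf));
	irq_set->argsz = sizeof(buf);
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
	irq_set->start = 0;
	irq_set->count = 1;
	memcpy(irq_set->data, &efd, sizeof(efd));	/* eventfd rides in data[] */

	return ioctl(device_fd, VFIO_DEVICE_SET_IRQS, irq_set);
}
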
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h new file mode 100644 index 000000000000..611827cba8cd --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_private.h | |||
@@ -0,0 +1,91 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
3 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * Derived from original vfio: | ||
10 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
11 | * Author: Tom Lyon, pugs@cisco.com | ||
12 | */ | ||
13 | |||
14 | #include <linux/mutex.h> | ||
15 | #include <linux/pci.h> | ||
16 | |||
17 | #ifndef VFIO_PCI_PRIVATE_H | ||
18 | #define VFIO_PCI_PRIVATE_H | ||
19 | |||
20 | #define VFIO_PCI_OFFSET_SHIFT 40 | ||
21 | |||
22 | #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) | ||
23 | #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) | ||
24 | #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) | ||
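
These macros carve a device file offset into a region index (upper bits) and an in-region offset (lower 40 bits). A standalone sketch of the encoding; index 7 matches VFIO_PCI_CONFIG_REGION_INDEX from the vfio.h changes at the end of this patch:

#include <stdio.h>
#include <stdint.h>

#define OFFSET_SHIFT 40		/* VFIO_PCI_OFFSET_SHIFT */
#define OFFSET_MASK  (((uint64_t)1 << OFFSET_SHIFT) - 1)

int main(void)
{
	/* index 7 (the config region) at byte 0x44 */
	uint64_t off = ((uint64_t)7 << OFFSET_SHIFT) | 0x44;

	printf("index %llu offset 0x%llx\n",
	       (unsigned long long)(off >> OFFSET_SHIFT),
	       (unsigned long long)(off & OFFSET_MASK));
	return 0;		/* index 7 offset 0x44 */
}
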
25 | |||
26 | struct vfio_pci_irq_ctx { | ||
27 | struct eventfd_ctx *trigger; | ||
28 | struct virqfd *unmask; | ||
29 | struct virqfd *mask; | ||
30 | char *name; | ||
31 | bool masked; | ||
32 | }; | ||
33 | |||
34 | struct vfio_pci_device { | ||
35 | struct pci_dev *pdev; | ||
36 | void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; | ||
37 | u8 *pci_config_map; | ||
38 | u8 *vconfig; | ||
39 | struct perm_bits *msi_perm; | ||
40 | spinlock_t irqlock; | ||
41 | struct mutex igate; | ||
42 | struct msix_entry *msix; | ||
43 | struct vfio_pci_irq_ctx *ctx; | ||
44 | int num_ctx; | ||
45 | int irq_type; | ||
46 | u8 msi_qmax; | ||
47 | u8 msix_bar; | ||
48 | u16 msix_size; | ||
49 | u32 msix_offset; | ||
50 | u32 rbar[7]; | ||
51 | bool pci_2_3; | ||
52 | bool virq_disabled; | ||
53 | bool reset_works; | ||
54 | bool extended_caps; | ||
55 | bool bardirty; | ||
56 | struct pci_saved_state *pci_saved_state; | ||
57 | atomic_t refcnt; | ||
58 | }; | ||
59 | |||
60 | #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) | ||
61 | #define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) | ||
62 | #define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) | ||
63 | #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) | ||
64 | #define irq_is(vdev, type) (vdev->irq_type == type) | ||
65 | |||
66 | extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); | ||
67 | extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); | ||
68 | |||
69 | extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, | ||
70 | uint32_t flags, unsigned index, | ||
71 | unsigned start, unsigned count, void *data); | ||
72 | |||
73 | extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, | ||
74 | char __user *buf, size_t count, | ||
75 | loff_t *ppos, bool iswrite); | ||
76 | extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, | ||
77 | char __user *buf, size_t count, | ||
78 | loff_t *ppos, bool iswrite); | ||
79 | extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, | ||
80 | char __user *buf, size_t count, | ||
81 | loff_t *ppos, bool iswrite); | ||
82 | |||
83 | extern int vfio_pci_init_perm_bits(void); | ||
84 | extern void vfio_pci_uninit_perm_bits(void); | ||
85 | |||
86 | extern int vfio_pci_virqfd_init(void); | ||
87 | extern void vfio_pci_virqfd_exit(void); | ||
88 | |||
89 | extern int vfio_config_init(struct vfio_pci_device *vdev); | ||
90 | extern void vfio_config_free(struct vfio_pci_device *vdev); | ||
91 | #endif /* VFIO_PCI_PRIVATE_H */ | ||
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c new file mode 100644 index 000000000000..4362d9e7baa3 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_rdwr.c | |||
@@ -0,0 +1,269 @@ | |||
1 | /* | ||
2 | * VFIO PCI I/O Port & MMIO access | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | */ | ||
15 | |||
16 | #include <linux/fs.h> | ||
17 | #include <linux/pci.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/io.h> | ||
20 | |||
21 | #include "vfio_pci_private.h" | ||
22 | |||
23 | /* I/O Port BAR access */ | ||
24 | ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf, | ||
25 | size_t count, loff_t *ppos, bool iswrite) | ||
26 | { | ||
27 | struct pci_dev *pdev = vdev->pdev; | ||
28 | loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; | ||
29 | int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
30 | void __iomem *io; | ||
31 | size_t done = 0; | ||
32 | |||
33 | if (!pci_resource_start(pdev, bar)) | ||
34 | return -EINVAL; | ||
35 | |||
36 | if (pos + count > pci_resource_len(pdev, bar)) | ||
37 | return -EINVAL; | ||
38 | |||
39 | if (!vdev->barmap[bar]) { | ||
40 | int ret; | ||
41 | |||
42 | ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); | ||
43 | if (ret) | ||
44 | return ret; | ||
45 | |||
46 | vdev->barmap[bar] = pci_iomap(pdev, bar, 0); | ||
47 | |||
48 | if (!vdev->barmap[bar]) { | ||
49 | pci_release_selected_regions(pdev, 1 << bar); | ||
50 | return -EINVAL; | ||
51 | } | ||
52 | } | ||
53 | |||
54 | io = vdev->barmap[bar]; | ||
55 | |||
56 | while (count) { | ||
57 | int filled; | ||
58 | |||
59 | if (count >= 4 && !(pos % 4)) { /* was '>= 3'; a dword access consumes 4 bytes of buf */ | ||
60 | __le32 val; | ||
61 | |||
62 | if (iswrite) { | ||
63 | if (copy_from_user(&val, buf, 4)) | ||
64 | return -EFAULT; | ||
65 | |||
66 | iowrite32(le32_to_cpu(val), io + pos); | ||
67 | } else { | ||
68 | val = cpu_to_le32(ioread32(io + pos)); | ||
69 | |||
70 | if (copy_to_user(buf, &val, 4)) | ||
71 | return -EFAULT; | ||
72 | } | ||
73 | |||
74 | filled = 4; | ||
75 | |||
76 | } else if ((pos % 2) == 0 && count >= 2) { | ||
77 | __le16 val; | ||
78 | |||
79 | if (iswrite) { | ||
80 | if (copy_from_user(&val, buf, 2)) | ||
81 | return -EFAULT; | ||
82 | |||
83 | iowrite16(le16_to_cpu(val), io + pos); | ||
84 | } else { | ||
85 | val = cpu_to_le16(ioread16(io + pos)); | ||
86 | |||
87 | if (copy_to_user(buf, &val, 2)) | ||
88 | return -EFAULT; | ||
89 | } | ||
90 | |||
91 | filled = 2; | ||
92 | } else { | ||
93 | u8 val; | ||
94 | |||
95 | if (iswrite) { | ||
96 | if (copy_from_user(&val, buf, 1)) | ||
97 | return -EFAULT; | ||
98 | |||
99 | iowrite8(val, io + pos); | ||
100 | } else { | ||
101 | val = ioread8(io + pos); | ||
102 | |||
103 | if (copy_to_user(buf, &val, 1)) | ||
104 | return -EFAULT; | ||
105 | } | ||
106 | |||
107 | filled = 1; | ||
108 | } | ||
109 | |||
110 | count -= filled; | ||
111 | done += filled; | ||
112 | buf += filled; | ||
113 | pos += filled; | ||
114 | } | ||
115 | |||
116 | *ppos += done; | ||
117 | |||
118 | return done; | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * MMIO BAR access | ||
123 | * We also handle two excluded ranges here: if the user reads the ROM | ||
124 | * beyond what PCI tells us is available, or touches the MSI-X table | ||
125 | * region, reads return 0xFF and writes are dropped. | ||
126 | */ | ||
127 | ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf, | ||
128 | size_t count, loff_t *ppos, bool iswrite) | ||
129 | { | ||
130 | struct pci_dev *pdev = vdev->pdev; | ||
131 | loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; | ||
132 | int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); | ||
133 | void __iomem *io; | ||
134 | resource_size_t end; | ||
135 | size_t done = 0; | ||
136 | size_t x_start = 0, x_end = 0; /* excluded range */ | ||
137 | |||
138 | if (!pci_resource_start(pdev, bar)) | ||
139 | return -EINVAL; | ||
140 | |||
141 | end = pci_resource_len(pdev, bar); | ||
142 | |||
143 | if (pos > end) | ||
144 | return -EINVAL; | ||
145 | |||
146 | if (pos == end) | ||
147 | return 0; | ||
148 | |||
149 | if (pos + count > end) | ||
150 | count = end - pos; | ||
151 | |||
152 | if (bar == PCI_ROM_RESOURCE) { | ||
153 | io = pci_map_rom(pdev, &x_start); | ||
154 | x_end = end; | ||
155 | } else { | ||
156 | if (!vdev->barmap[bar]) { | ||
157 | int ret; | ||
158 | |||
159 | ret = pci_request_selected_regions(pdev, 1 << bar, | ||
160 | "vfio"); | ||
161 | if (ret) | ||
162 | return ret; | ||
163 | |||
164 | vdev->barmap[bar] = pci_iomap(pdev, bar, 0); | ||
165 | |||
166 | if (!vdev->barmap[bar]) { | ||
167 | pci_release_selected_regions(pdev, 1 << bar); | ||
168 | return -EINVAL; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | io = vdev->barmap[bar]; | ||
173 | |||
174 | if (bar == vdev->msix_bar) { | ||
175 | x_start = vdev->msix_offset; | ||
176 | x_end = vdev->msix_offset + vdev->msix_size; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | if (!io) | ||
181 | return -EINVAL; | ||
182 | |||
183 | while (count) { | ||
184 | size_t fillable, filled; | ||
185 | |||
186 | if (pos < x_start) | ||
187 | fillable = x_start - pos; | ||
188 | else if (pos >= x_end) | ||
189 | fillable = end - pos; | ||
190 | else | ||
191 | fillable = 0; | ||
192 | |||
193 | if (fillable >= 4 && !(pos % 4) && (count >= 4)) { | ||
194 | __le32 val; | ||
195 | |||
196 | if (iswrite) { | ||
197 | if (copy_from_user(&val, buf, 4)) | ||
198 | goto out; | ||
199 | |||
200 | iowrite32(le32_to_cpu(val), io + pos); | ||
201 | } else { | ||
202 | val = cpu_to_le32(ioread32(io + pos)); | ||
203 | |||
204 | if (copy_to_user(buf, &val, 4)) | ||
205 | goto out; | ||
206 | } | ||
207 | |||
208 | filled = 4; | ||
209 | } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) { | ||
210 | __le16 val; | ||
211 | |||
212 | if (iswrite) { | ||
213 | if (copy_from_user(&val, buf, 2)) | ||
214 | goto out; | ||
215 | |||
216 | iowrite16(le16_to_cpu(val), io + pos); | ||
217 | } else { | ||
218 | val = cpu_to_le16(ioread16(io + pos)); | ||
219 | |||
220 | if (copy_to_user(buf, &val, 2)) | ||
221 | goto out; | ||
222 | } | ||
223 | |||
224 | filled = 2; | ||
225 | } else if (fillable) { | ||
226 | u8 val; | ||
227 | |||
228 | if (iswrite) { | ||
229 | if (copy_from_user(&val, buf, 1)) | ||
230 | goto out; | ||
231 | |||
232 | iowrite8(val, io + pos); | ||
233 | } else { | ||
234 | val = ioread8(io + pos); | ||
235 | |||
236 | if (copy_to_user(buf, &val, 1)) | ||
237 | goto out; | ||
238 | } | ||
239 | |||
240 | filled = 1; | ||
241 | } else { | ||
242 | /* Drop writes, fill reads with FF; clamp to the caller's count */ | ||
243 | filled = min(count, (size_t)(x_end - pos)); | ||
244 | if (!iswrite) { | ||
245 | char val = 0xFF; | ||
246 | size_t i; | ||
247 | | ||
248 | for (i = 0; i < filled; i++) { | ||
249 | if (put_user(val, buf + i)) | ||
250 | goto out; | ||
251 | } | ||
252 | } | ||
253 | | ||
254 | } | ||
255 | |||
256 | count -= filled; | ||
257 | done += filled; | ||
258 | buf += filled; | ||
259 | pos += filled; | ||
260 | } | ||
261 | |||
262 | *ppos += done; | ||
263 | |||
264 | out: | ||
265 | if (bar == PCI_ROM_RESOURCE) | ||
266 | pci_unmap_rom(pdev, io); | ||
267 | |||
268 | return count ? -EFAULT : done; | ||
269 | } | ||
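
The fillable computation above partitions a BAR into device-backed and excluded byte ranges. A standalone sketch of that partitioning with made-up bounds:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t pos, end = 32, x_start = 8, x_end = 16;	/* made-up bounds */

	for (pos = 0; pos < end; pos++) {
		size_t fillable;

		if (pos < x_start)
			fillable = x_start - pos;
		else if (pos >= x_end)
			fillable = end - pos;
		else
			fillable = 0;	/* inside the excluded window */

		printf("0x%02zx: %s\n", pos,
		       fillable ? "device access" : "read 0xFF / drop write");
	}
	return 0;
}
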
diff --git a/include/linux/vfio.h b/include/linux/vfio.h index acb046fd5b70..0a4f180a11d8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h | |||
@@ -223,6 +223,7 @@ struct vfio_device_info { | |||
223 | __u32 argsz; | 223 | __u32 argsz; |
224 | __u32 flags; | 224 | __u32 flags; |
225 | #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ | 225 | #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ |
226 | #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ | ||
226 | __u32 num_regions; /* Max region index + 1 */ | 227 | __u32 num_regions; /* Max region index + 1 */ |
227 | __u32 num_irqs; /* Max IRQ index + 1 */ | 228 | __u32 num_irqs; /* Max IRQ index + 1 */ |
228 | }; | 229 | }; |
@@ -364,6 +365,31 @@ struct vfio_irq_set { | |||
364 | */ | 365 | */ |
365 | #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) | 366 | #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) |
366 | 367 | ||
368 | /* | ||
369 | * The VFIO-PCI bus driver makes use of the following fixed region and | ||
370 | * IRQ index mapping. Unimplemented regions return a size of zero. | ||
371 | * Unimplemented IRQ types return a count of zero. | ||
372 | */ | ||
373 | |||
374 | enum { | ||
375 | VFIO_PCI_BAR0_REGION_INDEX, | ||
376 | VFIO_PCI_BAR1_REGION_INDEX, | ||
377 | VFIO_PCI_BAR2_REGION_INDEX, | ||
378 | VFIO_PCI_BAR3_REGION_INDEX, | ||
379 | VFIO_PCI_BAR4_REGION_INDEX, | ||
380 | VFIO_PCI_BAR5_REGION_INDEX, | ||
381 | VFIO_PCI_ROM_REGION_INDEX, | ||
382 | VFIO_PCI_CONFIG_REGION_INDEX, | ||
383 | VFIO_PCI_NUM_REGIONS | ||
384 | }; | ||
385 | |||
386 | enum { | ||
387 | VFIO_PCI_INTX_IRQ_INDEX, | ||
388 | VFIO_PCI_MSI_IRQ_INDEX, | ||
389 | VFIO_PCI_MSIX_IRQ_INDEX, | ||
390 | VFIO_PCI_NUM_IRQS | ||
391 | }; | ||
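
Pairing these indexes with VFIO_DEVICE_GET_REGION_INFO yields the file offset to pread/pwrite; for vfio-pci that offset is VFIO_PCI_INDEX_TO_OFFSET(index). A hedged userspace sketch reading the vendor ID through the config region — device_fd is assumed to be an open VFIO device fd, error handling is minimal:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int read_vendor_id(int device_fd, uint16_t *vendor)
{
	struct vfio_region_info info;

	memset(&info, 0, sizeof(info));
	info.argsz = sizeof(info);
	info.index = VFIO_PCI_CONFIG_REGION_INDEX;

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -1;

	/* in vfio-pci, info.offset == VFIO_PCI_INDEX_TO_OFFSET(index) */
	if (pread(device_fd, vendor, sizeof(*vendor), info.offset) !=
	    sizeof(*vendor))
		return -1;

	return 0;
}
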
392 | |||
367 | /* -------- API for Type1 VFIO IOMMU -------- */ | 393 | /* -------- API for Type1 VFIO IOMMU -------- */ |
368 | 394 | ||
369 | /** | 395 | /** |