-rw-r--r--  drivers/vfio/Kconfig                 |    2
-rw-r--r--  drivers/vfio/pci/Kconfig             |    8
-rw-r--r--  drivers/vfio/pci/Makefile            |    4
-rw-r--r--  drivers/vfio/pci/vfio_pci.c          |  579
-rw-r--r--  drivers/vfio/pci/vfio_pci_config.c   | 1540
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c    |  740
-rw-r--r--  drivers/vfio/pci/vfio_pci_private.h  |   91
-rw-r--r--  drivers/vfio/pci/vfio_pci_rdwr.c     |  269
-rw-r--r--  include/linux/vfio.h                 |   26
9 files changed, 3259 insertions(+), 0 deletions(-)
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 128b97910b8e..7cd5dec0abd1 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -12,3 +12,5 @@ menuconfig VFIO
12	  See Documentation/vfio.txt for more details.
13
14	  If you don't know what to do here, say N.
15
16source "drivers/vfio/pci/Kconfig"
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 000000000000..5980758563eb
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,8 @@
1config VFIO_PCI
2 tristate "VFIO support for PCI devices"
3 depends on VFIO && PCI && EVENTFD
4 help
5 Support for the PCI VFIO bus driver. This is required to make
6 use of PCI drivers using the VFIO framework.
7
8 If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 000000000000..131079255fd9
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,4 @@
1
2vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
3
4obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
3 * Author: Alex Williamson <alex.williamson@redhat.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * Derived from original vfio:
10 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
11 * Author: Tom Lyon, pugs@cisco.com
12 */
13
14#include <linux/device.h>
15#include <linux/eventfd.h>
16#include <linux/interrupt.h>
17#include <linux/iommu.h>
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/notifier.h>
21#include <linux/pci.h>
22#include <linux/pm_runtime.h>
23#include <linux/slab.h>
24#include <linux/types.h>
25#include <linux/uaccess.h>
26#include <linux/vfio.h>
27
28#include "vfio_pci_private.h"
29
30#define DRIVER_VERSION "0.2"
31#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
32#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
33
34static bool nointxmask;
35module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
36MODULE_PARM_DESC(nointxmask,
37 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
38
39static int vfio_pci_enable(struct vfio_pci_device *vdev)
40{
41 struct pci_dev *pdev = vdev->pdev;
42 int ret;
43 u16 cmd;
44 u8 msix_pos;
45
46 vdev->reset_works = (pci_reset_function(pdev) == 0);
47 pci_save_state(pdev);
48 vdev->pci_saved_state = pci_store_saved_state(pdev);
49 if (!vdev->pci_saved_state)
50 pr_debug("%s: Couldn't store %s saved state\n",
51 __func__, dev_name(&pdev->dev));
52
53 ret = vfio_config_init(vdev);
54 if (ret)
55 goto out;
56
57 if (likely(!nointxmask))
58 vdev->pci_2_3 = pci_intx_mask_supported(pdev);
59
60 pci_read_config_word(pdev, PCI_COMMAND, &cmd);
61 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
62 cmd &= ~PCI_COMMAND_INTX_DISABLE;
63 pci_write_config_word(pdev, PCI_COMMAND, cmd);
64 }
65
66 msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
67 if (msix_pos) {
68 u16 flags;
69 u32 table;
70
71 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
72 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
73
74 vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
75 vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
76 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
77 } else
78 vdev->msix_bar = 0xFF;
79
80 ret = pci_enable_device(pdev);
81 if (ret)
82 goto out;
83
84 return ret;
85
86out:
87 kfree(vdev->pci_saved_state);
88 vdev->pci_saved_state = NULL;
89 vfio_config_free(vdev);
90 return ret;
91}
92
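/*
 * Illustrative example (not part of this patch): worked MSI-X decode
 * for a hypothetical device whose Message Control reads 0x0007 and
 * whose Table Offset/BIR register reads 0x00002003:
 *
 *	msix_bar    = 0x2003 &  PCI_MSIX_FLAGS_BIRMASK = 3      (BAR3)
 *	msix_offset = 0x2003 & ~PCI_MSIX_FLAGS_BIRMASK = 0x2000
 *	msix_size   = ((0x0007 & PCI_MSIX_FLAGS_QSIZE) + 1) * 16 = 128
 *
 * i.e. an 8-entry table, 16 bytes per entry, starting 0x2000 bytes into
 * BAR3, the range vfio_pci_mmap() later refuses to map.
 */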
93static void vfio_pci_disable(struct vfio_pci_device *vdev)
94{
95 int bar;
96
97 pci_disable_device(vdev->pdev);
98
99 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
100 VFIO_IRQ_SET_ACTION_TRIGGER,
101 vdev->irq_type, 0, 0, NULL);
102
103 vdev->virq_disabled = false;
104
105 vfio_config_free(vdev);
106
107 pci_reset_function(vdev->pdev);
108
109 if (pci_load_and_free_saved_state(vdev->pdev,
110 &vdev->pci_saved_state) == 0)
111 pci_restore_state(vdev->pdev);
112 else
113 pr_info("%s: Couldn't reload %s saved state\n",
114 __func__, dev_name(&vdev->pdev->dev));
115
116 for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
117 if (!vdev->barmap[bar])
118 continue;
119 pci_iounmap(vdev->pdev, vdev->barmap[bar]);
120 pci_release_selected_regions(vdev->pdev, 1 << bar);
121 vdev->barmap[bar] = NULL;
122 }
123}
124
125static void vfio_pci_release(void *device_data)
126{
127 struct vfio_pci_device *vdev = device_data;
128
129 if (atomic_dec_and_test(&vdev->refcnt))
130 vfio_pci_disable(vdev);
131
132 module_put(THIS_MODULE);
133}
134
135static int vfio_pci_open(void *device_data)
136{
137 struct vfio_pci_device *vdev = device_data;
138
139 if (!try_module_get(THIS_MODULE))
140 return -ENODEV;
141
142 if (atomic_inc_return(&vdev->refcnt) == 1) {
143 int ret = vfio_pci_enable(vdev);
144 if (ret) {
145 module_put(THIS_MODULE);
146 return ret;
147 }
148 }
149
150 return 0;
151}
152
153static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
154{
155 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
156 u8 pin;
157 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
158 if (pin)
159 return 1;
160
161 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
162 u8 pos;
163 u16 flags;
164
165 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
166 if (pos) {
167 pci_read_config_word(vdev->pdev,
168 pos + PCI_MSI_FLAGS, &flags);
169
170			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
171 }
172 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
173 u8 pos;
174 u16 flags;
175
176 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
177 if (pos) {
178 pci_read_config_word(vdev->pdev,
179 pos + PCI_MSIX_FLAGS, &flags);
180
181 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
182 }
183 }
184
185 return 0;
186}
187
188static long vfio_pci_ioctl(void *device_data,
189 unsigned int cmd, unsigned long arg)
190{
191 struct vfio_pci_device *vdev = device_data;
192 unsigned long minsz;
193
194 if (cmd == VFIO_DEVICE_GET_INFO) {
195 struct vfio_device_info info;
196
197 minsz = offsetofend(struct vfio_device_info, num_irqs);
198
199 if (copy_from_user(&info, (void __user *)arg, minsz))
200 return -EFAULT;
201
202 if (info.argsz < minsz)
203 return -EINVAL;
204
205 info.flags = VFIO_DEVICE_FLAGS_PCI;
206
207 if (vdev->reset_works)
208 info.flags |= VFIO_DEVICE_FLAGS_RESET;
209
210 info.num_regions = VFIO_PCI_NUM_REGIONS;
211 info.num_irqs = VFIO_PCI_NUM_IRQS;
212
213 return copy_to_user((void __user *)arg, &info, minsz);
214
215 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
216 struct pci_dev *pdev = vdev->pdev;
217 struct vfio_region_info info;
218
219 minsz = offsetofend(struct vfio_region_info, offset);
220
221 if (copy_from_user(&info, (void __user *)arg, minsz))
222 return -EFAULT;
223
224 if (info.argsz < minsz)
225 return -EINVAL;
226
227 switch (info.index) {
228 case VFIO_PCI_CONFIG_REGION_INDEX:
229 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
230 info.size = pdev->cfg_size;
231 info.flags = VFIO_REGION_INFO_FLAG_READ |
232 VFIO_REGION_INFO_FLAG_WRITE;
233 break;
234 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
235 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
236 info.size = pci_resource_len(pdev, info.index);
237 if (!info.size) {
238 info.flags = 0;
239 break;
240 }
241
242 info.flags = VFIO_REGION_INFO_FLAG_READ |
243 VFIO_REGION_INFO_FLAG_WRITE;
244 if (pci_resource_flags(pdev, info.index) &
245 IORESOURCE_MEM && info.size >= PAGE_SIZE)
246 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
247 break;
248 case VFIO_PCI_ROM_REGION_INDEX:
249 {
250 void __iomem *io;
251 size_t size;
252
253 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
254 info.flags = 0;
255
256 /* Report the BAR size, not the ROM size */
257 info.size = pci_resource_len(pdev, info.index);
258 if (!info.size)
259 break;
260
261 /* Is it really there? */
262 io = pci_map_rom(pdev, &size);
263 if (!io || !size) {
264 info.size = 0;
265 break;
266 }
267 pci_unmap_rom(pdev, io);
268
269 info.flags = VFIO_REGION_INFO_FLAG_READ;
270 break;
271 }
272 default:
273 return -EINVAL;
274 }
275
276 return copy_to_user((void __user *)arg, &info, minsz);
277
278 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
279 struct vfio_irq_info info;
280
281 minsz = offsetofend(struct vfio_irq_info, count);
282
283 if (copy_from_user(&info, (void __user *)arg, minsz))
284 return -EFAULT;
285
286 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
287 return -EINVAL;
288
289 info.flags = VFIO_IRQ_INFO_EVENTFD;
290
291 info.count = vfio_pci_get_irq_count(vdev, info.index);
292
293 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
294 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
295 VFIO_IRQ_INFO_AUTOMASKED);
296 else
297 info.flags |= VFIO_IRQ_INFO_NORESIZE;
298
299 return copy_to_user((void __user *)arg, &info, minsz);
300
301 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
302 struct vfio_irq_set hdr;
303 u8 *data = NULL;
304 int ret = 0;
305
306 minsz = offsetofend(struct vfio_irq_set, count);
307
308 if (copy_from_user(&hdr, (void __user *)arg, minsz))
309 return -EFAULT;
310
311 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
312 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
313 VFIO_IRQ_SET_ACTION_TYPE_MASK))
314 return -EINVAL;
315
316 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
317 size_t size;
318
319 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
320 size = sizeof(uint8_t);
321 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
322 size = sizeof(int32_t);
323 else
324 return -EINVAL;
325
326 if (hdr.argsz - minsz < hdr.count * size ||
327 hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
328 return -EINVAL;
329
330 data = kmalloc(hdr.count * size, GFP_KERNEL);
331 if (!data)
332 return -ENOMEM;
333
334 if (copy_from_user(data, (void __user *)(arg + minsz),
335 hdr.count * size)) {
336 kfree(data);
337 return -EFAULT;
338 }
339 }
340
341 mutex_lock(&vdev->igate);
342
343 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
344 hdr.start, hdr.count, data);
345
346 mutex_unlock(&vdev->igate);
347 kfree(data);
348
349 return ret;
350
351 } else if (cmd == VFIO_DEVICE_RESET)
352 return vdev->reset_works ?
353 pci_reset_function(vdev->pdev) : -EINVAL;
354
355 return -ENOTTY;
356}
357
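/*
 * Illustrative userspace sketch (not part of this patch): wiring an
 * eventfd to the device's INTx line with VFIO_DEVICE_SET_IRQS.  It
 * assumes "device" is a VFIO device fd already obtained through the
 * VFIO group/container API; error handling is minimal.
 */
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int wire_intx_eventfd(int device)
{
	struct vfio_irq_set *irq_set;
	size_t argsz = sizeof(*irq_set) + sizeof(int32_t);
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	irq_set = calloc(1, argsz);
	if (!irq_set)
		return -1;

	irq_set->argsz = argsz;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
	irq_set->start = 0;
	irq_set->count = 1;
	memcpy(irq_set->data, &efd, sizeof(int32_t));

	if (ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set)) {
		free(irq_set);
		return -1;
	}

	free(irq_set);
	return efd;	/* each read() on efd reports INTx assertions */
}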
358static ssize_t vfio_pci_read(void *device_data, char __user *buf,
359 size_t count, loff_t *ppos)
360{
361 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
362 struct vfio_pci_device *vdev = device_data;
363 struct pci_dev *pdev = vdev->pdev;
364
365 if (index >= VFIO_PCI_NUM_REGIONS)
366 return -EINVAL;
367
368 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
369 return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
370 else if (index == VFIO_PCI_ROM_REGION_INDEX)
371 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
372 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
373 return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
374 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
375 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
376
377 return -EINVAL;
378}
379
380static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
381 size_t count, loff_t *ppos)
382{
383 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
384 struct vfio_pci_device *vdev = device_data;
385 struct pci_dev *pdev = vdev->pdev;
386
387 if (index >= VFIO_PCI_NUM_REGIONS)
388 return -EINVAL;
389
390 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
391 return vfio_pci_config_readwrite(vdev, (char __user *)buf,
392 count, ppos, true);
393 else if (index == VFIO_PCI_ROM_REGION_INDEX)
394 return -EINVAL;
395 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
396 return vfio_pci_io_readwrite(vdev, (char __user *)buf,
397 count, ppos, true);
398 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
399 return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
400 count, ppos, true);
401 }
402
403 return -EINVAL;
404}
405
406static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
407{
408 struct vfio_pci_device *vdev = device_data;
409 struct pci_dev *pdev = vdev->pdev;
410 unsigned int index;
411 u64 phys_len, req_len, pgoff, req_start, phys;
412 int ret;
413
414 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
415
416 if (vma->vm_end < vma->vm_start)
417 return -EINVAL;
418 if ((vma->vm_flags & VM_SHARED) == 0)
419 return -EINVAL;
420 if (index >= VFIO_PCI_ROM_REGION_INDEX)
421 return -EINVAL;
422 if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
423 return -EINVAL;
424
425 phys_len = pci_resource_len(pdev, index);
426 req_len = vma->vm_end - vma->vm_start;
427 pgoff = vma->vm_pgoff &
428 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
429 req_start = pgoff << PAGE_SHIFT;
430
431 if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
432 return -EINVAL;
433
434 if (index == vdev->msix_bar) {
435 /*
436 * Disallow mmaps overlapping the MSI-X table; users don't
437 * get to touch this directly. We could find somewhere
438 * else to map the overlap, but page granularity is only
439 * a recommendation, not a requirement, so the user needs
440 * to know which bits are real. Requiring them to mmap
441 * around the table makes that clear.
442 */
443
444 /* If neither entirely above nor below, then it overlaps */
445 if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
446 req_start + req_len <= vdev->msix_offset))
447 return -EINVAL;
448 }
449
450 /*
451 * Even though we don't make use of the barmap for the mmap,
452 * we need to request the region and the barmap tracks that.
453 */
454 if (!vdev->barmap[index]) {
455 ret = pci_request_selected_regions(pdev,
456 1 << index, "vfio-pci");
457 if (ret)
458 return ret;
459
460 vdev->barmap[index] = pci_iomap(pdev, index, 0);
461 }
462
463 vma->vm_private_data = vdev;
464 vma->vm_flags |= (VM_IO | VM_RESERVED);
465 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
466
467 phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
468
469 return remap_pfn_range(vma, vma->vm_start, phys,
470 req_len, vma->vm_page_prot);
471}
472
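/*
 * Illustrative userspace sketch (not part of this patch): mmap'ing BAR0
 * through the fixed region offsets used above.  "device" is assumed to
 * be a VFIO device fd.  Sub-page BARs and the pages backing an MSI-X
 * table are refused by vfio_pci_mmap() and have to be accessed with
 * pread()/pwrite() at the region offset instead.
 */
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static void *map_bar0(int device, size_t *len)
{
	struct vfio_region_info info = { .argsz = sizeof(info) };
	void *map;

	info.index = VFIO_PCI_BAR0_REGION_INDEX;
	if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &info))
		return NULL;

	if (!(info.flags & VFIO_REGION_INFO_FLAG_MMAP))
		return NULL;

	/* info.offset encodes the region index in its upper bits */
	map = mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   device, info.offset);
	if (map == MAP_FAILED)
		return NULL;

	*len = info.size;
	return map;
}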
473static const struct vfio_device_ops vfio_pci_ops = {
474 .name = "vfio-pci",
475 .open = vfio_pci_open,
476 .release = vfio_pci_release,
477 .ioctl = vfio_pci_ioctl,
478 .read = vfio_pci_read,
479 .write = vfio_pci_write,
480 .mmap = vfio_pci_mmap,
481};
482
483static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
484{
485 u8 type;
486 struct vfio_pci_device *vdev;
487 struct iommu_group *group;
488 int ret;
489
490 pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
491 if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
492 return -EINVAL;
493
494 group = iommu_group_get(&pdev->dev);
495 if (!group)
496 return -EINVAL;
497
498 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
499 if (!vdev) {
500 iommu_group_put(group);
501 return -ENOMEM;
502 }
503
504 vdev->pdev = pdev;
505 vdev->irq_type = VFIO_PCI_NUM_IRQS;
506 mutex_init(&vdev->igate);
507 spin_lock_init(&vdev->irqlock);
508 atomic_set(&vdev->refcnt, 0);
509
510 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
511 if (ret) {
512 iommu_group_put(group);
513 kfree(vdev);
514 }
515
516 return ret;
517}
518
519static void vfio_pci_remove(struct pci_dev *pdev)
520{
521 struct vfio_pci_device *vdev;
522
523 vdev = vfio_del_group_dev(&pdev->dev);
524 if (!vdev)
525 return;
526
527 iommu_group_put(pdev->dev.iommu_group);
528 kfree(vdev);
529}
530
531static struct pci_driver vfio_pci_driver = {
532 .name = "vfio-pci",
533 .id_table = NULL, /* only dynamic ids */
534 .probe = vfio_pci_probe,
535 .remove = vfio_pci_remove,
536};
537
538static void __exit vfio_pci_cleanup(void)
539{
540 pci_unregister_driver(&vfio_pci_driver);
541 vfio_pci_virqfd_exit();
542 vfio_pci_uninit_perm_bits();
543}
544
545static int __init vfio_pci_init(void)
546{
547 int ret;
548
549	/* Allocate shared config space permission data used by all devices */
550 ret = vfio_pci_init_perm_bits();
551 if (ret)
552 return ret;
553
554 /* Start the virqfd cleanup handler */
555 ret = vfio_pci_virqfd_init();
556 if (ret)
557 goto out_virqfd;
558
559 /* Register and scan for devices */
560 ret = pci_register_driver(&vfio_pci_driver);
561 if (ret)
562 goto out_driver;
563
564 return 0;
565
566out_driver:
567	vfio_pci_virqfd_exit();
568out_virqfd:
569	vfio_pci_uninit_perm_bits();
570 return ret;
571}
572
573module_init(vfio_pci_init);
574module_exit(vfio_pci_cleanup);
575
576MODULE_VERSION(DRIVER_VERSION);
577MODULE_LICENSE("GPL v2");
578MODULE_AUTHOR(DRIVER_AUTHOR);
579MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 000000000000..8b8f7d11e102
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1540 @@
1/*
2 * VFIO PCI config space virtualization
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16/*
17 * This code handles reading and writing of PCI configuration registers.
18 * This is hairy because we want to allow a lot of flexibility to the
19 * user driver, but cannot trust it with all of the config fields.
20 * Tables determine which fields can be read and written, as well as
21 * which fields are 'virtualized' - special actions and translations to
22 * make it appear to the user that he has control, when in fact things
23 * must be negotiated with the underlying OS.
24 */
25
26#include <linux/fs.h>
27#include <linux/pci.h>
28#include <linux/uaccess.h>
29#include <linux/vfio.h>
30
31#include "vfio_pci_private.h"
32
33#define PCI_CFG_SPACE_SIZE 256
34
35/* Useful "pseudo" capabilities */
36#define PCI_CAP_ID_BASIC 0
37#define PCI_CAP_ID_INVALID 0xFF
38
39#define is_bar(offset) \
40 ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \
41 (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4))
42
43/*
44 * Lengths of PCI Config Capabilities
45 * 0: Removed from the user visible capability list
46 * FF: Variable length
47 */
48static u8 pci_cap_length[] = {
49 [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */
50 [PCI_CAP_ID_PM] = PCI_PM_SIZEOF,
51 [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF,
52 [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF,
53 [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */
54 [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */
55 [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */
56 [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */
57 [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */
58 [PCI_CAP_ID_VNDR] = 0xFF, /* variable */
59 [PCI_CAP_ID_DBG] = 0, /* debug - don't care */
60 [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */
61 [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */
62 [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */
63 [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */
64 [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */
65 [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */
66 [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF,
67 [PCI_CAP_ID_SATA] = 0xFF,
68 [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF,
69};
70
71/*
72 * Lengths of PCIe/PCI-X Extended Config Capabilities
73 * 0: Removed or masked from the user visible capability list
74 * FF: Variable length
75 */
76static u16 pci_ext_cap_length[] = {
77 [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND,
78 [PCI_EXT_CAP_ID_VC] = 0xFF,
79 [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF,
80 [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF,
81 [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */
82 [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */
83 [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */
84 [PCI_EXT_CAP_ID_MFVC] = 0xFF,
85 [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */
86 [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */
87 [PCI_EXT_CAP_ID_VNDR] = 0xFF,
88 [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */
89 [PCI_EXT_CAP_ID_ACS] = 0xFF,
90 [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF,
91 [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF,
92 [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF,
93 [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */
94 [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
95 [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF,
96 [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */
97 [PCI_EXT_CAP_ID_REBAR] = 0xFF,
98 [PCI_EXT_CAP_ID_DPA] = 0xFF,
99 [PCI_EXT_CAP_ID_TPH] = 0xFF,
100 [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF,
101 [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */
102 [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */
103 [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */
104};
105
106/*
107 * Read/Write Permission Bits - one bit for each bit in capability
108 * Any field can be read if it exists, but what is read depends on
109 * whether the field is 'virtualized', or just pass thru to the
110 * hardware. Any virtualized field is also virtualized for writes.
111 * Writes are only permitted if they have a 1 bit here.
112 */
113struct perm_bits {
114 u8 *virt; /* read/write virtual data, not hw */
115 u8 *write; /* writeable bits */
116 int (*readfn)(struct vfio_pci_device *vdev, int pos, int count,
117 struct perm_bits *perm, int offset, __le32 *val);
118 int (*writefn)(struct vfio_pci_device *vdev, int pos, int count,
119 struct perm_bits *perm, int offset, __le32 val);
120};
121
122#define NO_VIRT 0
123#define ALL_VIRT 0xFFFFFFFFU
124#define NO_WRITE 0
125#define ALL_WRITE 0xFFFFFFFFU
126
127static int vfio_user_config_read(struct pci_dev *pdev, int offset,
128 __le32 *val, int count)
129{
130 int ret = -EINVAL;
131 u32 tmp_val = 0;
132
133 switch (count) {
134 case 1:
135 {
136 u8 tmp;
137 ret = pci_user_read_config_byte(pdev, offset, &tmp);
138 tmp_val = tmp;
139 break;
140 }
141 case 2:
142 {
143 u16 tmp;
144 ret = pci_user_read_config_word(pdev, offset, &tmp);
145 tmp_val = tmp;
146 break;
147 }
148 case 4:
149 ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
150 break;
151 }
152
153 *val = cpu_to_le32(tmp_val);
154
155 return pcibios_err_to_errno(ret);
156}
157
158static int vfio_user_config_write(struct pci_dev *pdev, int offset,
159 __le32 val, int count)
160{
161 int ret = -EINVAL;
162 u32 tmp_val = le32_to_cpu(val);
163
164 switch (count) {
165 case 1:
166 ret = pci_user_write_config_byte(pdev, offset, tmp_val);
167 break;
168 case 2:
169 ret = pci_user_write_config_word(pdev, offset, tmp_val);
170 break;
171 case 4:
172 ret = pci_user_write_config_dword(pdev, offset, tmp_val);
173 break;
174 }
175
176 return pcibios_err_to_errno(ret);
177}
178
179static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
180 int count, struct perm_bits *perm,
181 int offset, __le32 *val)
182{
183 __le32 virt = 0;
184
185 memcpy(val, vdev->vconfig + pos, count);
186
187 memcpy(&virt, perm->virt + offset, count);
188
189 /* Any non-virtualized bits? */
190 if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
191 struct pci_dev *pdev = vdev->pdev;
192 __le32 phys_val = 0;
193 int ret;
194
195 ret = vfio_user_config_read(pdev, pos, &phys_val, count);
196 if (ret)
197 return ret;
198
199 *val = (phys_val & ~virt) | (*val & virt);
200 }
201
202 return count;
203}
204
205static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos,
206 int count, struct perm_bits *perm,
207 int offset, __le32 val)
208{
209 __le32 virt = 0, write = 0;
210
211 memcpy(&write, perm->write + offset, count);
212
213 if (!write)
214 return count; /* drop, no writable bits */
215
216 memcpy(&virt, perm->virt + offset, count);
217
218 /* Virtualized and writable bits go to vconfig */
219 if (write & virt) {
220 __le32 virt_val = 0;
221
222 memcpy(&virt_val, vdev->vconfig + pos, count);
223
224 virt_val &= ~(write & virt);
225 virt_val |= (val & (write & virt));
226
227 memcpy(vdev->vconfig + pos, &virt_val, count);
228 }
229
230	/* Non-virtualized and writable bits go to hardware */
231 if (write & ~virt) {
232 struct pci_dev *pdev = vdev->pdev;
233 __le32 phys_val = 0;
234 int ret;
235
236 ret = vfio_user_config_read(pdev, pos, &phys_val, count);
237 if (ret)
238 return ret;
239
240 phys_val &= ~(write & ~virt);
241 phys_val |= (val & (write & ~virt));
242
243 ret = vfio_user_config_write(pdev, pos, phys_val, count);
244 if (ret)
245 return ret;
246 }
247
248 return count;
249}
250
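/*
 * Illustrative example (not part of this patch): with the basic-header
 * permissions defined later in this file, PCI_COMMAND is fully writable
 * but only INTX_DISABLE is virtualized.  A 16-bit read at PCI_COMMAND
 * therefore returns
 *
 *	(phys_val & ~virt) | (vconfig_val & virt)
 *
 * i.e. every bit is read live from hardware except INTX_DISABLE, which
 * comes from vconfig so that vfio can mask INTx behind the user's back.
 * A write splits the same way: INTX_DISABLE lands only in vconfig (and
 * is emulated in vfio_basic_config_write()), while the remaining
 * writable bits are read-modify-written to the device.
 */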
251/* Allow direct read from hardware, except for capability next pointer */
252static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
253 int count, struct perm_bits *perm,
254 int offset, __le32 *val)
255{
256 int ret;
257
258 ret = vfio_user_config_read(vdev->pdev, pos, val, count);
259 if (ret)
260 return pcibios_err_to_errno(ret);
261
262 if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
263 if (offset < 4)
264 memcpy(val, vdev->vconfig + pos, count);
265 } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
266 if (offset == PCI_CAP_LIST_ID && count > 1)
267 memcpy(val, vdev->vconfig + pos,
268 min(PCI_CAP_FLAGS, count));
269 else if (offset == PCI_CAP_LIST_NEXT)
270 memcpy(val, vdev->vconfig + pos, 1);
271 }
272
273 return count;
274}
275
276static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
277 int count, struct perm_bits *perm,
278 int offset, __le32 val)
279{
280 int ret;
281
282 ret = vfio_user_config_write(vdev->pdev, pos, val, count);
283 if (ret)
284 return ret;
285
286 return count;
287}
288
289/* Default all regions to read-only, no-virtualization */
290static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
291 [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
292};
293static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
294 [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
295};
296
297static void free_perm_bits(struct perm_bits *perm)
298{
299 kfree(perm->virt);
300 kfree(perm->write);
301 perm->virt = NULL;
302 perm->write = NULL;
303}
304
305static int alloc_perm_bits(struct perm_bits *perm, int size)
306{
307 /*
308	 * Round up all permission bits to the next dword; this lets us
309 * ignore whether a read/write exceeds the defined capability
310 * structure. We can do this because:
311 * - Standard config space is already dword aligned
312	 * - Capabilities are all dword aligned (bits 0:1 of next reserved)
313 * - Express capabilities defined as dword aligned
314 */
315 size = round_up(size, 4);
316
317 /*
318 * Zero state is
319 * - All Readable, None Writeable, None Virtualized
320 */
321 perm->virt = kzalloc(size, GFP_KERNEL);
322 perm->write = kzalloc(size, GFP_KERNEL);
323 if (!perm->virt || !perm->write) {
324 free_perm_bits(perm);
325 return -ENOMEM;
326 }
327
328 perm->readfn = vfio_default_config_read;
329 perm->writefn = vfio_default_config_write;
330
331 return 0;
332}
333
334/*
335 * Helper functions for filling in permission tables
336 */
337static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
338{
339 p->virt[off] = virt;
340 p->write[off] = write;
341}
342
343/* Handle endian-ness - pci and tables are little-endian */
344static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
345{
346 *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
347 *(__le16 *)(&p->write[off]) = cpu_to_le16(write);
348}
349
350/* Handle endian-ness - pci and tables are little-endian */
351static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
352{
353 *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
354 *(__le32 *)(&p->write[off]) = cpu_to_le32(write);
355}
356
357/*
358 * Restore the *real* BARs after we detect a FLR or backdoor reset.
359 * (backdoor = some device specific technique that we didn't catch)
360 */
361static void vfio_bar_restore(struct vfio_pci_device *vdev)
362{
363 struct pci_dev *pdev = vdev->pdev;
364 u32 *rbar = vdev->rbar;
365 int i;
366
367 if (pdev->is_virtfn)
368 return;
369
370 pr_info("%s: %s reset recovery - restoring bars\n",
371 __func__, dev_name(&pdev->dev));
372
373 for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
374 pci_user_write_config_dword(pdev, i, *rbar);
375
376 pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
377}
378
379static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
380{
381 unsigned long flags = pci_resource_flags(pdev, bar);
382 u32 val;
383
384 if (flags & IORESOURCE_IO)
385 return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
386
387 val = PCI_BASE_ADDRESS_SPACE_MEMORY;
388
389 if (flags & IORESOURCE_PREFETCH)
390 val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
391
392 if (flags & IORESOURCE_MEM_64)
393 val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
394
395 return cpu_to_le32(val);
396}
397
398/*
399 * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
400 * to reflect the hardware capabilities. This implements BAR sizing.
401 */
402static void vfio_bar_fixup(struct vfio_pci_device *vdev)
403{
404 struct pci_dev *pdev = vdev->pdev;
405 int i;
406 __le32 *bar;
407 u64 mask;
408
409 bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
410
411 for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
412 if (!pci_resource_start(pdev, i)) {
413 *bar = 0; /* Unmapped by host = unimplemented to user */
414 continue;
415 }
416
417 mask = ~(pci_resource_len(pdev, i) - 1);
418
419 *bar &= cpu_to_le32((u32)mask);
420 *bar |= vfio_generate_bar_flags(pdev, i);
421
422 if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
423 bar++;
424 *bar &= cpu_to_le32((u32)(mask >> 32));
425 i++;
426 }
427 }
428
429 bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
430
431 /*
432 * NB. we expose the actual BAR size here, regardless of whether
433 * we can read it. When we report the REGION_INFO for the ROM
434 * we report what PCI tells us is the actual ROM size.
435 */
436 if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
437 mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
438 mask |= PCI_ROM_ADDRESS_ENABLE;
439 *bar &= cpu_to_le32((u32)mask);
440 } else
441 *bar = 0;
442
443 vdev->bardirty = false;
444}
445
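/*
 * Illustrative userspace sketch (not part of this patch): sizing BAR0
 * through the virtualized config space exactly as on bare metal.  The
 * write only reaches vconfig (the BARs are ALL_VIRT below) and the next
 * read passes through vfio_bar_fixup(), which folds the hardware size
 * mask back in.  "device" is a VFIO device fd and "cfg_off" the config
 * region offset from VFIO_DEVICE_GET_REGION_INFO; error checks and
 * 64-bit/IO BAR handling are omitted.
 */
#include <stdint.h>
#include <unistd.h>

static uint32_t size_bar0(int device, off_t cfg_off)
{
	uint32_t orig, ones = ~0U, mask;
	off_t bar0 = cfg_off + 0x10;		/* PCI_BASE_ADDRESS_0 */

	pread(device, &orig, 4, bar0);
	pwrite(device, &ones, 4, bar0);
	pread(device, &mask, 4, bar0);
	pwrite(device, &orig, 4, bar0);		/* restore */

	mask &= ~0xfU;				/* drop the flag bits */
	return mask ? ~mask + 1 : 0;		/* e.g. 0xfffff000 -> 4K */
}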
446static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
447 int count, struct perm_bits *perm,
448 int offset, __le32 *val)
449{
450 if (is_bar(offset)) /* pos == offset for basic config */
451 vfio_bar_fixup(vdev);
452
453 count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
454
455 /* Mask in virtual memory enable for SR-IOV devices */
456 if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
457 u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
458 u32 tmp_val = le32_to_cpu(*val);
459
460 tmp_val |= cmd & PCI_COMMAND_MEMORY;
461 *val = cpu_to_le32(tmp_val);
462 }
463
464 return count;
465}
466
467static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
468 int count, struct perm_bits *perm,
469 int offset, __le32 val)
470{
471 struct pci_dev *pdev = vdev->pdev;
472 __le16 *virt_cmd;
473 u16 new_cmd = 0;
474 int ret;
475
476 virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND];
477
478 if (offset == PCI_COMMAND) {
479 bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io;
480 u16 phys_cmd;
481
482 ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd);
483 if (ret)
484 return ret;
485
486 new_cmd = le32_to_cpu(val);
487
488 phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY);
489 virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
490 new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
491
492 phys_io = !!(phys_cmd & PCI_COMMAND_IO);
493 virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO);
494 new_io = !!(new_cmd & PCI_COMMAND_IO);
495
496 /*
497 * If the user is writing mem/io enable (new_mem/io) and we
498 * think it's already enabled (virt_mem/io), but the hardware
499	 * shows it disabled (phys_mem/io), then the device has
500 * undergone some kind of backdoor reset and needs to be
501 * restored before we allow it to enable the bars.
502 * SR-IOV devices will trigger this, but we catch them later
503 */
504 if ((new_mem && virt_mem && !phys_mem) ||
505 (new_io && virt_io && !phys_io))
506 vfio_bar_restore(vdev);
507 }
508
509 count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
510 if (count < 0)
511 return count;
512
513 /*
514 * Save current memory/io enable bits in vconfig to allow for
515 * the test above next time.
516 */
517 if (offset == PCI_COMMAND) {
518 u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO;
519
520 *virt_cmd &= cpu_to_le16(~mask);
521 *virt_cmd |= cpu_to_le16(new_cmd & mask);
522 }
523
524 /* Emulate INTx disable */
525 if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) {
526 bool virt_intx_disable;
527
528 virt_intx_disable = !!(le16_to_cpu(*virt_cmd) &
529 PCI_COMMAND_INTX_DISABLE);
530
531 if (virt_intx_disable && !vdev->virq_disabled) {
532 vdev->virq_disabled = true;
533 vfio_pci_intx_mask(vdev);
534 } else if (!virt_intx_disable && vdev->virq_disabled) {
535 vdev->virq_disabled = false;
536 vfio_pci_intx_unmask(vdev);
537 }
538 }
539
540 if (is_bar(offset))
541 vdev->bardirty = true;
542
543 return count;
544}
545
546/* Permissions for the Basic PCI Header */
547static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
548{
549 if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF))
550 return -ENOMEM;
551
552 perm->readfn = vfio_basic_config_read;
553 perm->writefn = vfio_basic_config_write;
554
555 /* Virtualized for SR-IOV functions, which just have FFFF */
556 p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE);
557 p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE);
558
559 /*
560 * Virtualize INTx disable, we use it internally for interrupt
561 * control and can emulate it for non-PCI 2.3 devices.
562 */
563 p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE);
564
565 /* Virtualize capability list, we might want to skip/disable */
566 p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE);
567
568 /* No harm to write */
569 p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE);
570 p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE);
571 p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE);
572
573 /* Virtualize all bars, can't touch the real ones */
574 p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE);
575 p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE);
576 p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE);
577 p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE);
578 p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE);
579 p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE);
580 p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE);
581
582 /* Allow us to adjust capability chain */
583 p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE);
584
585 /* Sometimes used by sw, just virtualize */
586 p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE);
587 return 0;
588}
589
590/* Permissions for the Power Management capability */
591static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
592{
593 if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
594 return -ENOMEM;
595
596 /*
597 * We always virtualize the next field so we can remove
598 * capabilities from the chain if we want to.
599 */
600 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
601
602 /*
603 * Power management is defined *per function*,
604 * so we let the user write this
605 */
606 p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE);
607 return 0;
608}
609
610/* Permissions for PCI-X capability */
611static int __init init_pci_cap_pcix_perm(struct perm_bits *perm)
612{
613 /* Alloc 24, but only 8 are used in v0 */
614 if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2))
615 return -ENOMEM;
616
617 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
618
619 p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE);
620 p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE);
621 return 0;
622}
623
624/* Permissions for PCI Express capability */
625static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
626{
627 /* Alloc larger of two possible sizes */
628 if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
629 return -ENOMEM;
630
631 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
632
633 /*
634 * Allow writes to device control fields (includes FLR!)
635 * but not to devctl_phantom which could confuse IOMMU
636 * or to the ARI bit in devctl2 which is set at probe time
637 */
638 p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM);
639 p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
640 return 0;
641}
642
643/* Permissions for Advanced Function capability */
644static int __init init_pci_cap_af_perm(struct perm_bits *perm)
645{
646 if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
647 return -ENOMEM;
648
649 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
650 p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR);
651 return 0;
652}
653
654/* Permissions for Advanced Error Reporting extended capability */
655static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm)
656{
657 u32 mask;
658
659 if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR]))
660 return -ENOMEM;
661
662 /*
663 * Virtualize the first dword of all express capabilities
664 * because it includes the next pointer. This lets us later
665 * remove capabilities from the chain if we need to.
666 */
667 p_setd(perm, 0, ALL_VIRT, NO_WRITE);
668
669 /* Writable bits mask */
670 mask = PCI_ERR_UNC_TRAIN | /* Training */
671 PCI_ERR_UNC_DLP | /* Data Link Protocol */
672 PCI_ERR_UNC_SURPDN | /* Surprise Down */
673 PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */
674 PCI_ERR_UNC_FCP | /* Flow Control Protocol */
675 PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */
676 PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */
677 PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */
678 PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */
679 PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */
680 PCI_ERR_UNC_ECRC | /* ECRC Error Status */
681 PCI_ERR_UNC_UNSUP | /* Unsupported Request */
682 PCI_ERR_UNC_ACSV | /* ACS Violation */
683 PCI_ERR_UNC_INTN | /* internal error */
684 PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */
685 PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */
686 PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */
687 p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask);
688 p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask);
689 p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask);
690
691 mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */
692 PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */
693 PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */
694 PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */
695 PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */
696 PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */
697 PCI_ERR_COR_INTERNAL | /* Corrected Internal */
698 PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */
699 p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask);
700 p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask);
701
702 mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */
703 PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */
704 p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask);
705 return 0;
706}
707
708/* Permissions for Power Budgeting extended capability */
709static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm)
710{
711 if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR]))
712 return -ENOMEM;
713
714 p_setd(perm, 0, ALL_VIRT, NO_WRITE);
715
716 /* Writing the data selector is OK, the info is still read-only */
717 p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE);
718 return 0;
719}
720
721/*
722 * Initialize the shared permission tables
723 */
724void vfio_pci_uninit_perm_bits(void)
725{
726 free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]);
727
728 free_perm_bits(&cap_perms[PCI_CAP_ID_PM]);
729 free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]);
730 free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]);
731 free_perm_bits(&cap_perms[PCI_CAP_ID_AF]);
732
733 free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
734 free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
735}
736
737int __init vfio_pci_init_perm_bits(void)
738{
739 int ret;
740
741 /* Basic config space */
742 ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]);
743
744 /* Capabilities */
745 ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
746 cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
747 ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
748 cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
749 ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
750 ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
751
752 /* Extended capabilities */
753 ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
754 ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
755 ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
756
757 if (ret)
758 vfio_pci_uninit_perm_bits();
759
760 return ret;
761}
762
763static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
764{
765 u8 cap;
766 int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
767 PCI_STD_HEADER_SIZEOF;
768 base /= 4;
769 pos /= 4;
770
771 cap = vdev->pci_config_map[pos];
772
773 if (cap == PCI_CAP_ID_BASIC)
774 return 0;
775
776	/* XXX Can we have two abutting capabilities of the same type? */
777 while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
778 pos--;
779
780 return pos * 4;
781}
782
783static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
784 int count, struct perm_bits *perm,
785 int offset, __le32 *val)
786{
787 /* Update max available queue size from msi_qmax */
788 if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
789 __le16 *flags;
790 int start;
791
792 start = vfio_find_cap_start(vdev, pos);
793
794 flags = (__le16 *)&vdev->vconfig[start];
795
796 *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK);
797 *flags |= cpu_to_le16(vdev->msi_qmax << 1);
798 }
799
800 return vfio_default_config_read(vdev, pos, count, perm, offset, val);
801}
802
803static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos,
804 int count, struct perm_bits *perm,
805 int offset, __le32 val)
806{
807 count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
808 if (count < 0)
809 return count;
810
811 /* Fixup and write configured queue size and enable to hardware */
812 if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
813 __le16 *pflags;
814 u16 flags;
815 int start, ret;
816
817 start = vfio_find_cap_start(vdev, pos);
818
819 pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
820
821 flags = le16_to_cpu(*pflags);
822
823 /* MSI is enabled via ioctl */
824 if (!is_msi(vdev))
825 flags &= ~PCI_MSI_FLAGS_ENABLE;
826
827 /* Check queue size */
828 if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) {
829 flags &= ~PCI_MSI_FLAGS_QSIZE;
830 flags |= vdev->msi_qmax << 4;
831 }
832
833 /* Write back to virt and to hardware */
834 *pflags = cpu_to_le16(flags);
835 ret = pci_user_write_config_word(vdev->pdev,
836 start + PCI_MSI_FLAGS,
837 flags);
838 if (ret)
839 return pcibios_err_to_errno(ret);
840 }
841
842 return count;
843}
844
845/*
846 * MSI determination is per-device, so this routine gets used beyond
847 * initialization time. Don't add __init
848 */
849static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags)
850{
851 if (alloc_perm_bits(perm, len))
852 return -ENOMEM;
853
854 perm->readfn = vfio_msi_config_read;
855 perm->writefn = vfio_msi_config_write;
856
857 p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
858
859 /*
860 * The upper byte of the control register is reserved,
861 * just setup the lower byte.
862 */
863 p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE);
864 p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE);
865 if (flags & PCI_MSI_FLAGS_64BIT) {
866 p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE);
867 p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE);
868 if (flags & PCI_MSI_FLAGS_MASKBIT) {
869 p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE);
870 p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE);
871 }
872 } else {
873 p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE);
874 if (flags & PCI_MSI_FLAGS_MASKBIT) {
875 p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE);
876 p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE);
877 }
878 }
879 return 0;
880}
881
882/* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */
883static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos)
884{
885 struct pci_dev *pdev = vdev->pdev;
886 int len, ret;
887 u16 flags;
888
889 ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
890 if (ret)
891 return pcibios_err_to_errno(ret);
892
893 len = 10; /* Minimum size */
894 if (flags & PCI_MSI_FLAGS_64BIT)
895 len += 4;
896 if (flags & PCI_MSI_FLAGS_MASKBIT)
897 len += 10;
898
899 if (vdev->msi_perm)
900 return len;
901
902 vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL);
903 if (!vdev->msi_perm)
904 return -ENOMEM;
905
906 ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
907 if (ret)
908 return ret;
909
910 return len;
911}
912
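/*
 * Illustrative example (not part of this patch): a device advertising
 * 64-bit addressing and per-vector masking (PCI_MSI_FLAGS_64BIT |
 * PCI_MSI_FLAGS_MASKBIT) grows from the 10-byte minimum to
 * 10 + 4 (upper address dword) + 10 (data moves up, plus the mask and
 * pending dwords) = 24 bytes, the largest MSI layout.
 */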
913/* Determine extended capability length for VC (2 & 9) and MFVC */
914static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos)
915{
916 struct pci_dev *pdev = vdev->pdev;
917 u32 tmp;
918 int ret, evcc, phases, vc_arb;
919 int len = PCI_CAP_VC_BASE_SIZEOF;
920
921 ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp);
922 if (ret)
923 return pcibios_err_to_errno(ret);
924
925 evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */
926 ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp);
927 if (ret)
928 return pcibios_err_to_errno(ret);
929
930 if (tmp & PCI_VC_REG2_128_PHASE)
931 phases = 128;
932 else if (tmp & PCI_VC_REG2_64_PHASE)
933 phases = 64;
934 else if (tmp & PCI_VC_REG2_32_PHASE)
935 phases = 32;
936 else
937 phases = 0;
938
939 vc_arb = phases * 4;
940
941 /*
942 * Port arbitration tables are root & switch only;
943 * function arbitration tables are function 0 only.
944 * In either case, we'll never let user write them so
945 * we don't care how big they are
946 */
947 len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF;
948 if (vc_arb) {
949 len = round_up(len, 16);
950 len += vc_arb / 8;
951 }
952 return len;
953}
954
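/*
 * Illustrative example (not part of this patch): a function reporting
 * one extended VC (evcc = 1) and a 32-phase function arbitration table
 * yields
 *
 *	len = PCI_CAP_VC_BASE_SIZEOF + (1 + 1) * PCI_CAP_VC_PER_VC_SIZEOF
 *
 * and, since vc_arb = 32 * 4 = 128 bits, len is then rounded up to a
 * 16-byte boundary and grows by 128 / 8 = 16 bytes of arbitration table.
 */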
955static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
956{
957 struct pci_dev *pdev = vdev->pdev;
958 u16 word;
959 u8 byte;
960 int ret;
961
962 switch (cap) {
963 case PCI_CAP_ID_MSI:
964 return vfio_msi_cap_len(vdev, pos);
965 case PCI_CAP_ID_PCIX:
966 ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word);
967 if (ret)
968 return pcibios_err_to_errno(ret);
969
970 if (PCI_X_CMD_VERSION(word)) {
971 vdev->extended_caps = true;
972 return PCI_CAP_PCIX_SIZEOF_V2;
973 } else
974 return PCI_CAP_PCIX_SIZEOF_V0;
975 case PCI_CAP_ID_VNDR:
976 /* length follows next field */
977 ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte);
978 if (ret)
979 return pcibios_err_to_errno(ret);
980
981 return byte;
982 case PCI_CAP_ID_EXP:
983 /* length based on version */
984 ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
985 if (ret)
986 return pcibios_err_to_errno(ret);
987
988 if ((word & PCI_EXP_FLAGS_VERS) == 1)
989 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
990 else {
991 vdev->extended_caps = true;
992 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
993 }
994 case PCI_CAP_ID_HT:
995 ret = pci_read_config_byte(pdev, pos + 3, &byte);
996 if (ret)
997 return pcibios_err_to_errno(ret);
998
999 return (byte & HT_3BIT_CAP_MASK) ?
1000 HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG;
1001 case PCI_CAP_ID_SATA:
1002 ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte);
1003 if (ret)
1004 return pcibios_err_to_errno(ret);
1005
1006 byte &= PCI_SATA_REGS_MASK;
1007 if (byte == PCI_SATA_REGS_INLINE)
1008 return PCI_SATA_SIZEOF_LONG;
1009 else
1010 return PCI_SATA_SIZEOF_SHORT;
1011 default:
1012 pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n",
1013 dev_name(&pdev->dev), __func__, cap, pos);
1014 }
1015
1016 return 0;
1017}
1018
1019static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos)
1020{
1021 struct pci_dev *pdev = vdev->pdev;
1022 u8 byte;
1023 u32 dword;
1024 int ret;
1025
1026 switch (ecap) {
1027 case PCI_EXT_CAP_ID_VNDR:
1028 ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
1029 if (ret)
1030 return pcibios_err_to_errno(ret);
1031
1032 return dword >> PCI_VSEC_HDR_LEN_SHIFT;
1033 case PCI_EXT_CAP_ID_VC:
1034 case PCI_EXT_CAP_ID_VC9:
1035 case PCI_EXT_CAP_ID_MFVC:
1036 return vfio_vc_cap_len(vdev, epos);
1037 case PCI_EXT_CAP_ID_ACS:
1038 ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte);
1039 if (ret)
1040 return pcibios_err_to_errno(ret);
1041
1042 if (byte & PCI_ACS_EC) {
1043 int bits;
1044
1045 ret = pci_read_config_byte(pdev,
1046 epos + PCI_ACS_EGRESS_BITS,
1047 &byte);
1048 if (ret)
1049 return pcibios_err_to_errno(ret);
1050
1051 bits = byte ? round_up(byte, 32) : 256;
1052 return 8 + (bits / 8);
1053 }
1054 return 8;
1055
1056 case PCI_EXT_CAP_ID_REBAR:
1057 ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte);
1058 if (ret)
1059 return pcibios_err_to_errno(ret);
1060
1061 byte &= PCI_REBAR_CTRL_NBAR_MASK;
1062 byte >>= PCI_REBAR_CTRL_NBAR_SHIFT;
1063
1064 return 4 + (byte * 8);
1065 case PCI_EXT_CAP_ID_DPA:
1066 ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte);
1067 if (ret)
1068 return pcibios_err_to_errno(ret);
1069
1070 byte &= PCI_DPA_CAP_SUBSTATE_MASK;
1071 byte = round_up(byte + 1, 4);
1072 return PCI_DPA_BASE_SIZEOF + byte;
1073 case PCI_EXT_CAP_ID_TPH:
1074 ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword);
1075 if (ret)
1076 return pcibios_err_to_errno(ret);
1077
1078 if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) {
1079 int sts;
1080
1081			sts = dword & PCI_TPH_CAP_ST_MASK;
1082 sts >>= PCI_TPH_CAP_ST_SHIFT;
1083 return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4);
1084 }
1085 return PCI_TPH_BASE_SIZEOF;
1086 default:
1087 pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n",
1088 dev_name(&pdev->dev), __func__, ecap, epos);
1089 }
1090
1091 return 0;
1092}
1093
1094static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev,
1095 int offset, int size)
1096{
1097 struct pci_dev *pdev = vdev->pdev;
1098 int ret = 0;
1099
1100 /*
1101 * We try to read physical config space in the largest chunks
1102 * we can, assuming that all of the fields support dword access.
1103 * pci_save_state() makes this same assumption and seems to do ok.
1104 */
1105 while (size) {
1106 int filled;
1107
1108 if (size >= 4 && !(offset % 4)) {
1109 __le32 *dwordp = (__le32 *)&vdev->vconfig[offset];
1110 u32 dword;
1111
1112 ret = pci_read_config_dword(pdev, offset, &dword);
1113 if (ret)
1114 return ret;
1115 *dwordp = cpu_to_le32(dword);
1116 filled = 4;
1117 } else if (size >= 2 && !(offset % 2)) {
1118 __le16 *wordp = (__le16 *)&vdev->vconfig[offset];
1119 u16 word;
1120
1121 ret = pci_read_config_word(pdev, offset, &word);
1122 if (ret)
1123 return ret;
1124 *wordp = cpu_to_le16(word);
1125 filled = 2;
1126 } else {
1127 u8 *byte = &vdev->vconfig[offset];
1128 ret = pci_read_config_byte(pdev, offset, byte);
1129 if (ret)
1130 return ret;
1131 filled = 1;
1132 }
1133
1134 offset += filled;
1135 size -= filled;
1136 }
1137
1138 return ret;
1139}
1140
1141static int vfio_cap_init(struct vfio_pci_device *vdev)
1142{
1143 struct pci_dev *pdev = vdev->pdev;
1144 u8 *map = vdev->pci_config_map;
1145 u16 status;
1146 u8 pos, *prev, cap;
1147 int loops, ret, caps = 0;
1148
1149 /* Any capabilities? */
1150 ret = pci_read_config_word(pdev, PCI_STATUS, &status);
1151 if (ret)
1152 return ret;
1153
1154 if (!(status & PCI_STATUS_CAP_LIST))
1155 return 0; /* Done */
1156
1157 ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
1158 if (ret)
1159 return ret;
1160
1161 /* Mark the previous position in case we want to skip a capability */
1162 prev = &vdev->vconfig[PCI_CAPABILITY_LIST];
1163
1164 /* We can bound our loop, capabilities are dword aligned */
1165 loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF;
1166 while (pos && loops--) {
1167 u8 next;
1168 int i, len = 0;
1169
1170 ret = pci_read_config_byte(pdev, pos, &cap);
1171 if (ret)
1172 return ret;
1173
1174 ret = pci_read_config_byte(pdev,
1175 pos + PCI_CAP_LIST_NEXT, &next);
1176 if (ret)
1177 return ret;
1178
1179 if (cap <= PCI_CAP_ID_MAX) {
1180 len = pci_cap_length[cap];
1181 if (len == 0xFF) { /* Variable length */
1182 len = vfio_cap_len(vdev, cap, pos);
1183 if (len < 0)
1184 return len;
1185 }
1186 }
1187
1188 if (!len) {
1189 pr_info("%s: %s hiding cap 0x%x\n",
1190 __func__, dev_name(&pdev->dev), cap);
1191 *prev = next;
1192 pos = next;
1193 continue;
1194 }
1195
1196 /* Sanity check, do we overlap other capabilities? */
1197 for (i = 0; i < len; i += 4) {
1198 if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID))
1199 continue;
1200
1201 pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
1202 __func__, dev_name(&pdev->dev),
1203 pos + i, map[pos + i], cap);
1204 }
1205
1206 memset(map + (pos / 4), cap, len / 4);
1207 ret = vfio_fill_vconfig_bytes(vdev, pos, len);
1208 if (ret)
1209 return ret;
1210
1211 prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT];
1212 pos = next;
1213 caps++;
1214 }
1215
1216 /* If we didn't fill any capabilities, clear the status flag */
1217 if (!caps) {
1218 __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS];
1219 *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST);
1220 }
1221
1222 return 0;
1223}
1224
1225static int vfio_ecap_init(struct vfio_pci_device *vdev)
1226{
1227 struct pci_dev *pdev = vdev->pdev;
1228 u8 *map = vdev->pci_config_map;
1229 u16 epos;
1230 __le32 *prev = NULL;
1231 int loops, ret, ecaps = 0;
1232
1233 if (!vdev->extended_caps)
1234 return 0;
1235
1236 epos = PCI_CFG_SPACE_SIZE;
1237
1238 loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF;
1239
1240 while (loops-- && epos >= PCI_CFG_SPACE_SIZE) {
1241 u32 header;
1242 u16 ecap;
1243 int i, len = 0;
1244 bool hidden = false;
1245
1246 ret = pci_read_config_dword(pdev, epos, &header);
1247 if (ret)
1248 return ret;
1249
1250 ecap = PCI_EXT_CAP_ID(header);
1251
1252 if (ecap <= PCI_EXT_CAP_ID_MAX) {
1253 len = pci_ext_cap_length[ecap];
1254 if (len == 0xFF) {
1255 len = vfio_ext_cap_len(vdev, ecap, epos);
1256 if (len < 0)
1257					return len;
1258 }
1259 }
1260
1261 if (!len) {
1262 pr_info("%s: %s hiding ecap 0x%x@0x%x\n",
1263 __func__, dev_name(&pdev->dev), ecap, epos);
1264
1265 /* If not the first in the chain, we can skip over it */
1266 if (prev) {
1267 u32 val = epos = PCI_EXT_CAP_NEXT(header);
1268 *prev &= cpu_to_le32(~(0xffcU << 20));
1269 *prev |= cpu_to_le32(val << 20);
1270 continue;
1271 }
1272
1273 /*
1274 * Otherwise, fill in a placeholder, the direct
1275 * readfn will virtualize this automatically
1276 */
1277 len = PCI_CAP_SIZEOF;
1278 hidden = true;
1279 }
1280
1281 for (i = 0; i < len; i += 4) {
1282 if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID))
1283 continue;
1284
1285 pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
1286 __func__, dev_name(&pdev->dev),
1287 epos + i, map[epos + i], ecap);
1288 }
1289
1290 /*
1291 * Even though ecap is 2 bytes, we're currently a long way
1292 * from exceeding 1 byte capabilities. If we ever make it
1293 * up to 0xFF we'll need to up this to a two-byte, byte map.
1294 */
1295 BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
1296
1297 memset(map + (epos / 4), ecap, len / 4);
1298 ret = vfio_fill_vconfig_bytes(vdev, epos, len);
1299 if (ret)
1300 return ret;
1301
1302 /*
1303 * If we're only using this capability to anchor the list, hide
1304 * the real ID and don't count it as a real ecap. XXX The PCI spec
1305 * says to use cap id = 0, version = 0, next = 0 if ecaps are
1306 * absent; hope users check all the way to the next pointer.
1307 */
1308 if (hidden)
1309 *(__le32 *)&vdev->vconfig[epos] &=
1310 cpu_to_le32((0xffcU << 20));
1311 else
1312 ecaps++;
1313
1314 prev = (__le32 *)&vdev->vconfig[epos];
1315 epos = PCI_EXT_CAP_NEXT(header);
1316 }
1317
1318 if (!ecaps)
1319 *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0;
1320
1321 return 0;
1322}
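
For reference, a short worked example of the extended capability header layout that the next-pointer rewrite above relies on; the header value is made up, and PCI_EXT_CAP_ID()/PCI_EXT_CAP_NEXT() are the standard helpers from pci_regs.h:

/* Illustrative only: decode one extended capability header word */
static void example_decode_ecap_header(void)
{
	u32 header = 0x15010001;		/* ID 0x0001, version 1, next at 0x150 */
	u16 id = PCI_EXT_CAP_ID(header);	/* 0x0001 (bits 15:0) */
	u16 next = PCI_EXT_CAP_NEXT(header);	/* 0x150 (bits 31:20, dword aligned) */

	/*
	 * Because offsets are dword aligned, bits 21:20 of the header are
	 * always zero, so clearing (0xffcU << 20) and OR-ing in (next << 20)
	 * rewrites exactly the next-pointer field - which is how a hidden
	 * ecap is skipped in vfio_ecap_init() above.
	 */
	(void)id;
	(void)next;
}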
1323
1324/*
1325 * For each device we allocate a pci_config_map that indicates the
1326 * capability occupying each dword and thus the struct perm_bits we
1327 * use for read and write. We also allocate a virtualized config
1328 * space which tracks reads and writes to bits that we emulate for
1329 * the user. Initial values filled from device.
1330 *
1331 * Using shared struct perm_bits between all vfio-pci devices saves
1332 * us from allocating cfg_size buffers for virt and write for every
1333 * device. We could remove vconfig and allocate individual buffers
1334 * for each area requiring emulated bits, but the array of pointers
1335 * would be comparable in size (at least for standard config space).
1336 */
1337int vfio_config_init(struct vfio_pci_device *vdev)
1338{
1339 struct pci_dev *pdev = vdev->pdev;
1340 u8 *map, *vconfig;
1341 int ret;
1342
1343 /*
1344 * Config space, caps and ecaps are all dword aligned, so we can
1345 * use one byte per dword to record the type.
1346 */
1347 map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL);
1348 if (!map)
1349 return -ENOMEM;
1350
1351 vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL);
1352 if (!vconfig) {
1353 kfree(map);
1354 return -ENOMEM;
1355 }
1356
1357 vdev->pci_config_map = map;
1358 vdev->vconfig = vconfig;
1359
1360 memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4);
1361 memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID,
1362 (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4);
1363
1364 ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
1365 if (ret)
1366 goto out;
1367
1368 vdev->bardirty = true;
1369
1370 /*
1371 * XXX can we just pci_load_saved_state/pci_restore_state?
1372 * may need to rebuild vconfig after that
1373 */
1374
1375 /* For restore after reset */
1376 vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]);
1377 vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]);
1378 vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]);
1379 vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]);
1380 vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]);
1381 vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]);
1382 vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]);
1383
1384 if (pdev->is_virtfn) {
1385 *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor);
1386 *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
1387 }
1388
1389 ret = vfio_cap_init(vdev);
1390 if (ret)
1391 goto out;
1392
1393 ret = vfio_ecap_init(vdev);
1394 if (ret)
1395 goto out;
1396
1397 return 0;
1398
1399out:
1400 kfree(map);
1401 vdev->pci_config_map = NULL;
1402 kfree(vconfig);
1403 vdev->vconfig = NULL;
1404 return pcibios_err_to_errno(ret);
1405}
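
As a rough illustration of the byte-per-dword map described above (the offset is made up; PCI_CAP_ID_PM and its fixed 8-byte length come from the standard PCI definitions):

/*
 * An 8-byte Power Management capability at config offset 0x50 marks two
 * map entries:
 *
 *	memset(map + (0x50 / 4), PCI_CAP_ID_PM, 8 / 4);
 *		=> map[0x14] = map[0x15] = PCI_CAP_ID_PM
 *
 * so a later access at, say, offset 0x54 resolves its permission bits
 * with a single lookup:
 *
 *	cap_id = vdev->pci_config_map[0x54 / 4];	(== PCI_CAP_ID_PM)
 *	perm = &cap_perms[cap_id];
 */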
1406
1407void vfio_config_free(struct vfio_pci_device *vdev)
1408{
1409 kfree(vdev->vconfig);
1410 vdev->vconfig = NULL;
1411 kfree(vdev->pci_config_map);
1412 vdev->pci_config_map = NULL;
1413 kfree(vdev->msi_perm);
1414 vdev->msi_perm = NULL;
1415}
1416
1417static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
1418 size_t count, loff_t *ppos, bool iswrite)
1419{
1420 struct pci_dev *pdev = vdev->pdev;
1421 struct perm_bits *perm;
1422 __le32 val = 0;
1423 int cap_start = 0, offset;
1424 u8 cap_id;
1425 ssize_t ret = count;
1426
1427 if (*ppos < 0 || *ppos + count > pdev->cfg_size)
1428 return -EFAULT;
1429
1430 /*
1431 * gcc can't seem to figure out that this static function is only called
1432 * with count of 1/2/4 and hits copy_from_user_overflow without this.
1433 */
1434 if (count > sizeof(val))
1435 return -EINVAL;
1436
1437 cap_id = vdev->pci_config_map[*ppos / 4];
1438
1439 if (cap_id == PCI_CAP_ID_INVALID) {
1440 if (iswrite)
1441 return ret; /* drop */
1442
1443 /*
1444 * Per PCI spec 3.0, section 6.1, reads from reserved and
1445 * unimplemented registers return 0
1446 */
1447 if (copy_to_user(buf, &val, count))
1448 return -EFAULT;
1449
1450 return ret;
1451 }
1452
1453 /*
1454 * All capabilities are minimum 4 bytes and aligned on dword
1455 * boundaries. Since we don't support unaligned accesses, we're
1456 * only ever accessing a single capability.
1457 */
1458 if (*ppos >= PCI_CFG_SPACE_SIZE) {
1459 WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
1460
1461 perm = &ecap_perms[cap_id];
1462 cap_start = vfio_find_cap_start(vdev, *ppos);
1463
1464 } else {
1465 WARN_ON(cap_id > PCI_CAP_ID_MAX);
1466
1467 perm = &cap_perms[cap_id];
1468
1469 if (cap_id == PCI_CAP_ID_MSI)
1470 perm = vdev->msi_perm;
1471
1472 if (cap_id > PCI_CAP_ID_BASIC)
1473 cap_start = vfio_find_cap_start(vdev, *ppos);
1474 }
1475
1476 WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
1477 WARN_ON(cap_start > *ppos);
1478
1479 offset = *ppos - cap_start;
1480
1481 if (iswrite) {
1482 if (!perm->writefn)
1483 return ret;
1484
1485 if (copy_from_user(&val, buf, count))
1486 return -EFAULT;
1487
1488 ret = perm->writefn(vdev, *ppos, count, perm, offset, val);
1489 } else {
1490 if (perm->readfn) {
1491 ret = perm->readfn(vdev, *ppos, count,
1492 perm, offset, &val);
1493 if (ret < 0)
1494 return ret;
1495 }
1496
1497 if (copy_to_user(buf, &val, count))
1498 return -EFAULT;
1499 }
1500
1501 return ret;
1502}
1503
1504ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
1505 char __user *buf, size_t count,
1506 loff_t *ppos, bool iswrite)
1507{
1508 size_t done = 0;
1509 int ret = 0;
1510 loff_t pos = *ppos;
1511
1512 pos &= VFIO_PCI_OFFSET_MASK;
1513
1514 /*
1515 * We want to both preserve the access size the caller uses and
1516 * support reading large chunks of config space in a single call.
1517 * PCI doesn't support unaligned accesses, so we can safely break
1518 * those apart.
1519 */
1520 while (count) {
1521 if (count >= 4 && !(pos % 4))
1522 ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
1523 else if (count >= 2 && !(pos % 2))
1524 ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
1525 else
1526 ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
1527
1528 if (ret < 0)
1529 return ret;
1530
1531 count -= ret;
1532 done += ret;
1533 buf += ret;
1534 pos += ret;
1535 }
1536
1537 *ppos += done;
1538
1539 return done;
1540}
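
A minimal user-space sketch of driving this path, assuming a vfio-pci device fd already obtained through the VFIO group ioctls and the fixed region-offset encoding defined in vfio_pci_private.h; the EX_* macros and helper name are hypothetical:

#define _FILE_OFFSET_BITS 64			/* 64-bit off_t for region offsets */
#include <stdint.h>
#include <unistd.h>

#define EX_VFIO_PCI_OFFSET_SHIFT	40	/* VFIO_PCI_OFFSET_SHIFT */
#define EX_VFIO_PCI_CONFIG_REGION	7	/* VFIO_PCI_CONFIG_REGION_INDEX */

/* Read the 2-byte vendor ID; a 2-byte, aligned access is passed through whole */
static int example_read_vendor_id(int device_fd, uint16_t *vendor)
{
	off_t off = (off_t)EX_VFIO_PCI_CONFIG_REGION << EX_VFIO_PCI_OFFSET_SHIFT;

	return pread(device_fd, vendor, sizeof(*vendor), off) ==
	       (ssize_t)sizeof(*vendor) ? 0 : -1;
}

A larger or unaligned request on the same region is simply split by the loop above into naturally aligned 4-, 2- and 1-byte accesses.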
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
new file mode 100644
index 000000000000..211a4920b88a
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -0,0 +1,740 @@
1/*
2 * VFIO PCI interrupt handling
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/device.h>
17#include <linux/interrupt.h>
18#include <linux/eventfd.h>
19#include <linux/pci.h>
20#include <linux/file.h>
21#include <linux/poll.h>
22#include <linux/vfio.h>
23#include <linux/wait.h>
24#include <linux/workqueue.h>
25
26#include "vfio_pci_private.h"
27
28/*
29 * IRQfd - generic
30 */
31struct virqfd {
32 struct vfio_pci_device *vdev;
33 struct eventfd_ctx *eventfd;
34 int (*handler)(struct vfio_pci_device *, void *);
35 void (*thread)(struct vfio_pci_device *, void *);
36 void *data;
37 struct work_struct inject;
38 wait_queue_t wait;
39 poll_table pt;
40 struct work_struct shutdown;
41 struct virqfd **pvirqfd;
42};
43
44static struct workqueue_struct *vfio_irqfd_cleanup_wq;
45
46int __init vfio_pci_virqfd_init(void)
47{
48 vfio_irqfd_cleanup_wq =
49 create_singlethread_workqueue("vfio-irqfd-cleanup");
50 if (!vfio_irqfd_cleanup_wq)
51 return -ENOMEM;
52
53 return 0;
54}
55
56void vfio_pci_virqfd_exit(void)
57{
58 destroy_workqueue(vfio_irqfd_cleanup_wq);
59}
60
61static void virqfd_deactivate(struct virqfd *virqfd)
62{
63 queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
64}
65
66static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
67{
68 struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
69 unsigned long flags = (unsigned long)key;
70
71 if (flags & POLLIN) {
72 /* An event has been signaled, call function */
73 if ((!virqfd->handler ||
74 virqfd->handler(virqfd->vdev, virqfd->data)) &&
75 virqfd->thread)
76 schedule_work(&virqfd->inject);
77 }
78
79 if (flags & POLLHUP)
80 /* The eventfd is closing, detach from VFIO */
81 virqfd_deactivate(virqfd);
82
83 return 0;
84}
85
86static void virqfd_ptable_queue_proc(struct file *file,
87 wait_queue_head_t *wqh, poll_table *pt)
88{
89 struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
90 add_wait_queue(wqh, &virqfd->wait);
91}
92
93static void virqfd_shutdown(struct work_struct *work)
94{
95 struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
96 struct virqfd **pvirqfd = virqfd->pvirqfd;
97 u64 cnt;
98
99 eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
100 flush_work(&virqfd->inject);
101 eventfd_ctx_put(virqfd->eventfd);
102
103 kfree(virqfd);
104 *pvirqfd = NULL;
105}
106
107static void virqfd_inject(struct work_struct *work)
108{
109 struct virqfd *virqfd = container_of(work, struct virqfd, inject);
110 if (virqfd->thread)
111 virqfd->thread(virqfd->vdev, virqfd->data);
112}
113
114static int virqfd_enable(struct vfio_pci_device *vdev,
115 int (*handler)(struct vfio_pci_device *, void *),
116 void (*thread)(struct vfio_pci_device *, void *),
117 void *data, struct virqfd **pvirqfd, int fd)
118{
119 struct file *file = NULL;
120 struct eventfd_ctx *ctx = NULL;
121 struct virqfd *virqfd;
122 int ret = 0;
123 unsigned int events;
124
125 if (*pvirqfd)
126 return -EBUSY;
127
128 virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
129 if (!virqfd)
130 return -ENOMEM;
131
132 virqfd->pvirqfd = pvirqfd;
133 *pvirqfd = virqfd;
134 virqfd->vdev = vdev;
135 virqfd->handler = handler;
136 virqfd->thread = thread;
137 virqfd->data = data;
138
139 INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
140 INIT_WORK(&virqfd->inject, virqfd_inject);
141
142 file = eventfd_fget(fd);
143 if (IS_ERR(file)) {
144 ret = PTR_ERR(file);
145 goto fail;
146 }
147
148 ctx = eventfd_ctx_fileget(file);
149 if (IS_ERR(ctx)) {
150 ret = PTR_ERR(ctx);
151 goto fail;
152 }
153
154 virqfd->eventfd = ctx;
155
156 /*
157 * Install our own custom wake-up handling so we are notified via
158 * a callback whenever someone signals the underlying eventfd.
159 */
160 init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
161 init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
162
163 events = file->f_op->poll(file, &virqfd->pt);
164
165 /*
166 * Check if there was an event already pending on the eventfd
167 * before we registered and trigger it as if we didn't miss it.
168 */
169 if (events & POLLIN) {
170 if ((!handler || handler(vdev, data)) && thread)
171 schedule_work(&virqfd->inject);
172 }
173
174 /*
175 * Do not drop the file until the irqfd is fully initialized,
176 * otherwise we might race against the POLLHUP.
177 */
178 fput(file);
179
180 return 0;
181
182fail:
183 if (ctx && !IS_ERR(ctx))
184 eventfd_ctx_put(ctx);
185
186 if (file && !IS_ERR(file))
187 fput(file);
188
189 kfree(virqfd);
190 *pvirqfd = NULL;
191
192 return ret;
193}
194
195static void virqfd_disable(struct virqfd *virqfd)
196{
197 if (!virqfd)
198 return;
199
200 virqfd_deactivate(virqfd);
201
202 /* Block until we know all outstanding shutdown jobs have completed. */
203 flush_workqueue(vfio_irqfd_cleanup_wq);
204}
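
A minimal sketch of how a caller hooks an eventfd into this machinery; example_handler and example_wire_eventfd are hypothetical, the real users being the INTx unmask and mask paths further down:

/* Illustrative only: run example_handler whenever the eventfd is signaled */
static int example_handler(struct vfio_pci_device *vdev, void *data)
{
	return 0;	/* non-zero would also request the thread callback, if any */
}

static int example_wire_eventfd(struct vfio_pci_device *vdev,
				struct virqfd **pvirqfd, int fd)
{
	return virqfd_enable(vdev, example_handler, NULL, NULL, pvirqfd, fd);
}

virqfd_disable() on the same pointer tears the hookup down again, and the POLLHUP path above does so automatically when the eventfd is closed.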
205
206/*
207 * INTx
208 */
209static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
210{
211 if (likely(is_intx(vdev) && !vdev->virq_disabled))
212 eventfd_signal(vdev->ctx[0].trigger, 1);
213}
214
215void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
216{
217 struct pci_dev *pdev = vdev->pdev;
218 unsigned long flags;
219
220 spin_lock_irqsave(&vdev->irqlock, flags);
221
222 /*
223 * Masking can come from interrupt, ioctl, or config space
224 * via INTx disable. The latter means this can get called
225 * even when not using intx delivery. In this case, just
226 * try to have the physical bit follow the virtual bit.
227 */
228 if (unlikely(!is_intx(vdev))) {
229 if (vdev->pci_2_3)
230 pci_intx(pdev, 0);
231 } else if (!vdev->ctx[0].masked) {
232 /*
233 * Can't use check_and_mask here because we always want to
234 * mask, not just when something is pending.
235 */
236 if (vdev->pci_2_3)
237 pci_intx(pdev, 0);
238 else
239 disable_irq_nosync(pdev->irq);
240
241 vdev->ctx[0].masked = true;
242 }
243
244 spin_unlock_irqrestore(&vdev->irqlock, flags);
245}
246
247/*
248 * If this is triggered by an eventfd, we can't call eventfd_signal
249 * or else we'll deadlock on the eventfd wait queue. Return >0 when
250 * a signal is necessary, which can then be handled via a work queue
251 * or directly depending on the caller.
252 */
253int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
254{
255 struct pci_dev *pdev = vdev->pdev;
256 unsigned long flags;
257 int ret = 0;
258
259 spin_lock_irqsave(&vdev->irqlock, flags);
260
261 /*
262 * Unmasking comes from ioctl or config, so again, have the
263 * physical bit follow the virtual even when not using INTx.
264 */
265 if (unlikely(!is_intx(vdev))) {
266 if (vdev->pci_2_3)
267 pci_intx(pdev, 1);
268 } else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
269 /*
270 * A pending interrupt here would immediately trigger,
271 * but we can avoid that overhead by just re-sending
272 * the interrupt to the user.
273 */
274 if (vdev->pci_2_3) {
275 if (!pci_check_and_unmask_intx(pdev))
276 ret = 1;
277 } else
278 enable_irq(pdev->irq);
279
280 vdev->ctx[0].masked = (ret > 0);
281 }
282
283 spin_unlock_irqrestore(&vdev->irqlock, flags);
284
285 return ret;
286}
287
288void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
289{
290 if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
291 vfio_send_intx_eventfd(vdev, NULL);
292}
293
294static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
295{
296 struct vfio_pci_device *vdev = dev_id;
297 unsigned long flags;
298 int ret = IRQ_NONE;
299
300 spin_lock_irqsave(&vdev->irqlock, flags);
301
302 if (!vdev->pci_2_3) {
303 disable_irq_nosync(vdev->pdev->irq);
304 vdev->ctx[0].masked = true;
305 ret = IRQ_HANDLED;
306 } else if (!vdev->ctx[0].masked && /* may be shared */
307 pci_check_and_mask_intx(vdev->pdev)) {
308 vdev->ctx[0].masked = true;
309 ret = IRQ_HANDLED;
310 }
311
312 spin_unlock_irqrestore(&vdev->irqlock, flags);
313
314 if (ret == IRQ_HANDLED)
315 vfio_send_intx_eventfd(vdev, NULL);
316
317 return ret;
318}
319
320static int vfio_intx_enable(struct vfio_pci_device *vdev)
321{
322 if (!is_irq_none(vdev))
323 return -EINVAL;
324
325 if (!vdev->pdev->irq)
326 return -ENODEV;
327
328 vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
329 if (!vdev->ctx)
330 return -ENOMEM;
331
332 vdev->num_ctx = 1;
333 vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
334
335 return 0;
336}
337
338static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
339{
340 struct pci_dev *pdev = vdev->pdev;
341 unsigned long irqflags = IRQF_SHARED;
342 struct eventfd_ctx *trigger;
343 unsigned long flags;
344 int ret;
345
346 if (vdev->ctx[0].trigger) {
347 free_irq(pdev->irq, vdev);
348 kfree(vdev->ctx[0].name);
349 eventfd_ctx_put(vdev->ctx[0].trigger);
350 vdev->ctx[0].trigger = NULL;
351 }
352
353 if (fd < 0) /* Disable only */
354 return 0;
355
356 vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
357 pci_name(pdev));
358 if (!vdev->ctx[0].name)
359 return -ENOMEM;
360
361 trigger = eventfd_ctx_fdget(fd);
362 if (IS_ERR(trigger)) {
363 kfree(vdev->ctx[0].name);
364 return PTR_ERR(trigger);
365 }
366
367 if (!vdev->pci_2_3)
368 irqflags = 0;
369
370 ret = request_irq(pdev->irq, vfio_intx_handler,
371 irqflags, vdev->ctx[0].name, vdev);
372 if (ret) {
373 kfree(vdev->ctx[0].name);
374 eventfd_ctx_put(trigger);
375 return ret;
376 }
377
378 vdev->ctx[0].trigger = trigger;
379
380 /*
381 * INTx disable will stick across the new irq setup,
382 * disable_irq won't.
383 */
384 spin_lock_irqsave(&vdev->irqlock, flags);
385 if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled))
386 disable_irq_nosync(pdev->irq);
387 spin_unlock_irqrestore(&vdev->irqlock, flags);
388
389 return 0;
390}
391
392static void vfio_intx_disable(struct vfio_pci_device *vdev)
393{
394 vfio_intx_set_signal(vdev, -1);
395 virqfd_disable(vdev->ctx[0].unmask);
396 virqfd_disable(vdev->ctx[0].mask);
397 vdev->irq_type = VFIO_PCI_NUM_IRQS;
398 vdev->num_ctx = 0;
399 kfree(vdev->ctx);
400}
401
402/*
403 * MSI/MSI-X
404 */
405static irqreturn_t vfio_msihandler(int irq, void *arg)
406{
407 struct eventfd_ctx *trigger = arg;
408
409 eventfd_signal(trigger, 1);
410 return IRQ_HANDLED;
411}
412
413static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
414{
415 struct pci_dev *pdev = vdev->pdev;
416 int ret;
417
418 if (!is_irq_none(vdev))
419 return -EINVAL;
420
421 vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
422 if (!vdev->ctx)
423 return -ENOMEM;
424
425 if (msix) {
426 int i;
427
428 vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
429 GFP_KERNEL);
430 if (!vdev->msix) {
431 kfree(vdev->ctx);
432 return -ENOMEM;
433 }
434
435 for (i = 0; i < nvec; i++)
436 vdev->msix[i].entry = i;
437
438 ret = pci_enable_msix(pdev, vdev->msix, nvec);
439 if (ret) {
440 kfree(vdev->msix);
441 kfree(vdev->ctx);
442 return ret;
443 }
444 } else {
445 ret = pci_enable_msi_block(pdev, nvec);
446 if (ret) {
447 kfree(vdev->ctx);
448 return ret;
449 }
450 }
451
452 vdev->num_ctx = nvec;
453 vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
454 VFIO_PCI_MSI_IRQ_INDEX;
455
456 if (!msix) {
457 /*
458 * Compute the virtual hardware field for max msi vectors -
459 * it is the log base 2 of the number of vectors.
460 */
461 vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
462 }
463
464 return 0;
465}
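
A quick worked check of the msi_qmax formula above: for nvec = 3, fls(3 * 2 - 1) - 1 = fls(5) - 1 = 3 - 1 = 2, and 2^2 = 4 is the smallest power of two covering 3 vectors; for nvec = 1 the result is 0. This matches the power-of-two encoding of the MSI Multiple Message Capable field.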
466
467static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
468 int vector, int fd, bool msix)
469{
470 struct pci_dev *pdev = vdev->pdev;
471 int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
472 char *name = msix ? "vfio-msix" : "vfio-msi";
473 struct eventfd_ctx *trigger;
474 int ret;
475
476 if (vector >= vdev->num_ctx)
477 return -EINVAL;
478
479 if (vdev->ctx[vector].trigger) {
480 free_irq(irq, vdev->ctx[vector].trigger);
481 kfree(vdev->ctx[vector].name);
482 eventfd_ctx_put(vdev->ctx[vector].trigger);
483 vdev->ctx[vector].trigger = NULL;
484 }
485
486 if (fd < 0)
487 return 0;
488
489 vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
490 name, vector, pci_name(pdev));
491 if (!vdev->ctx[vector].name)
492 return -ENOMEM;
493
494 trigger = eventfd_ctx_fdget(fd);
495 if (IS_ERR(trigger)) {
496 kfree(vdev->ctx[vector].name);
497 return PTR_ERR(trigger);
498 }
499
500 ret = request_irq(irq, vfio_msihandler, 0,
501 vdev->ctx[vector].name, trigger);
502 if (ret) {
503 kfree(vdev->ctx[vector].name);
504 eventfd_ctx_put(trigger);
505 return ret;
506 }
507
508 vdev->ctx[vector].trigger = trigger;
509
510 return 0;
511}
512
513static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
514 unsigned count, int32_t *fds, bool msix)
515{
516 int i, j, ret = 0;
517
518 if (start + count > vdev->num_ctx)
519 return -EINVAL;
520
521 for (i = 0, j = start; i < count && !ret; i++, j++) {
522 int fd = fds ? fds[i] : -1;
523 ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
524 }
525
526 if (ret) {
527 for (--j; j >= (int)start; j--)
528 vfio_msi_set_vector_signal(vdev, j, -1, msix);
529 }
530
531 return ret;
532}
533
534static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
535{
536 struct pci_dev *pdev = vdev->pdev;
537 int i;
538
539 vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
540
541 for (i = 0; i < vdev->num_ctx; i++) {
542 virqfd_disable(vdev->ctx[i].unmask);
543 virqfd_disable(vdev->ctx[i].mask);
544 }
545
546 if (msix) {
547 pci_disable_msix(vdev->pdev);
548 kfree(vdev->msix);
549 } else
550 pci_disable_msi(pdev);
551
552 vdev->irq_type = VFIO_PCI_NUM_IRQS;
553 vdev->num_ctx = 0;
554 kfree(vdev->ctx);
555}
556
557/*
558 * IOCTL support
559 */
560static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
561 unsigned index, unsigned start,
562 unsigned count, uint32_t flags, void *data)
563{
564 if (!is_intx(vdev) || start != 0 || count != 1)
565 return -EINVAL;
566
567 if (flags & VFIO_IRQ_SET_DATA_NONE) {
568 vfio_pci_intx_unmask(vdev);
569 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
570 uint8_t unmask = *(uint8_t *)data;
571 if (unmask)
572 vfio_pci_intx_unmask(vdev);
573 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
574 int32_t fd = *(int32_t *)data;
575 if (fd >= 0)
576 return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
577 vfio_send_intx_eventfd, NULL,
578 &vdev->ctx[0].unmask, fd);
579
580 virqfd_disable(vdev->ctx[0].unmask);
581 }
582
583 return 0;
584}
585
586static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
587 unsigned index, unsigned start,
588 unsigned count, uint32_t flags, void *data)
589{
590 if (!is_intx(vdev) || start != 0 || count != 1)
591 return -EINVAL;
592
593 if (flags & VFIO_IRQ_SET_DATA_NONE) {
594 vfio_pci_intx_mask(vdev);
595 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
596 uint8_t mask = *(uint8_t *)data;
597 if (mask)
598 vfio_pci_intx_mask(vdev);
599 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
600 return -ENOTTY; /* XXX implement me */
601 }
602
603 return 0;
604}
605
606static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
607 unsigned index, unsigned start,
608 unsigned count, uint32_t flags, void *data)
609{
610 if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
611 vfio_intx_disable(vdev);
612 return 0;
613 }
614
615 if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
616 return -EINVAL;
617
618 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
619 int32_t fd = *(int32_t *)data;
620 int ret;
621
622 if (is_intx(vdev))
623 return vfio_intx_set_signal(vdev, fd);
624
625 ret = vfio_intx_enable(vdev);
626 if (ret)
627 return ret;
628
629 ret = vfio_intx_set_signal(vdev, fd);
630 if (ret)
631 vfio_intx_disable(vdev);
632
633 return ret;
634 }
635
636 if (!is_intx(vdev))
637 return -EINVAL;
638
639 if (flags & VFIO_IRQ_SET_DATA_NONE) {
640 vfio_send_intx_eventfd(vdev, NULL);
641 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
642 uint8_t trigger = *(uint8_t *)data;
643 if (trigger)
644 vfio_send_intx_eventfd(vdev, NULL);
645 }
646 return 0;
647}
648
649static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
650 unsigned index, unsigned start,
651 unsigned count, uint32_t flags, void *data)
652{
653 int i;
654 bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX);
655
656 if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
657 vfio_msi_disable(vdev, msix);
658 return 0;
659 }
660
661 if (!(irq_is(vdev, index) || is_irq_none(vdev)))
662 return -EINVAL;
663
664 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
665 int32_t *fds = data;
666 int ret;
667
668 if (vdev->irq_type == index)
669 return vfio_msi_set_block(vdev, start, count,
670 fds, msix);
671
672 ret = vfio_msi_enable(vdev, start + count, msix);
673 if (ret)
674 return ret;
675
676 ret = vfio_msi_set_block(vdev, start, count, fds, msix);
677 if (ret)
678 vfio_msi_disable(vdev, msix);
679
680 return ret;
681 }
682
683 if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
684 return -EINVAL;
685
686 for (i = start; i < start + count; i++) {
687 if (!vdev->ctx[i].trigger)
688 continue;
689 if (flags & VFIO_IRQ_SET_DATA_NONE) {
690 eventfd_signal(vdev->ctx[i].trigger, 1);
691 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
692 uint8_t *bools = data;
693 if (bools[i - start])
694 eventfd_signal(vdev->ctx[i].trigger, 1);
695 }
696 }
697 return 0;
698}
699
700int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
701 unsigned index, unsigned start, unsigned count,
702 void *data)
703{
704 int (*func)(struct vfio_pci_device *vdev, unsigned index,
705 unsigned start, unsigned count, uint32_t flags,
706 void *data) = NULL;
707
708 switch (index) {
709 case VFIO_PCI_INTX_IRQ_INDEX:
710 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
711 case VFIO_IRQ_SET_ACTION_MASK:
712 func = vfio_pci_set_intx_mask;
713 break;
714 case VFIO_IRQ_SET_ACTION_UNMASK:
715 func = vfio_pci_set_intx_unmask;
716 break;
717 case VFIO_IRQ_SET_ACTION_TRIGGER:
718 func = vfio_pci_set_intx_trigger;
719 break;
720 }
721 break;
722 case VFIO_PCI_MSI_IRQ_INDEX:
723 case VFIO_PCI_MSIX_IRQ_INDEX:
724 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
725 case VFIO_IRQ_SET_ACTION_MASK:
726 case VFIO_IRQ_SET_ACTION_UNMASK:
727 /* XXX Need masking support exported */
728 break;
729 case VFIO_IRQ_SET_ACTION_TRIGGER:
730 func = vfio_pci_set_msi_trigger;
731 break;
732 }
733 break;
734 }
735
736 if (!func)
737 return -ENOTTY;
738
739 return func(vdev, index, start, count, flags, data);
740}
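
For context, a hedged user-space sketch of the ioctl that lands here, wiring an eventfd to MSI vector 0; it assumes the UAPI added to <linux/vfio.h> by this series, and example_set_msi_trigger is a hypothetical helper:

#include <linux/vfio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static int example_set_msi_trigger(int device_fd, int event_fd)
{
	size_t sz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
	struct vfio_irq_set *set = calloc(1, sz);
	int ret;

	if (!set)
		return -1;

	set->argsz = sz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_MSI_IRQ_INDEX;
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &event_fd, sizeof(int32_t));

	ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
	free(set);
	return ret;
}

Passing -1 in place of the eventfd through the same call disables the vector again, as vfio_msi_set_vector_signal() above shows.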
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
new file mode 100644
index 000000000000..611827cba8cd
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -0,0 +1,91 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
3 * Author: Alex Williamson <alex.williamson@redhat.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * Derived from original vfio:
10 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
11 * Author: Tom Lyon, pugs@cisco.com
12 */
13
14#include <linux/mutex.h>
15#include <linux/pci.h>
16
17#ifndef VFIO_PCI_PRIVATE_H
18#define VFIO_PCI_PRIVATE_H
19
20#define VFIO_PCI_OFFSET_SHIFT 40
21
22#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT)
23#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
24#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
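/*
 * Worked example of this encoding (illustrative values only): a 4-byte
 * access at offset 0x10 into BAR2 uses file offset
 *
 *	VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR2_REGION_INDEX) + 0x10
 *		= ((u64)2 << 40) + 0x10 = 0x20000000010
 *
 * and the read/write handlers recover the pieces with
 * VFIO_PCI_OFFSET_TO_INDEX() and (offset & VFIO_PCI_OFFSET_MASK).
 */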
25
26struct vfio_pci_irq_ctx {
27 struct eventfd_ctx *trigger;
28 struct virqfd *unmask;
29 struct virqfd *mask;
30 char *name;
31 bool masked;
32};
33
34struct vfio_pci_device {
35 struct pci_dev *pdev;
36 void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
37 u8 *pci_config_map;
38 u8 *vconfig;
39 struct perm_bits *msi_perm;
40 spinlock_t irqlock;
41 struct mutex igate;
42 struct msix_entry *msix;
43 struct vfio_pci_irq_ctx *ctx;
44 int num_ctx;
45 int irq_type;
46 u8 msi_qmax;
47 u8 msix_bar;
48 u16 msix_size;
49 u32 msix_offset;
50 u32 rbar[7];
51 bool pci_2_3;
52 bool virq_disabled;
53 bool reset_works;
54 bool extended_caps;
55 bool bardirty;
56 struct pci_saved_state *pci_saved_state;
57 atomic_t refcnt;
58};
59
60#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
61#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
62#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
63#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
64#define irq_is(vdev, type) (vdev->irq_type == type)
65
66extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev);
67extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev);
68
69extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
70 uint32_t flags, unsigned index,
71 unsigned start, unsigned count, void *data);
72
73extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
74 char __user *buf, size_t count,
75 loff_t *ppos, bool iswrite);
76extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev,
77 char __user *buf, size_t count,
78 loff_t *ppos, bool iswrite);
79extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev,
80 char __user *buf, size_t count,
81 loff_t *ppos, bool iswrite);
82
83extern int vfio_pci_init_perm_bits(void);
84extern void vfio_pci_uninit_perm_bits(void);
85
86extern int vfio_pci_virqfd_init(void);
87extern void vfio_pci_virqfd_exit(void);
88
89extern int vfio_config_init(struct vfio_pci_device *vdev);
90extern void vfio_config_free(struct vfio_pci_device *vdev);
91#endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
new file mode 100644
index 000000000000..4362d9e7baa3
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -0,0 +1,269 @@
1/*
2 * VFIO PCI I/O Port & MMIO access
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/fs.h>
17#include <linux/pci.h>
18#include <linux/uaccess.h>
19#include <linux/io.h>
20
21#include "vfio_pci_private.h"
22
23/* I/O Port BAR access */
24ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf,
25 size_t count, loff_t *ppos, bool iswrite)
26{
27 struct pci_dev *pdev = vdev->pdev;
28 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
29 int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
30 void __iomem *io;
31 size_t done = 0;
32
33 if (!pci_resource_start(pdev, bar))
34 return -EINVAL;
35
36 if (pos + count > pci_resource_len(pdev, bar))
37 return -EINVAL;
38
39 if (!vdev->barmap[bar]) {
40 int ret;
41
42 ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
43 if (ret)
44 return ret;
45
46 vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
47
48 if (!vdev->barmap[bar]) {
49 pci_release_selected_regions(pdev, 1 << bar);
50 return -EINVAL;
51 }
52 }
53
54 io = vdev->barmap[bar];
55
56 while (count) {
57 int filled;
58
59 if (count >= 4 && !(pos % 4)) {
60 __le32 val;
61
62 if (iswrite) {
63 if (copy_from_user(&val, buf, 4))
64 return -EFAULT;
65
66 iowrite32(le32_to_cpu(val), io + pos);
67 } else {
68 val = cpu_to_le32(ioread32(io + pos));
69
70 if (copy_to_user(buf, &val, 4))
71 return -EFAULT;
72 }
73
74 filled = 4;
75
76 } else if ((pos % 2) == 0 && count >= 2) {
77 __le16 val;
78
79 if (iswrite) {
80 if (copy_from_user(&val, buf, 2))
81 return -EFAULT;
82
83 iowrite16(le16_to_cpu(val), io + pos);
84 } else {
85 val = cpu_to_le16(ioread16(io + pos));
86
87 if (copy_to_user(buf, &val, 2))
88 return -EFAULT;
89 }
90
91 filled = 2;
92 } else {
93 u8 val;
94
95 if (iswrite) {
96 if (copy_from_user(&val, buf, 1))
97 return -EFAULT;
98
99 iowrite8(val, io + pos);
100 } else {
101 val = ioread8(io + pos);
102
103 if (copy_to_user(buf, &val, 1))
104 return -EFAULT;
105 }
106
107 filled = 1;
108 }
109
110 count -= filled;
111 done += filled;
112 buf += filled;
113 pos += filled;
114 }
115
116 *ppos += done;
117
118 return done;
119}
120
121/*
122 * MMIO BAR access
123 * We also handle two excluded ranges here: if the user accesses the ROM
124 * beyond what PCI tells us is available, or the MSI-X table region,
125 * reads return 0xFF and writes are dropped.
126 */
127ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf,
128 size_t count, loff_t *ppos, bool iswrite)
129{
130 struct pci_dev *pdev = vdev->pdev;
131 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
132 int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
133 void __iomem *io;
134 resource_size_t end;
135 size_t done = 0;
136 size_t x_start = 0, x_end = 0; /* excluded range */
137
138 if (!pci_resource_start(pdev, bar))
139 return -EINVAL;
140
141 end = pci_resource_len(pdev, bar);
142
143 if (pos > end)
144 return -EINVAL;
145
146 if (pos == end)
147 return 0;
148
149 if (pos + count > end)
150 count = end - pos;
151
152 if (bar == PCI_ROM_RESOURCE) {
153 io = pci_map_rom(pdev, &x_start);
154 x_end = end;
155 } else {
156 if (!vdev->barmap[bar]) {
157 int ret;
158
159 ret = pci_request_selected_regions(pdev, 1 << bar,
160 "vfio");
161 if (ret)
162 return ret;
163
164 vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
165
166 if (!vdev->barmap[bar]) {
167 pci_release_selected_regions(pdev, 1 << bar);
168 return -EINVAL;
169 }
170 }
171
172 io = vdev->barmap[bar];
173
174 if (bar == vdev->msix_bar) {
175 x_start = vdev->msix_offset;
176 x_end = vdev->msix_offset + vdev->msix_size;
177 }
178 }
179
180 if (!io)
181 return -EINVAL;
182
183 while (count) {
184 size_t fillable, filled;
185
186 if (pos < x_start)
187 fillable = x_start - pos;
188 else if (pos >= x_end)
189 fillable = end - pos;
190 else
191 fillable = 0;
192
193 if (fillable >= 4 && !(pos % 4) && (count >= 4)) {
194 __le32 val;
195
196 if (iswrite) {
197 if (copy_from_user(&val, buf, 4))
198 goto out;
199
200 iowrite32(le32_to_cpu(val), io + pos);
201 } else {
202 val = cpu_to_le32(ioread32(io + pos));
203
204 if (copy_to_user(buf, &val, 4))
205 goto out;
206 }
207
208 filled = 4;
209 } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) {
210 __le16 val;
211
212 if (iswrite) {
213 if (copy_from_user(&val, buf, 2))
214 goto out;
215
216 iowrite16(le16_to_cpu(val), io + pos);
217 } else {
218 val = cpu_to_le16(ioread16(io + pos));
219
220 if (copy_to_user(buf, &val, 2))
221 goto out;
222 }
223
224 filled = 2;
225 } else if (fillable) {
226 u8 val;
227
228 if (iswrite) {
229 if (copy_from_user(&val, buf, 1))
230 goto out;
231
232 iowrite8(val, io + pos);
233 } else {
234 val = ioread8(io + pos);
235
236 if (copy_to_user(buf, &val, 1))
237 goto out;
238 }
239
240 filled = 1;
241 } else {
242 /* Drop writes, fill reads with FF, but never run past count */
243 filled = min(count, (size_t)(x_end - pos));
244
245 if (!iswrite) {
246 char val = 0xFF;
247 size_t i;
248
249 for (i = 0; i < filled; i++) {
250 if (put_user(val, buf + i))
251 goto out;
252 }
253 }
254 }
255
256 count -= filled;
257 done += filled;
258 buf += filled;
259 pos += filled;
260 }
261
262 *ppos += done;
263
264out:
265 if (bar == PCI_ROM_RESOURCE)
266 pci_unmap_rom(pdev, io);
267
268 return count ? -EFAULT : done;
269}
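
A matching user-space sketch of a BAR access through this path, assuming the same fixed offset encoding as for config space; the EX_* macros and helper are hypothetical, and any bytes that fall in the MSI-X table or past the real ROM read back as 0xFF per the comment above:

#define _FILE_OFFSET_BITS 64			/* 64-bit off_t for region offsets */
#include <stdint.h>
#include <unistd.h>

#define EX_VFIO_PCI_OFFSET_SHIFT	40	/* VFIO_PCI_OFFSET_SHIFT */
#define EX_VFIO_PCI_BAR2_REGION		2	/* VFIO_PCI_BAR2_REGION_INDEX */

/* Write one 32-bit register in BAR2 */
static int example_write_bar2(int device_fd, uint64_t reg, uint32_t val)
{
	off_t off = ((off_t)EX_VFIO_PCI_BAR2_REGION << EX_VFIO_PCI_OFFSET_SHIFT) + reg;

	return pwrite(device_fd, &val, sizeof(val), off) ==
	       (ssize_t)sizeof(val) ? 0 : -1;
}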
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index acb046fd5b70..0a4f180a11d8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -223,6 +223,7 @@ struct vfio_device_info {
223 __u32 argsz; 223 __u32 argsz;
224 __u32 flags; 224 __u32 flags;
225#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ 225#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */
226#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
226 __u32 num_regions; /* Max region index + 1 */ 227 __u32 num_regions; /* Max region index + 1 */
227 __u32 num_irqs; /* Max IRQ index + 1 */ 228 __u32 num_irqs; /* Max IRQ index + 1 */
228}; 229};
@@ -364,6 +365,31 @@ struct vfio_irq_set {
364 */ 365 */
365#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) 366#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
366 367
368/*
369 * The VFIO-PCI bus driver makes use of the following fixed region and
370 * IRQ index mapping. Unimplemented regions return a size of zero.
371 * Unimplemented IRQ types return a count of zero.
372 */
373
374enum {
375 VFIO_PCI_BAR0_REGION_INDEX,
376 VFIO_PCI_BAR1_REGION_INDEX,
377 VFIO_PCI_BAR2_REGION_INDEX,
378 VFIO_PCI_BAR3_REGION_INDEX,
379 VFIO_PCI_BAR4_REGION_INDEX,
380 VFIO_PCI_BAR5_REGION_INDEX,
381 VFIO_PCI_ROM_REGION_INDEX,
382 VFIO_PCI_CONFIG_REGION_INDEX,
383 VFIO_PCI_NUM_REGIONS
384};
385
386enum {
387 VFIO_PCI_INTX_IRQ_INDEX,
388 VFIO_PCI_MSI_IRQ_INDEX,
389 VFIO_PCI_MSIX_IRQ_INDEX,
390 VFIO_PCI_NUM_IRQS
391};
392
367/* -------- API for Type1 VFIO IOMMU -------- */ 393/* -------- API for Type1 VFIO IOMMU -------- */
368 394
369/** 395/**