aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/vfio/pci/vfio_pci.c
diff options
context:
space:
mode:
authorAlex Williamson <alex.williamson@redhat.com>2012-07-31 10:16:24 -0400
committerAlex Williamson <alex.williamson@redhat.com>2012-07-31 10:16:24 -0400
commit89e1f7d4c66d85f42c3d52ea3866eb10cadf6153 (patch)
tree6bea54ae5eaea48c17d309855d36d801259b64d1 /drivers/vfio/pci/vfio_pci.c
parent73fa0d10d077d9521ee2dace2307ae2c9a965336 (diff)
vfio: Add PCI device driver
Add PCI device support for VFIO. PCI devices expose regions for accessing config space, I/O port space, and MMIO areas of the device. PCI config access is virtualized in the kernel, allowing us to ensure the integrity of the system, by preventing various accesses while reducing duplicate support across various userspace drivers. I/O port supports read/write access while MMIO also supports mmap of sufficiently sized regions. Support for INTx, MSI, and MSI-X interrupts are provided using eventfds to userspace. Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Diffstat (limited to 'drivers/vfio/pci/vfio_pci.c')
-rw-r--r--drivers/vfio/pci/vfio_pci.c579
1 file changed, 579 insertions, 0 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
3 * Author: Alex Williamson <alex.williamson@redhat.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * Derived from original vfio:
10 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
11 * Author: Tom Lyon, pugs@cisco.com
12 */
13
14#include <linux/device.h>
15#include <linux/eventfd.h>
16#include <linux/interrupt.h>
17#include <linux/iommu.h>
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/notifier.h>
21#include <linux/pci.h>
22#include <linux/pm_runtime.h>
23#include <linux/slab.h>
24#include <linux/types.h>
25#include <linux/uaccess.h>
26#include <linux/vfio.h>
27
28#include "vfio_pci_private.h"
29
#define DRIVER_VERSION "0.2"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO PCI - User Level meta-driver"

/*
 * Module option: when set, PCI 2.3 style INTx masking is never used,
 * even if the device appears to support it.  Workaround for devices
 * with broken INTx masking (writable at S_IRUGO | S_IWUSR).
 */
static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
38
/*
 * Bring the device up for the first user: snapshot config state,
 * build the virtualized config space, cache MSI-X geometry and
 * enable the device.  Returns 0 or a negative errno.
 */
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	/*
	 * Probe whether function-level reset works; the result gates
	 * VFIO_DEVICE_FLAGS_RESET and the VFIO_DEVICE_RESET ioctl.
	 */
	vdev->reset_works = (pci_reset_function(pdev) == 0);
	pci_save_state(pdev);
	/* Keep a private copy of the config state so disable can restore it */
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	/* Set up the virtualized config space for this device */
	ret = vfio_config_init(vdev);
	if (ret)
		goto out;

	/* PCI 2.3 devices can mask INTx through the command register */
	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	/*
	 * If masking is supported but INTx is currently disabled, enable
	 * it so the interrupt state starts from a known baseline.
	 */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	/*
	 * Cache the MSI-X table BAR/offset/size so mmap() can reject
	 * mappings that overlap the table.
	 */
	msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
		vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
		/* Each MSI-X table entry is 16 bytes */
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;	/* sentinel: no MSI-X BAR */

	ret = pci_enable_device(pdev);
	if (ret)
		goto out;

	return ret;

out:
	/* Unwind: drop the saved state copy and the virtual config space */
	kfree(vdev->pci_saved_state);
	vdev->pci_saved_state = NULL;
	vfio_config_free(vdev);
	return ret;
}
92
/*
 * Tear down device state when the last user closes: disable the
 * device, shut down user interrupt configuration, free the virtual
 * config space, and attempt to return the device to its pre-open
 * state via reset + saved-state restore.
 */
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	int bar;

	pci_disable_device(vdev->pdev);

	/* Trigger-with-no-data tears down whatever IRQ mode is active */
	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	vdev->virq_disabled = false;

	vfio_config_free(vdev);

	/*
	 * Reset then restore the config state saved at enable time so
	 * the next user (or host driver) sees a clean device.
	 */
	pci_reset_function(vdev->pdev);

	if (pci_load_and_free_saved_state(vdev->pdev,
					  &vdev->pci_saved_state) == 0)
		pci_restore_state(vdev->pdev);
	else
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&vdev->pdev->dev));

	/* Release any BAR mappings/regions claimed on behalf of mmap() */
	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(vdev->pdev, vdev->barmap[bar]);
		pci_release_selected_regions(vdev->pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}
}
124
/*
 * Drop a reference on the device; the last closer tears down device
 * state.  Balances the module reference taken in vfio_pci_open().
 */
static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (atomic_dec_and_test(&vdev->refcnt))
		vfio_pci_disable(vdev);

	module_put(THIS_MODULE);
}
134
135static int vfio_pci_open(void *device_data)
136{
137 struct vfio_pci_device *vdev = device_data;
138
139 if (!try_module_get(THIS_MODULE))
140 return -ENODEV;
141
142 if (atomic_inc_return(&vdev->refcnt) == 1) {
143 int ret = vfio_pci_enable(vdev);
144 if (ret) {
145 module_put(THIS_MODULE);
146 return ret;
147 }
148 }
149
150 return 0;
151}
152
153static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
154{
155 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
156 u8 pin;
157 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
158 if (pin)
159 return 1;
160
161 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
162 u8 pos;
163 u16 flags;
164
165 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
166 if (pos) {
167 pci_read_config_word(vdev->pdev,
168 pos + PCI_MSI_FLAGS, &flags);
169
170 return 1 << (flags & PCI_MSI_FLAGS_QMASK);
171 }
172 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
173 u8 pos;
174 u16 flags;
175
176 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
177 if (pos) {
178 pci_read_config_word(vdev->pdev,
179 pos + PCI_MSIX_FLAGS, &flags);
180
181 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
182 }
183 }
184
185 return 0;
186}
187
188static long vfio_pci_ioctl(void *device_data,
189 unsigned int cmd, unsigned long arg)
190{
191 struct vfio_pci_device *vdev = device_data;
192 unsigned long minsz;
193
194 if (cmd == VFIO_DEVICE_GET_INFO) {
195 struct vfio_device_info info;
196
197 minsz = offsetofend(struct vfio_device_info, num_irqs);
198
199 if (copy_from_user(&info, (void __user *)arg, minsz))
200 return -EFAULT;
201
202 if (info.argsz < minsz)
203 return -EINVAL;
204
205 info.flags = VFIO_DEVICE_FLAGS_PCI;
206
207 if (vdev->reset_works)
208 info.flags |= VFIO_DEVICE_FLAGS_RESET;
209
210 info.num_regions = VFIO_PCI_NUM_REGIONS;
211 info.num_irqs = VFIO_PCI_NUM_IRQS;
212
213 return copy_to_user((void __user *)arg, &info, minsz);
214
215 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
216 struct pci_dev *pdev = vdev->pdev;
217 struct vfio_region_info info;
218
219 minsz = offsetofend(struct vfio_region_info, offset);
220
221 if (copy_from_user(&info, (void __user *)arg, minsz))
222 return -EFAULT;
223
224 if (info.argsz < minsz)
225 return -EINVAL;
226
227 switch (info.index) {
228 case VFIO_PCI_CONFIG_REGION_INDEX:
229 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
230 info.size = pdev->cfg_size;
231 info.flags = VFIO_REGION_INFO_FLAG_READ |
232 VFIO_REGION_INFO_FLAG_WRITE;
233 break;
234 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
235 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
236 info.size = pci_resource_len(pdev, info.index);
237 if (!info.size) {
238 info.flags = 0;
239 break;
240 }
241
242 info.flags = VFIO_REGION_INFO_FLAG_READ |
243 VFIO_REGION_INFO_FLAG_WRITE;
244 if (pci_resource_flags(pdev, info.index) &
245 IORESOURCE_MEM && info.size >= PAGE_SIZE)
246 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
247 break;
248 case VFIO_PCI_ROM_REGION_INDEX:
249 {
250 void __iomem *io;
251 size_t size;
252
253 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
254 info.flags = 0;
255
256 /* Report the BAR size, not the ROM size */
257 info.size = pci_resource_len(pdev, info.index);
258 if (!info.size)
259 break;
260
261 /* Is it really there? */
262 io = pci_map_rom(pdev, &size);
263 if (!io || !size) {
264 info.size = 0;
265 break;
266 }
267 pci_unmap_rom(pdev, io);
268
269 info.flags = VFIO_REGION_INFO_FLAG_READ;
270 break;
271 }
272 default:
273 return -EINVAL;
274 }
275
276 return copy_to_user((void __user *)arg, &info, minsz);
277
278 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
279 struct vfio_irq_info info;
280
281 minsz = offsetofend(struct vfio_irq_info, count);
282
283 if (copy_from_user(&info, (void __user *)arg, minsz))
284 return -EFAULT;
285
286 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
287 return -EINVAL;
288
289 info.flags = VFIO_IRQ_INFO_EVENTFD;
290
291 info.count = vfio_pci_get_irq_count(vdev, info.index);
292
293 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
294 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
295 VFIO_IRQ_INFO_AUTOMASKED);
296 else
297 info.flags |= VFIO_IRQ_INFO_NORESIZE;
298
299 return copy_to_user((void __user *)arg, &info, minsz);
300
301 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
302 struct vfio_irq_set hdr;
303 u8 *data = NULL;
304 int ret = 0;
305
306 minsz = offsetofend(struct vfio_irq_set, count);
307
308 if (copy_from_user(&hdr, (void __user *)arg, minsz))
309 return -EFAULT;
310
311 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
312 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
313 VFIO_IRQ_SET_ACTION_TYPE_MASK))
314 return -EINVAL;
315
316 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
317 size_t size;
318
319 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
320 size = sizeof(uint8_t);
321 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
322 size = sizeof(int32_t);
323 else
324 return -EINVAL;
325
326 if (hdr.argsz - minsz < hdr.count * size ||
327 hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
328 return -EINVAL;
329
330 data = kmalloc(hdr.count * size, GFP_KERNEL);
331 if (!data)
332 return -ENOMEM;
333
334 if (copy_from_user(data, (void __user *)(arg + minsz),
335 hdr.count * size)) {
336 kfree(data);
337 return -EFAULT;
338 }
339 }
340
341 mutex_lock(&vdev->igate);
342
343 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
344 hdr.start, hdr.count, data);
345
346 mutex_unlock(&vdev->igate);
347 kfree(data);
348
349 return ret;
350
351 } else if (cmd == VFIO_DEVICE_RESET)
352 return vdev->reset_works ?
353 pci_reset_function(vdev->pdev) : -EINVAL;
354
355 return -ENOTTY;
356}
357
358static ssize_t vfio_pci_read(void *device_data, char __user *buf,
359 size_t count, loff_t *ppos)
360{
361 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
362 struct vfio_pci_device *vdev = device_data;
363 struct pci_dev *pdev = vdev->pdev;
364
365 if (index >= VFIO_PCI_NUM_REGIONS)
366 return -EINVAL;
367
368 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
369 return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
370 else if (index == VFIO_PCI_ROM_REGION_INDEX)
371 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
372 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
373 return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
374 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
375 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
376
377 return -EINVAL;
378}
379
380static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
381 size_t count, loff_t *ppos)
382{
383 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
384 struct vfio_pci_device *vdev = device_data;
385 struct pci_dev *pdev = vdev->pdev;
386
387 if (index >= VFIO_PCI_NUM_REGIONS)
388 return -EINVAL;
389
390 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
391 return vfio_pci_config_readwrite(vdev, (char __user *)buf,
392 count, ppos, true);
393 else if (index == VFIO_PCI_ROM_REGION_INDEX)
394 return -EINVAL;
395 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
396 return vfio_pci_io_readwrite(vdev, (char __user *)buf,
397 count, ppos, true);
398 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
399 return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
400 count, ppos, true);
401 }
402
403 return -EINVAL;
404}
405
406static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
407{
408 struct vfio_pci_device *vdev = device_data;
409 struct pci_dev *pdev = vdev->pdev;
410 unsigned int index;
411 u64 phys_len, req_len, pgoff, req_start, phys;
412 int ret;
413
414 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
415
416 if (vma->vm_end < vma->vm_start)
417 return -EINVAL;
418 if ((vma->vm_flags & VM_SHARED) == 0)
419 return -EINVAL;
420 if (index >= VFIO_PCI_ROM_REGION_INDEX)
421 return -EINVAL;
422 if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
423 return -EINVAL;
424
425 phys_len = pci_resource_len(pdev, index);
426 req_len = vma->vm_end - vma->vm_start;
427 pgoff = vma->vm_pgoff &
428 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
429 req_start = pgoff << PAGE_SHIFT;
430
431 if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
432 return -EINVAL;
433
434 if (index == vdev->msix_bar) {
435 /*
436 * Disallow mmaps overlapping the MSI-X table; users don't
437 * get to touch this directly. We could find somewhere
438 * else to map the overlap, but page granularity is only
439 * a recommendation, not a requirement, so the user needs
440 * to know which bits are real. Requiring them to mmap
441 * around the table makes that clear.
442 */
443
444 /* If neither entirely above nor below, then it overlaps */
445 if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
446 req_start + req_len <= vdev->msix_offset))
447 return -EINVAL;
448 }
449
450 /*
451 * Even though we don't make use of the barmap for the mmap,
452 * we need to request the region and the barmap tracks that.
453 */
454 if (!vdev->barmap[index]) {
455 ret = pci_request_selected_regions(pdev,
456 1 << index, "vfio-pci");
457 if (ret)
458 return ret;
459
460 vdev->barmap[index] = pci_iomap(pdev, index, 0);
461 }
462
463 vma->vm_private_data = vdev;
464 vma->vm_flags |= (VM_IO | VM_RESERVED);
465 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
466
467 phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
468
469 return remap_pfn_range(vma, vma->vm_start, phys,
470 req_len, vma->vm_page_prot);
471}
472
/* File operations the VFIO core vectors to this driver per device */
static const struct vfio_device_ops vfio_pci_ops = {
	.name = "vfio-pci",
	.open = vfio_pci_open,
	.release = vfio_pci_release,
	.ioctl = vfio_pci_ioctl,
	.read = vfio_pci_read,
	.write = vfio_pci_write,
	.mmap = vfio_pci_mmap,
};
482
483static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
484{
485 u8 type;
486 struct vfio_pci_device *vdev;
487 struct iommu_group *group;
488 int ret;
489
490 pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
491 if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
492 return -EINVAL;
493
494 group = iommu_group_get(&pdev->dev);
495 if (!group)
496 return -EINVAL;
497
498 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
499 if (!vdev) {
500 iommu_group_put(group);
501 return -ENOMEM;
502 }
503
504 vdev->pdev = pdev;
505 vdev->irq_type = VFIO_PCI_NUM_IRQS;
506 mutex_init(&vdev->igate);
507 spin_lock_init(&vdev->irqlock);
508 atomic_set(&vdev->refcnt, 0);
509
510 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
511 if (ret) {
512 iommu_group_put(group);
513 kfree(vdev);
514 }
515
516 return ret;
517}
518
519static void vfio_pci_remove(struct pci_dev *pdev)
520{
521 struct vfio_pci_device *vdev;
522
523 vdev = vfio_del_group_dev(&pdev->dev);
524 if (!vdev)
525 return;
526
527 iommu_group_put(pdev->dev.iommu_group);
528 kfree(vdev);
529}
530
/* Devices are bound only via dynamic IDs (sysfs new_id), never statically */
static struct pci_driver vfio_pci_driver = {
	.name = "vfio-pci",
	.id_table = NULL, /* only dynamic ids */
	.probe = vfio_pci_probe,
	.remove = vfio_pci_remove,
};
537
/* Module exit: undo vfio_pci_init() in reverse order */
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_virqfd_exit();
	vfio_pci_uninit_perm_bits();
}
544
545static int __init vfio_pci_init(void)
546{
547 int ret;
548
549 /* Allocate shared config space permision data used by all devices */
550 ret = vfio_pci_init_perm_bits();
551 if (ret)
552 return ret;
553
554 /* Start the virqfd cleanup handler */
555 ret = vfio_pci_virqfd_init();
556 if (ret)
557 goto out_virqfd;
558
559 /* Register and scan for devices */
560 ret = pci_register_driver(&vfio_pci_driver);
561 if (ret)
562 goto out_driver;
563
564 return 0;
565
566out_virqfd:
567 vfio_pci_virqfd_exit();
568out_driver:
569 vfio_pci_uninit_perm_bits();
570 return ret;
571}
572
/* Standard module entry/exit hookup and metadata */
module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);