aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/vfio
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-05-02 17:02:32 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-05-02 17:02:32 -0400
commit0b2e3b6bb4a415379f16e38fc92db42379be47a1 (patch)
treeac6af620793ecd5e4b1d5523e0f431d2d8a5ef66 /drivers/vfio
parente95893004104054d49406fd108fefa3ddc054366 (diff)
parent664e9386bd05dbdfecfb28d6cf2fde983aabc65c (diff)
Merge tag 'vfio-for-v3.10' of git://github.com/awilliam/linux-vfio
Pull vfio updates from Alex Williamson: "Changes include extension to support PCI AER notification to userspace, byte granularity of PCI config space and access to unarchitected PCI config space, better protection around IOMMU driver accesses, default file mode fix, and a few misc cleanups." * tag 'vfio-for-v3.10' of git://github.com/awilliam/linux-vfio: vfio: Set container device mode vfio: Use down_reads to protect iommu disconnects vfio: Convert container->group_lock to rwsem PCI/VFIO: use pcie_flags_reg instead of access PCI-E Capabilities Register vfio-pci: Enable raw access to unassigned config space vfio-pci: Use byte granularity in config map vfio: make local function vfio_pci_intx_unmask_handler() static VFIO-AER: Vfio-pci driver changes for supporting AER VFIO: Wrapper for getting reference to vfio_device
Diffstat (limited to 'drivers/vfio')
-rw-r--r--drivers/vfio/pci/vfio_pci.c44
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c172
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c67
-rw-r--r--drivers/vfio/pci/vfio_pci_private.h1
-rw-r--r--drivers/vfio/vfio.c117
5 files changed, 294 insertions, 107 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 09d2e3ffd6fc..ac3725440d64 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
201 201
202 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 202 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
203 } 203 }
204 } 204 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
205 if (pci_is_pcie(vdev->pdev))
206 return 1;
205 207
206 return 0; 208 return 0;
207} 209}
@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data,
317 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 319 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
318 return -EINVAL; 320 return -EINVAL;
319 321
322 switch (info.index) {
323 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
324 break;
325 case VFIO_PCI_ERR_IRQ_INDEX:
326 if (pci_is_pcie(vdev->pdev))
327 break;
328 /* pass thru to return error */
329 default:
330 return -EINVAL;
331 }
332
320 info.flags = VFIO_IRQ_INFO_EVENTFD; 333 info.flags = VFIO_IRQ_INFO_EVENTFD;
321 334
322 info.count = vfio_pci_get_irq_count(vdev, info.index); 335 info.count = vfio_pci_get_irq_count(vdev, info.index);
@@ -552,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev)
552 kfree(vdev); 565 kfree(vdev);
553} 566}
554 567
568static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
569 pci_channel_state_t state)
570{
571 struct vfio_pci_device *vdev;
572 struct vfio_device *device;
573
574 device = vfio_device_get_from_dev(&pdev->dev);
575 if (device == NULL)
576 return PCI_ERS_RESULT_DISCONNECT;
577
578 vdev = vfio_device_data(device);
579 if (vdev == NULL) {
580 vfio_device_put(device);
581 return PCI_ERS_RESULT_DISCONNECT;
582 }
583
584 if (vdev->err_trigger)
585 eventfd_signal(vdev->err_trigger, 1);
586
587 vfio_device_put(device);
588
589 return PCI_ERS_RESULT_CAN_RECOVER;
590}
591
592static struct pci_error_handlers vfio_err_handlers = {
593 .error_detected = vfio_pci_aer_err_detected,
594};
595
555static struct pci_driver vfio_pci_driver = { 596static struct pci_driver vfio_pci_driver = {
556 .name = "vfio-pci", 597 .name = "vfio-pci",
557 .id_table = NULL, /* only dynamic ids */ 598 .id_table = NULL, /* only dynamic ids */
558 .probe = vfio_pci_probe, 599 .probe = vfio_pci_probe,
559 .remove = vfio_pci_remove, 600 .remove = vfio_pci_remove,
601 .err_handler = &vfio_err_handlers,
560}; 602};
561 603
562static void __exit vfio_pci_cleanup(void) 604static void __exit vfio_pci_cleanup(void)
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index aeb00fc2d3be..affa34745be9 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -274,9 +274,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
274 return count; 274 return count;
275} 275}
276 276
277static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, 277/* Raw access skips any kind of virtualization */
278 int count, struct perm_bits *perm, 278static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos,
279 int offset, __le32 val) 279 int count, struct perm_bits *perm,
280 int offset, __le32 val)
280{ 281{
281 int ret; 282 int ret;
282 283
@@ -287,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
287 return count; 288 return count;
288} 289}
289 290
290/* Default all regions to read-only, no-virtualization */ 291static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
292 int count, struct perm_bits *perm,
293 int offset, __le32 *val)
294{
295 int ret;
296
297 ret = vfio_user_config_read(vdev->pdev, pos, val, count);
298 if (ret)
299 return pcibios_err_to_errno(ret);
300
301 return count;
302}
303
304/* Default capability regions to read-only, no-virtualization */
291static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { 305static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
292 [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } 306 [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
293}; 307};
294static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { 308static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
295 [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } 309 [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
296}; 310};
311/*
312 * Default unassigned regions to raw read-write access. Some devices
313 * require this to function as they hide registers between the gaps in
314 * config space (be2net). Like MMIO and I/O port registers, we have
315 * to trust the hardware isolation.
316 */
317static struct perm_bits unassigned_perms = {
318 .readfn = vfio_raw_config_read,
319 .writefn = vfio_raw_config_write
320};
297 321
298static void free_perm_bits(struct perm_bits *perm) 322static void free_perm_bits(struct perm_bits *perm)
299{ 323{
@@ -779,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void)
779 803
780 /* Capabilities */ 804 /* Capabilities */
781 ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); 805 ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
782 cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; 806 cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write;
783 ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); 807 ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
784 cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; 808 cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
785 ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); 809 ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
786 ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); 810 ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
787 811
788 /* Extended capabilities */ 812 /* Extended capabilities */
789 ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); 813 ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
790 ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); 814 ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
791 ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; 815 ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
792 816
793 if (ret) 817 if (ret)
794 vfio_pci_uninit_perm_bits(); 818 vfio_pci_uninit_perm_bits();
@@ -801,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
801 u8 cap; 825 u8 cap;
802 int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : 826 int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
803 PCI_STD_HEADER_SIZEOF; 827 PCI_STD_HEADER_SIZEOF;
804 base /= 4;
805 pos /= 4;
806
807 cap = vdev->pci_config_map[pos]; 828 cap = vdev->pci_config_map[pos];
808 829
809 if (cap == PCI_CAP_ID_BASIC) 830 if (cap == PCI_CAP_ID_BASIC)
@@ -813,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
813 while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) 834 while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
814 pos--; 835 pos--;
815 836
816 return pos * 4; 837 return pos;
817} 838}
818 839
819static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, 840static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
@@ -1017,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
1017 return byte; 1038 return byte;
1018 case PCI_CAP_ID_EXP: 1039 case PCI_CAP_ID_EXP:
1019 /* length based on version */ 1040 /* length based on version */
1020 ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
1021 if (ret)
1022 return pcibios_err_to_errno(ret);
1023
1024 vdev->extended_caps = true; 1041 vdev->extended_caps = true;
1025 1042
1026 if ((word & PCI_EXP_FLAGS_VERS) == 1) 1043 if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
1027 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; 1044 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
1028 else 1045 else
1029 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; 1046 return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
@@ -1230,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
1230 } 1247 }
1231 1248
1232 /* Sanity check, do we overlap other capabilities? */ 1249 /* Sanity check, do we overlap other capabilities? */
1233 for (i = 0; i < len; i += 4) { 1250 for (i = 0; i < len; i++) {
1234 if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) 1251 if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
1235 continue; 1252 continue;
1236 1253
1237 pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", 1254 pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
@@ -1239,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
1239 pos + i, map[pos + i], cap); 1256 pos + i, map[pos + i], cap);
1240 } 1257 }
1241 1258
1242 memset(map + (pos / 4), cap, len / 4); 1259 memset(map + pos, cap, len);
1243 ret = vfio_fill_vconfig_bytes(vdev, pos, len); 1260 ret = vfio_fill_vconfig_bytes(vdev, pos, len);
1244 if (ret) 1261 if (ret)
1245 return ret; 1262 return ret;
@@ -1314,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
1314 hidden = true; 1331 hidden = true;
1315 } 1332 }
1316 1333
1317 for (i = 0; i < len; i += 4) { 1334 for (i = 0; i < len; i++) {
1318 if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) 1335 if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
1319 continue; 1336 continue;
1320 1337
1321 pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", 1338 pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
@@ -1330,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
1330 */ 1347 */
1331 BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); 1348 BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
1332 1349
1333 memset(map + (epos / 4), ecap, len / 4); 1350 memset(map + epos, ecap, len);
1334 ret = vfio_fill_vconfig_bytes(vdev, epos, len); 1351 ret = vfio_fill_vconfig_bytes(vdev, epos, len);
1335 if (ret) 1352 if (ret)
1336 return ret; 1353 return ret;
@@ -1377,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev)
1377 int ret; 1394 int ret;
1378 1395
1379 /* 1396 /*
1380 * Config space, caps and ecaps are all dword aligned, so we can 1397 * Config space, caps and ecaps are all dword aligned, so we could
1381 * use one byte per dword to record the type. 1398 * use one byte per dword to record the type. However, there are
1399 * no requirements on the length of a capability, so the gap between
1400 * capabilities needs byte granularity.
1382 */ 1401 */
1383 map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); 1402 map = kmalloc(pdev->cfg_size, GFP_KERNEL);
1384 if (!map) 1403 if (!map)
1385 return -ENOMEM; 1404 return -ENOMEM;
1386 1405
@@ -1393,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev)
1393 vdev->pci_config_map = map; 1412 vdev->pci_config_map = map;
1394 vdev->vconfig = vconfig; 1413 vdev->vconfig = vconfig;
1395 1414
1396 memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); 1415 memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
1397 memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, 1416 memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
1398 (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); 1417 pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
1399 1418
1400 ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); 1419 ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
1401 if (ret) 1420 if (ret)
@@ -1450,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev)
1450 vdev->msi_perm = NULL; 1469 vdev->msi_perm = NULL;
1451} 1470}
1452 1471
1472/*
1473 * Find the remaining number of bytes in a dword that match the given
1474 * position. Stop at either the end of the capability or the dword boundary.
1475 */
1476static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
1477 loff_t pos)
1478{
1479 u8 cap = vdev->pci_config_map[pos];
1480 size_t i;
1481
1482 for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++)
1483 /* nop */;
1484
1485 return i;
1486}
1487
1453static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, 1488static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
1454 size_t count, loff_t *ppos, bool iswrite) 1489 size_t count, loff_t *ppos, bool iswrite)
1455{ 1490{
@@ -1458,55 +1493,48 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
1458 __le32 val = 0; 1493 __le32 val = 0;
1459 int cap_start = 0, offset; 1494 int cap_start = 0, offset;
1460 u8 cap_id; 1495 u8 cap_id;
1461 ssize_t ret = count; 1496 ssize_t ret;
1462 1497
1463 if (*ppos < 0 || *ppos + count > pdev->cfg_size) 1498 if (*ppos < 0 || *ppos >= pdev->cfg_size ||
1499 *ppos + count > pdev->cfg_size)
1464 return -EFAULT; 1500 return -EFAULT;
1465 1501
1466 /* 1502 /*
1467 * gcc can't seem to figure out we're a static function, only called 1503 * Chop accesses into aligned chunks containing no more than a
1468 * with count of 1/2/4 and hits copy_from_user_overflow without this. 1504 * single capability. Caller increments to the next chunk.
1469 */ 1505 */
1470 if (count > sizeof(val)) 1506 count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
1471 return -EINVAL; 1507 if (count >= 4 && !(*ppos % 4))
1472 1508 count = 4;
1473 cap_id = vdev->pci_config_map[*ppos / 4]; 1509 else if (count >= 2 && !(*ppos % 2))
1474 1510 count = 2;
1475 if (cap_id == PCI_CAP_ID_INVALID) { 1511 else
1476 if (iswrite) 1512 count = 1;
1477 return ret; /* drop */
1478
1479 /*
1480 * Per PCI spec 3.0, section 6.1, reads from reserved and
1481 * unimplemented registers return 0
1482 */
1483 if (copy_to_user(buf, &val, count))
1484 return -EFAULT;
1485
1486 return ret;
1487 }
1488 1513
1489 /* 1514 ret = count;
1490 * All capabilities are minimum 4 bytes and aligned on dword
1491 * boundaries. Since we don't support unaligned accesses, we're
1492 * only ever accessing a single capability.
1493 */
1494 if (*ppos >= PCI_CFG_SPACE_SIZE) {
1495 WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
1496 1515
1497 perm = &ecap_perms[cap_id]; 1516 cap_id = vdev->pci_config_map[*ppos];
1498 cap_start = vfio_find_cap_start(vdev, *ppos);
1499 1517
1518 if (cap_id == PCI_CAP_ID_INVALID) {
1519 perm = &unassigned_perms;
1520 cap_start = *ppos;
1500 } else { 1521 } else {
1501 WARN_ON(cap_id > PCI_CAP_ID_MAX); 1522 if (*ppos >= PCI_CFG_SPACE_SIZE) {
1523 WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
1502 1524
1503 perm = &cap_perms[cap_id]; 1525 perm = &ecap_perms[cap_id];
1526 cap_start = vfio_find_cap_start(vdev, *ppos);
1527 } else {
1528 WARN_ON(cap_id > PCI_CAP_ID_MAX);
1504 1529
1505 if (cap_id == PCI_CAP_ID_MSI) 1530 perm = &cap_perms[cap_id];
1506 perm = vdev->msi_perm;
1507 1531
1508 if (cap_id > PCI_CAP_ID_BASIC) 1532 if (cap_id == PCI_CAP_ID_MSI)
1509 cap_start = vfio_find_cap_start(vdev, *ppos); 1533 perm = vdev->msi_perm;
1534
1535 if (cap_id > PCI_CAP_ID_BASIC)
1536 cap_start = vfio_find_cap_start(vdev, *ppos);
1537 }
1510 } 1538 }
1511 1539
1512 WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); 1540 WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
@@ -1546,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
1546 1574
1547 pos &= VFIO_PCI_OFFSET_MASK; 1575 pos &= VFIO_PCI_OFFSET_MASK;
1548 1576
1549 /*
1550 * We want to both keep the access size the caller users as well as
1551 * support reading large chunks of config space in a single call.
1552 * PCI doesn't support unaligned accesses, so we can safely break
1553 * those apart.
1554 */
1555 while (count) { 1577 while (count) {
1556 if (count >= 4 && !(pos % 4)) 1578 ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
1557 ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
1558 else if (count >= 2 && !(pos % 2))
1559 ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
1560 else
1561 ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
1562
1563 if (ret < 0) 1579 if (ret < 0)
1564 return ret; 1580 return ret;
1565 1581
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index a96509187deb..4bc704e1b7c7 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -287,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
287 * a signal is necessary, which can then be handled via a work queue 287 * a signal is necessary, which can then be handled via a work queue
288 * or directly depending on the caller. 288 * or directly depending on the caller.
289 */ 289 */
290int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) 290static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
291 void *unused)
291{ 292{
292 struct pci_dev *pdev = vdev->pdev; 293 struct pci_dev *pdev = vdev->pdev;
293 unsigned long flags; 294 unsigned long flags;
@@ -746,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
746 return 0; 747 return 0;
747} 748}
748 749
750static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
751 unsigned index, unsigned start,
752 unsigned count, uint32_t flags, void *data)
753{
754 int32_t fd = *(int32_t *)data;
755 struct pci_dev *pdev = vdev->pdev;
756
757 if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
758 !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
759 return -EINVAL;
760
761 /*
762 * device_lock synchronizes setting and checking of
763 * err_trigger. The vfio_pci_aer_err_detected() is also
764 * called with device_lock held.
765 */
766
767 /* DATA_NONE/DATA_BOOL enables loopback testing */
768
769 if (flags & VFIO_IRQ_SET_DATA_NONE) {
770 device_lock(&pdev->dev);
771 if (vdev->err_trigger)
772 eventfd_signal(vdev->err_trigger, 1);
773 device_unlock(&pdev->dev);
774 return 0;
775 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
776 uint8_t trigger = *(uint8_t *)data;
777 device_lock(&pdev->dev);
778 if (trigger && vdev->err_trigger)
779 eventfd_signal(vdev->err_trigger, 1);
780 device_unlock(&pdev->dev);
781 return 0;
782 }
783
784 /* Handle SET_DATA_EVENTFD */
785
786 if (fd == -1) {
787 device_lock(&pdev->dev);
788 if (vdev->err_trigger)
789 eventfd_ctx_put(vdev->err_trigger);
790 vdev->err_trigger = NULL;
791 device_unlock(&pdev->dev);
792 return 0;
793 } else if (fd >= 0) {
794 struct eventfd_ctx *efdctx;
795 efdctx = eventfd_ctx_fdget(fd);
796 if (IS_ERR(efdctx))
797 return PTR_ERR(efdctx);
798 device_lock(&pdev->dev);
799 if (vdev->err_trigger)
800 eventfd_ctx_put(vdev->err_trigger);
801 vdev->err_trigger = efdctx;
802 device_unlock(&pdev->dev);
803 return 0;
804 } else
805 return -EINVAL;
806}
749int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, 807int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
750 unsigned index, unsigned start, unsigned count, 808 unsigned index, unsigned start, unsigned count,
751 void *data) 809 void *data)
@@ -780,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
780 break; 838 break;
781 } 839 }
782 break; 840 break;
841 case VFIO_PCI_ERR_IRQ_INDEX:
842 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
843 case VFIO_IRQ_SET_ACTION_TRIGGER:
844 if (pci_is_pcie(vdev->pdev))
845 func = vfio_pci_set_err_trigger;
846 break;
847 }
783 } 848 }
784 849
785 if (!func) 850 if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index d7e55d03f49e..9c6d5d0f3b02 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -56,6 +56,7 @@ struct vfio_pci_device {
56 bool has_vga; 56 bool has_vga;
57 struct pci_saved_state *pci_saved_state; 57 struct pci_saved_state *pci_saved_state;
58 atomic_t refcnt; 58 atomic_t refcnt;
59 struct eventfd_ctx *err_trigger;
59}; 60};
60 61
61#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) 62#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index fcc12f3e60a3..acb7121a9316 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -24,8 +24,10 @@
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/rwsem.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/stat.h>
29#include <linux/string.h> 31#include <linux/string.h>
30#include <linux/uaccess.h> 32#include <linux/uaccess.h>
31#include <linux/vfio.h> 33#include <linux/vfio.h>
@@ -57,7 +59,7 @@ struct vfio_iommu_driver {
57struct vfio_container { 59struct vfio_container {
58 struct kref kref; 60 struct kref kref;
59 struct list_head group_list; 61 struct list_head group_list;
60 struct mutex group_lock; 62 struct rw_semaphore group_lock;
61 struct vfio_iommu_driver *iommu_driver; 63 struct vfio_iommu_driver *iommu_driver;
62 void *iommu_data; 64 void *iommu_data;
63}; 65};
@@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref)
392} 394}
393 395
394/* Device reference always implies a group reference */ 396/* Device reference always implies a group reference */
395static void vfio_device_put(struct vfio_device *device) 397void vfio_device_put(struct vfio_device *device)
396{ 398{
397 struct vfio_group *group = device->group; 399 struct vfio_group *group = device->group;
398 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); 400 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
399 vfio_group_put(group); 401 vfio_group_put(group);
400} 402}
403EXPORT_SYMBOL_GPL(vfio_device_put);
401 404
402static void vfio_device_get(struct vfio_device *device) 405static void vfio_device_get(struct vfio_device *device)
403{ 406{
@@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev,
627} 630}
628EXPORT_SYMBOL_GPL(vfio_add_group_dev); 631EXPORT_SYMBOL_GPL(vfio_add_group_dev);
629 632
633/**
634 * Get a reference to the vfio_device for a device that is known to
635 * be bound to a vfio driver. The driver implicitly holds a
636 * vfio_device reference between vfio_add_group_dev and
637 * vfio_del_group_dev. We can therefore use drvdata to increment
638 * that reference from the struct device. This additional
639 * reference must be released by calling vfio_device_put.
640 */
641struct vfio_device *vfio_device_get_from_dev(struct device *dev)
642{
643 struct vfio_device *device = dev_get_drvdata(dev);
644
645 vfio_device_get(device);
646
647 return device;
648}
649EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
650
651/*
652 * Caller must hold a reference to the vfio_device
653 */
654void *vfio_device_data(struct vfio_device *device)
655{
656 return device->device_data;
657}
658EXPORT_SYMBOL_GPL(vfio_device_data);
659
630/* Given a referenced group, check if it contains the device */ 660/* Given a referenced group, check if it contains the device */
631static bool vfio_dev_present(struct vfio_group *group, struct device *dev) 661static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
632{ 662{
@@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev);
675static long vfio_ioctl_check_extension(struct vfio_container *container, 705static long vfio_ioctl_check_extension(struct vfio_container *container,
676 unsigned long arg) 706 unsigned long arg)
677{ 707{
678 struct vfio_iommu_driver *driver = container->iommu_driver; 708 struct vfio_iommu_driver *driver;
679 long ret = 0; 709 long ret = 0;
680 710
711 down_read(&container->group_lock);
712
713 driver = container->iommu_driver;
714
681 switch (arg) { 715 switch (arg) {
682 /* No base extensions yet */ 716 /* No base extensions yet */
683 default: 717 default:
@@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container,
707 VFIO_CHECK_EXTENSION, arg); 741 VFIO_CHECK_EXTENSION, arg);
708 } 742 }
709 743
744 up_read(&container->group_lock);
745
710 return ret; 746 return ret;
711} 747}
712 748
713/* hold container->group_lock */ 749/* hold write lock on container->group_lock */
714static int __vfio_container_attach_groups(struct vfio_container *container, 750static int __vfio_container_attach_groups(struct vfio_container *container,
715 struct vfio_iommu_driver *driver, 751 struct vfio_iommu_driver *driver,
716 void *data) 752 void *data)
@@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
741 struct vfio_iommu_driver *driver; 777 struct vfio_iommu_driver *driver;
742 long ret = -ENODEV; 778 long ret = -ENODEV;
743 779
744 mutex_lock(&container->group_lock); 780 down_write(&container->group_lock);
745 781
746 /* 782 /*
747 * The container is designed to be an unprivileged interface while 783 * The container is designed to be an unprivileged interface while
@@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
752 * the container is deprivileged and returns to an unset state. 788 * the container is deprivileged and returns to an unset state.
753 */ 789 */
754 if (list_empty(&container->group_list) || container->iommu_driver) { 790 if (list_empty(&container->group_list) || container->iommu_driver) {
755 mutex_unlock(&container->group_lock); 791 up_write(&container->group_lock);
756 return -EINVAL; 792 return -EINVAL;
757 } 793 }
758 794
@@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
799 835
800 mutex_unlock(&vfio.iommu_drivers_lock); 836 mutex_unlock(&vfio.iommu_drivers_lock);
801skip_drivers_unlock: 837skip_drivers_unlock:
802 mutex_unlock(&container->group_lock); 838 up_write(&container->group_lock);
803 839
804 return ret; 840 return ret;
805} 841}
@@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep,
815 if (!container) 851 if (!container)
816 return ret; 852 return ret;
817 853
818 driver = container->iommu_driver;
819 data = container->iommu_data;
820
821 switch (cmd) { 854 switch (cmd) {
822 case VFIO_GET_API_VERSION: 855 case VFIO_GET_API_VERSION:
823 ret = VFIO_API_VERSION; 856 ret = VFIO_API_VERSION;
@@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep,
829 ret = vfio_ioctl_set_iommu(container, arg); 862 ret = vfio_ioctl_set_iommu(container, arg);
830 break; 863 break;
831 default: 864 default:
865 down_read(&container->group_lock);
866
867 driver = container->iommu_driver;
868 data = container->iommu_data;
869
832 if (driver) /* passthrough all unrecognized ioctls */ 870 if (driver) /* passthrough all unrecognized ioctls */
833 ret = driver->ops->ioctl(data, cmd, arg); 871 ret = driver->ops->ioctl(data, cmd, arg);
872
873 up_read(&container->group_lock);
834 } 874 }
835 875
836 return ret; 876 return ret;
@@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
854 return -ENOMEM; 894 return -ENOMEM;
855 895
856 INIT_LIST_HEAD(&container->group_list); 896 INIT_LIST_HEAD(&container->group_list);
857 mutex_init(&container->group_lock); 897 init_rwsem(&container->group_lock);
858 kref_init(&container->kref); 898 kref_init(&container->kref);
859 899
860 filep->private_data = container; 900 filep->private_data = container;
@@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
881 size_t count, loff_t *ppos) 921 size_t count, loff_t *ppos)
882{ 922{
883 struct vfio_container *container = filep->private_data; 923 struct vfio_container *container = filep->private_data;
884 struct vfio_iommu_driver *driver = container->iommu_driver; 924 struct vfio_iommu_driver *driver;
925 ssize_t ret = -EINVAL;
885 926
886 if (unlikely(!driver || !driver->ops->read)) 927 down_read(&container->group_lock);
887 return -EINVAL;
888 928
889 return driver->ops->read(container->iommu_data, buf, count, ppos); 929 driver = container->iommu_driver;
930 if (likely(driver && driver->ops->read))
931 ret = driver->ops->read(container->iommu_data,
932 buf, count, ppos);
933
934 up_read(&container->group_lock);
935
936 return ret;
890} 937}
891 938
892static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, 939static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
893 size_t count, loff_t *ppos) 940 size_t count, loff_t *ppos)
894{ 941{
895 struct vfio_container *container = filep->private_data; 942 struct vfio_container *container = filep->private_data;
896 struct vfio_iommu_driver *driver = container->iommu_driver; 943 struct vfio_iommu_driver *driver;
944 ssize_t ret = -EINVAL;
897 945
898 if (unlikely(!driver || !driver->ops->write)) 946 down_read(&container->group_lock);
899 return -EINVAL;
900 947
901 return driver->ops->write(container->iommu_data, buf, count, ppos); 948 driver = container->iommu_driver;
949 if (likely(driver && driver->ops->write))
950 ret = driver->ops->write(container->iommu_data,
951 buf, count, ppos);
952
953 up_read(&container->group_lock);
954
955 return ret;
902} 956}
903 957
904static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) 958static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
905{ 959{
906 struct vfio_container *container = filep->private_data; 960 struct vfio_container *container = filep->private_data;
907 struct vfio_iommu_driver *driver = container->iommu_driver; 961 struct vfio_iommu_driver *driver;
962 int ret = -EINVAL;
908 963
909 if (unlikely(!driver || !driver->ops->mmap)) 964 down_read(&container->group_lock);
910 return -EINVAL;
911 965
912 return driver->ops->mmap(container->iommu_data, vma); 966 driver = container->iommu_driver;
967 if (likely(driver && driver->ops->mmap))
968 ret = driver->ops->mmap(container->iommu_data, vma);
969
970 up_read(&container->group_lock);
971
972 return ret;
913} 973}
914 974
915static const struct file_operations vfio_fops = { 975static const struct file_operations vfio_fops = {
@@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
933 struct vfio_container *container = group->container; 993 struct vfio_container *container = group->container;
934 struct vfio_iommu_driver *driver; 994 struct vfio_iommu_driver *driver;
935 995
936 mutex_lock(&container->group_lock); 996 down_write(&container->group_lock);
937 997
938 driver = container->iommu_driver; 998 driver = container->iommu_driver;
939 if (driver) 999 if (driver)
@@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
951 container->iommu_data = NULL; 1011 container->iommu_data = NULL;
952 } 1012 }
953 1013
954 mutex_unlock(&container->group_lock); 1014 up_write(&container->group_lock);
955 1015
956 vfio_container_put(container); 1016 vfio_container_put(container);
957} 1017}
@@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1011 container = f.file->private_data; 1071 container = f.file->private_data;
1012 WARN_ON(!container); /* fget ensures we don't race vfio_release */ 1072 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1013 1073
1014 mutex_lock(&container->group_lock); 1074 down_write(&container->group_lock);
1015 1075
1016 driver = container->iommu_driver; 1076 driver = container->iommu_driver;
1017 if (driver) { 1077 if (driver) {
@@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1029 atomic_inc(&group->container_users); 1089 atomic_inc(&group->container_users);
1030 1090
1031unlock_out: 1091unlock_out:
1032 mutex_unlock(&container->group_lock); 1092 up_write(&container->group_lock);
1033 fdput(f); 1093 fdput(f);
1034 return ret; 1094 return ret;
1035} 1095}
@@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = {
1300 */ 1360 */
1301static char *vfio_devnode(struct device *dev, umode_t *mode) 1361static char *vfio_devnode(struct device *dev, umode_t *mode)
1302{ 1362{
1363 if (MINOR(dev->devt) == 0)
1364 *mode = S_IRUGO | S_IWUGO;
1365
1303 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); 1366 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1304} 1367}
1305 1368