-rw-r--r--  drivers/vfio/pci/vfio_pci.c          |  44
-rw-r--r--  drivers/vfio/pci/vfio_pci_config.c   | 172
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c    |  67
-rw-r--r--  drivers/vfio/pci/vfio_pci_private.h  |   1
-rw-r--r--  drivers/vfio/vfio.c                  | 117
-rw-r--r--  include/linux/vfio.h                 |   3
-rw-r--r--  include/uapi/linux/vfio.h            |   1
7 files changed, 298 insertions(+), 107 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 09d2e3ffd6fc..ac3725440d64 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
 
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
-	}
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+		if (pci_is_pcie(vdev->pdev))
+			return 1;
 
 	return 0;
 }
@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data,
 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
 			return -EINVAL;
 
+		switch (info.index) {
+		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+			break;
+		case VFIO_PCI_ERR_IRQ_INDEX:
+			if (pci_is_pcie(vdev->pdev))
+				break;
+		/* pass thru to return error */
+		default:
+			return -EINVAL;
+		}
+
 		info.flags = VFIO_IRQ_INFO_EVENTFD;
 
 		info.count = vfio_pci_get_irq_count(vdev, info.index);
@@ -552,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev)
 	kfree(vdev);
 }
 
+static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
+						  pci_channel_state_t state)
+{
+	struct vfio_pci_device *vdev;
+	struct vfio_device *device;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (device == NULL)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	vdev = vfio_device_data(device);
+	if (vdev == NULL) {
+		vfio_device_put(device);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	if (vdev->err_trigger)
+		eventfd_signal(vdev->err_trigger, 1);
+
+	vfio_device_put(device);
+
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static struct pci_error_handlers vfio_err_handlers = {
+	.error_detected = vfio_pci_aer_err_detected,
+};
+
 static struct pci_driver vfio_pci_driver = {
 	.name		= "vfio-pci",
 	.id_table	= NULL, /* only dynamic ids */
 	.probe		= vfio_pci_probe,
 	.remove		= vfio_pci_remove,
+	.err_handler	= &vfio_err_handlers,
 };
 
 static void __exit vfio_pci_cleanup(void)
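
With the driver change above, userspace can probe the new error interrupt index before relying on it. The following is a minimal userspace sketch, not part of the patch: it assumes an already-open VFIO device fd (obtained elsewhere via VFIO_GROUP_GET_DEVICE_FD) and a kernel carrying this change, and the helper name probe_err_irq is invented for illustration.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: ask whether the device exposes the AER error IRQ.
 * On a non-PCIe device or an older kernel the ioctl fails with EINVAL. */
static int probe_err_irq(int device_fd)
{
	struct vfio_irq_info info;

	memset(&info, 0, sizeof(info));
	info.argsz = sizeof(info);
	info.index = VFIO_PCI_ERR_IRQ_INDEX;

	if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &info))
		return -1;

	/* With this patch, PCIe devices report count == 1 and eventfd support */
	printf("err irq: count=%u eventfd=%d\n", info.count,
	       !!(info.flags & VFIO_IRQ_INFO_EVENTFD));
	return 0;
}
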
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index aeb00fc2d3be..affa34745be9 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -274,9 +274,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
 	return count;
 }
 
-static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
-				    int count, struct perm_bits *perm,
-				    int offset, __le32 val)
+/* Raw access skips any kind of virtualization */
+static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
 {
 	int ret;
 
@@ -287,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
 	return count;
 }
 
-/* Default all regions to read-only, no-virtualization */
+static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 *val)
+{
+	int ret;
+
+	ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	return count;
+}
+
+/* Default capability regions to read-only, no-virtualization */
 static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
 	[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
 };
 static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
 	[0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
 };
+/*
+ * Default unassigned regions to raw read-write access.  Some devices
+ * require this to function as they hide registers between the gaps in
+ * config space (be2net).  Like MMIO and I/O port registers, we have
+ * to trust the hardware isolation.
+ */
+static struct perm_bits unassigned_perms = {
+	.readfn = vfio_raw_config_read,
+	.writefn = vfio_raw_config_write
+};
 
 static void free_perm_bits(struct perm_bits *perm)
 {
@@ -779,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void)
 
 	/* Capabilities */
 	ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
-	cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
+	cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write;
 	ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
-	cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+	cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
 	ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
 	ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
 
 	/* Extended capabilities */
 	ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
 	ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
-	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
 
 	if (ret)
 		vfio_pci_uninit_perm_bits();
@@ -801,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
 	u8 cap;
 	int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
 						 PCI_STD_HEADER_SIZEOF;
-	base /= 4;
-	pos /= 4;
-
 	cap = vdev->pci_config_map[pos];
 
 	if (cap == PCI_CAP_ID_BASIC)
@@ -813,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
 	while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
 		pos--;
 
-	return pos * 4;
+	return pos;
 }
 
 static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
@@ -1017,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
 		return byte;
 	case PCI_CAP_ID_EXP:
 		/* length based on version */
-		ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
-		if (ret)
-			return pcibios_err_to_errno(ret);
-
 		vdev->extended_caps = true;
 
-		if ((word & PCI_EXP_FLAGS_VERS) == 1)
+		if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
 			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
 		else
 			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
@@ -1230,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
 	}
 
 	/* Sanity check, do we overlap other capabilities? */
-	for (i = 0; i < len; i += 4) {
-		if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID))
+	for (i = 0; i < len; i++) {
+		if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
 			continue;
 
 		pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
@@ -1239,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
 			pos + i, map[pos + i], cap);
 	}
 
-	memset(map + (pos / 4), cap, len / 4);
+	memset(map + pos, cap, len);
 	ret = vfio_fill_vconfig_bytes(vdev, pos, len);
 	if (ret)
 		return ret;
@@ -1314,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
 			hidden = true;
 		}
 
-		for (i = 0; i < len; i += 4) {
-			if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID))
+		for (i = 0; i < len; i++) {
+			if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
 				continue;
 
 			pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
@@ -1330,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
 		 */
 		BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
 
-		memset(map + (epos / 4), ecap, len / 4);
+		memset(map + epos, ecap, len);
 		ret = vfio_fill_vconfig_bytes(vdev, epos, len);
 		if (ret)
 			return ret;
@@ -1377,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev)
 	int ret;
 
 	/*
-	 * Config space, caps and ecaps are all dword aligned, so we can
-	 * use one byte per dword to record the type.
+	 * Config space, caps and ecaps are all dword aligned, so we could
+	 * use one byte per dword to record the type.  However, there are
+	 * no requirements on the length of a capability, so the gap between
+	 * capabilities needs byte granularity.
 	 */
-	map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL);
+	map = kmalloc(pdev->cfg_size, GFP_KERNEL);
 	if (!map)
 		return -ENOMEM;
 
@@ -1393,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev)
 	vdev->pci_config_map = map;
 	vdev->vconfig = vconfig;
 
-	memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4);
-	memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID,
-	       (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4);
+	memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
+	memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
+	       pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
 
 	ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
 	if (ret)
@@ -1450,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev)
 	vdev->msi_perm = NULL;
 }
 
+/*
+ * Find the remaining number of bytes in a dword that match the given
+ * position.  Stop at either the end of the capability or the dword boundary.
+ */
+static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
+					   loff_t pos)
+{
+	u8 cap = vdev->pci_config_map[pos];
+	size_t i;
+
+	for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++)
+		/* nop */;
+
+	return i;
+}
+
 static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
 				 size_t count, loff_t *ppos, bool iswrite)
 {
@@ -1458,55 +1493,48 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
 	__le32 val = 0;
 	int cap_start = 0, offset;
 	u8 cap_id;
-	ssize_t ret = count;
+	ssize_t ret;
 
-	if (*ppos < 0 || *ppos + count > pdev->cfg_size)
+	if (*ppos < 0 || *ppos >= pdev->cfg_size ||
+	    *ppos + count > pdev->cfg_size)
 		return -EFAULT;
 
 	/*
-	 * gcc can't seem to figure out we're a static function, only called
-	 * with count of 1/2/4 and hits copy_from_user_overflow without this.
+	 * Chop accesses into aligned chunks containing no more than a
+	 * single capability.  Caller increments to the next chunk.
 	 */
-	if (count > sizeof(val))
-		return -EINVAL;
-
-	cap_id = vdev->pci_config_map[*ppos / 4];
-
-	if (cap_id == PCI_CAP_ID_INVALID) {
-		if (iswrite)
-			return ret; /* drop */
-
-		/*
-		 * Per PCI spec 3.0, section 6.1, reads from reserved and
-		 * unimplemented registers return 0
-		 */
-		if (copy_to_user(buf, &val, count))
-			return -EFAULT;
-
-		return ret;
-	}
+	count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
+	if (count >= 4 && !(*ppos % 4))
+		count = 4;
+	else if (count >= 2 && !(*ppos % 2))
+		count = 2;
+	else
+		count = 1;
 
-	/*
-	 * All capabilities are minimum 4 bytes and aligned on dword
-	 * boundaries.  Since we don't support unaligned accesses, we're
-	 * only ever accessing a single capability.
-	 */
-	if (*ppos >= PCI_CFG_SPACE_SIZE) {
-		WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
+	ret = count;
 
-		perm = &ecap_perms[cap_id];
-		cap_start = vfio_find_cap_start(vdev, *ppos);
+	cap_id = vdev->pci_config_map[*ppos];
 
+	if (cap_id == PCI_CAP_ID_INVALID) {
+		perm = &unassigned_perms;
+		cap_start = *ppos;
 	} else {
-		WARN_ON(cap_id > PCI_CAP_ID_MAX);
+		if (*ppos >= PCI_CFG_SPACE_SIZE) {
+			WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
 
-		perm = &cap_perms[cap_id];
+			perm = &ecap_perms[cap_id];
+			cap_start = vfio_find_cap_start(vdev, *ppos);
+		} else {
+			WARN_ON(cap_id > PCI_CAP_ID_MAX);
 
-		if (cap_id == PCI_CAP_ID_MSI)
-			perm = vdev->msi_perm;
+			perm = &cap_perms[cap_id];
 
-		if (cap_id > PCI_CAP_ID_BASIC)
-			cap_start = vfio_find_cap_start(vdev, *ppos);
+			if (cap_id == PCI_CAP_ID_MSI)
+				perm = vdev->msi_perm;
+
+			if (cap_id > PCI_CAP_ID_BASIC)
+				cap_start = vfio_find_cap_start(vdev, *ppos);
+		}
 	}
 
 	WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
@@ -1546,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
 
 	pos &= VFIO_PCI_OFFSET_MASK;
 
-	/*
-	 * We want to both keep the access size the caller users as well as
-	 * support reading large chunks of config space in a single call.
-	 * PCI doesn't support unaligned accesses, so we can safely break
-	 * those apart.
-	 */
 	while (count) {
-		if (count >= 4 && !(pos % 4))
-			ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
-		else if (count >= 2 && !(pos % 2))
-			ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
-		else
-			ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
-
+		ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
 		if (ret < 0)
 			return ret;
 
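
The switch from a dword-granular to a byte-granular pci_config_map changes how each access is sized. The standalone sketch below restates that chunking rule in plain userspace C; it is illustrative only, not kernel code, and the function names cap_remaining_dword and next_chunk are invented, assuming a byte-per-offset capability map like the one the patch builds.

#include <stddef.h>
#include <stdint.h>

/* Count bytes from pos that belong to the same capability, stopping at the
 * next dword boundary, mirroring vfio_pci_cap_remaining_dword() above. */
static size_t cap_remaining_dword(const uint8_t *map, size_t pos)
{
	uint8_t cap = map[pos];
	size_t i;

	for (i = 1; (pos + i) % 4 && map[pos + i] == cap; i++)
		; /* keep counting matching map bytes */

	return i;
}

/* Pick the size of the next chunk the way the patched vfio_config_do_rw()
 * does: never cross a capability or dword boundary, then round down to an
 * aligned 4-, 2-, or 1-byte access. */
static size_t next_chunk(const uint8_t *map, size_t pos, size_t count)
{
	size_t n = cap_remaining_dword(map, pos);

	if (count < n)
		n = count;

	if (n >= 4 && !(pos % 4))
		return 4;
	if (n >= 2 && !(pos % 2))
		return 2;
	return 1;
}
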
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index a96509187deb..4bc704e1b7c7 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -287,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
  * a signal is necessary, which can then be handled via a work queue
  * or directly depending on the caller.
  */
-int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
+static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
+					void *unused)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	unsigned long flags;
@@ -746,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
 	return 0;
 }
 
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	int32_t fd = *(int32_t *)data;
+	struct pci_dev *pdev = vdev->pdev;
+
+	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
+	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+		return -EINVAL;
+
+	/*
+	 * device_lock synchronizes setting and checking of
+	 * err_trigger. The vfio_pci_aer_err_detected() is also
+	 * called with device_lock held.
+	 */
+
+	/* DATA_NONE/DATA_BOOL enables loopback testing */
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		device_lock(&pdev->dev);
+		if (vdev->err_trigger)
+			eventfd_signal(vdev->err_trigger, 1);
+		device_unlock(&pdev->dev);
+		return 0;
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t trigger = *(uint8_t *)data;
+		device_lock(&pdev->dev);
+		if (trigger && vdev->err_trigger)
+			eventfd_signal(vdev->err_trigger, 1);
+		device_unlock(&pdev->dev);
+		return 0;
+	}
+
+	/* Handle SET_DATA_EVENTFD */
+
+	if (fd == -1) {
+		device_lock(&pdev->dev);
+		if (vdev->err_trigger)
+			eventfd_ctx_put(vdev->err_trigger);
+		vdev->err_trigger = NULL;
+		device_unlock(&pdev->dev);
+		return 0;
+	} else if (fd >= 0) {
+		struct eventfd_ctx *efdctx;
+		efdctx = eventfd_ctx_fdget(fd);
+		if (IS_ERR(efdctx))
+			return PTR_ERR(efdctx);
+		device_lock(&pdev->dev);
+		if (vdev->err_trigger)
+			eventfd_ctx_put(vdev->err_trigger);
+		vdev->err_trigger = efdctx;
+		device_unlock(&pdev->dev);
+		return 0;
+	} else
+		return -EINVAL;
+}
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			    unsigned index, unsigned start, unsigned count,
 			    void *data)
@@ -780,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			break;
 		}
 		break;
+	case VFIO_PCI_ERR_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			if (pci_is_pcie(vdev->pdev))
+				func = vfio_pci_set_err_trigger;
+			break;
+		}
 	}
 
 	if (!func)
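
Putting the pieces together, a userspace driver or VMM would arm the new error interrupt roughly as follows. This is a hedged sketch, not code from the patch: it assumes an open VFIO device fd for a PCIe device, exercises only the VFIO_IRQ_SET_DATA_EVENTFD path, and the helper name arm_err_eventfd is invented.

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: register an eventfd on VFIO_PCI_ERR_IRQ_INDEX so the
 * AER error_detected callback added above can signal userspace. */
static int arm_err_eventfd(int device_fd)
{
	struct vfio_irq_set *set;
	size_t argsz = sizeof(*set) + sizeof(int32_t);
	int efd, ret;

	efd = eventfd(0, 0);
	if (efd < 0)
		return -1;

	set = calloc(1, argsz);
	if (!set) {
		close(efd);
		return -1;
	}

	set->argsz = argsz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_ERR_IRQ_INDEX;
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &efd, sizeof(int32_t));

	ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
	free(set);
	if (ret) {
		close(efd);
		return -1;
	}

	return efd;	/* poll()/read() this fd to observe error notifications */
}
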
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index d7e55d03f49e..9c6d5d0f3b02 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -56,6 +56,7 @@ struct vfio_pci_device {
 	bool			has_vga;
 	struct pci_saved_state	*pci_saved_state;
 	atomic_t		refcnt;
+	struct eventfd_ctx	*err_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index fcc12f3e60a3..acb7121a9316 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -24,8 +24,10 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
@@ -57,7 +59,7 @@ struct vfio_iommu_driver {
 struct vfio_container {
 	struct kref			kref;
 	struct list_head		group_list;
-	struct mutex			group_lock;
+	struct rw_semaphore		group_lock;
 	struct vfio_iommu_driver	*iommu_driver;
 	void				*iommu_data;
 };
@@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref)
 }
 
 /* Device reference always implies a group reference */
-static void vfio_device_put(struct vfio_device *device)
+void vfio_device_put(struct vfio_device *device)
 {
 	struct vfio_group *group = device->group;
 	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 	vfio_group_put(group);
 }
+EXPORT_SYMBOL_GPL(vfio_device_put);
 
 static void vfio_device_get(struct vfio_device *device)
 {
@@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 
+/**
+ * Get a reference to the vfio_device for a device that is known to
+ * be bound to a vfio driver.  The driver implicitly holds a
+ * vfio_device reference between vfio_add_group_dev and
+ * vfio_del_group_dev.  We can therefore use drvdata to increment
+ * that reference from the struct device.  This additional
+ * reference must be released by calling vfio_device_put.
+ */
+struct vfio_device *vfio_device_get_from_dev(struct device *dev)
+{
+	struct vfio_device *device = dev_get_drvdata(dev);
+
+	vfio_device_get(device);
+
+	return device;
+}
+EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
+
+/*
+ * Caller must hold a reference to the vfio_device
+ */
+void *vfio_device_data(struct vfio_device *device)
+{
+	return device->device_data;
+}
+EXPORT_SYMBOL_GPL(vfio_device_data);
+
 /* Given a referenced group, check if it contains the device */
 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 {
@@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev);
 static long vfio_ioctl_check_extension(struct vfio_container *container,
 				       unsigned long arg)
 {
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
 	long ret = 0;
 
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+
 	switch (arg) {
 		/* No base extensions yet */
 	default:
@@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container,
 							  VFIO_CHECK_EXTENSION, arg);
 	}
 
+	up_read(&container->group_lock);
+
 	return ret;
 }
 
-/* hold container->group_lock */
+/* hold write lock on container->group_lock */
 static int __vfio_container_attach_groups(struct vfio_container *container,
 					  struct vfio_iommu_driver *driver,
 					  void *data)
@@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
 	struct vfio_iommu_driver *driver;
 	long ret = -ENODEV;
 
-	mutex_lock(&container->group_lock);
+	down_write(&container->group_lock);
 
 	/*
 	 * The container is designed to be an unprivileged interface while
@@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
 	 * the container is deprivileged and returns to an unset state.
 	 */
 	if (list_empty(&container->group_list) || container->iommu_driver) {
-		mutex_unlock(&container->group_lock);
+		up_write(&container->group_lock);
 		return -EINVAL;
 	}
 
@@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
 
 	mutex_unlock(&vfio.iommu_drivers_lock);
 skip_drivers_unlock:
-	mutex_unlock(&container->group_lock);
+	up_write(&container->group_lock);
 
 	return ret;
 }
@@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep,
 	if (!container)
 		return ret;
 
-	driver = container->iommu_driver;
-	data = container->iommu_data;
-
 	switch (cmd) {
 	case VFIO_GET_API_VERSION:
 		ret = VFIO_API_VERSION;
@@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep,
 		ret = vfio_ioctl_set_iommu(container, arg);
 		break;
 	default:
+		down_read(&container->group_lock);
+
+		driver = container->iommu_driver;
+		data = container->iommu_data;
+
 		if (driver) /* passthrough all unrecognized ioctls */
 			ret = driver->ops->ioctl(data, cmd, arg);
+
+		up_read(&container->group_lock);
 	}
 
 	return ret;
@@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&container->group_list);
-	mutex_init(&container->group_lock);
+	init_rwsem(&container->group_lock);
 	kref_init(&container->kref);
 
 	filep->private_data = container;
@@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
 			       size_t count, loff_t *ppos)
 {
 	struct vfio_container *container = filep->private_data;
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
 
-	if (unlikely(!driver || !driver->ops->read))
-		return -EINVAL;
+	down_read(&container->group_lock);
 
-	return driver->ops->read(container->iommu_data, buf, count, ppos);
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->read))
+		ret = driver->ops->read(container->iommu_data,
+					buf, count, ppos);
+
+	up_read(&container->group_lock);
+
+	return ret;
 }
 
 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
 			       size_t count, loff_t *ppos)
 {
 	struct vfio_container *container = filep->private_data;
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
 
-	if (unlikely(!driver || !driver->ops->write))
-		return -EINVAL;
+	down_read(&container->group_lock);
 
-	return driver->ops->write(container->iommu_data, buf, count, ppos);
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->write))
+		ret = driver->ops->write(container->iommu_data,
+					 buf, count, ppos);
+
+	up_read(&container->group_lock);
+
+	return ret;
 }
 
 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 {
 	struct vfio_container *container = filep->private_data;
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
+	int ret = -EINVAL;
 
-	if (unlikely(!driver || !driver->ops->mmap))
-		return -EINVAL;
+	down_read(&container->group_lock);
 
-	return driver->ops->mmap(container->iommu_data, vma);
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->mmap))
+		ret = driver->ops->mmap(container->iommu_data, vma);
+
+	up_read(&container->group_lock);
+
+	return ret;
 }
 
 static const struct file_operations vfio_fops = {
@@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
 	struct vfio_container *container = group->container;
 	struct vfio_iommu_driver *driver;
 
-	mutex_lock(&container->group_lock);
+	down_write(&container->group_lock);
 
 	driver = container->iommu_driver;
 	if (driver)
@@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
 		container->iommu_data = NULL;
 	}
 
-	mutex_unlock(&container->group_lock);
+	up_write(&container->group_lock);
 
 	vfio_container_put(container);
 }
@@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	container = f.file->private_data;
 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
 
-	mutex_lock(&container->group_lock);
+	down_write(&container->group_lock);
 
 	driver = container->iommu_driver;
 	if (driver) {
@@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	atomic_inc(&group->container_users);
 
 unlock_out:
-	mutex_unlock(&container->group_lock);
+	up_write(&container->group_lock);
 	fdput(f);
 	return ret;
 }
@@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = {
  */
 static char *vfio_devnode(struct device *dev, umode_t *mode)
 {
+	if (MINOR(dev->devt) == 0)
+		*mode = S_IRUGO | S_IWUGO;
+
 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
 }
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ab9e86224c54..ac8d488e4372 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev,
 			      void *device_data);
 
 extern void *vfio_del_group_dev(struct device *dev);
+extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
+extern void vfio_device_put(struct vfio_device *device);
+extern void *vfio_device_data(struct vfio_device *device);
 
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4f41f309911e..284ff2436829 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -319,6 +319,7 @@ enum {
 	VFIO_PCI_INTX_IRQ_INDEX,
 	VFIO_PCI_MSI_IRQ_INDEX,
 	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };
 