about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/vfio/vfio_iommu_type1.c 637
-rw-r--r-- include/uapi/linux/vfio.h 1
2 files changed, 336 insertions, 302 deletions
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4fb7a8f83c8a..8c7bb9befdab 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -30,7 +30,6 @@
30#include <linux/iommu.h> 30#include <linux/iommu.h>
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/pci.h> /* pci_bus_type */
34#include <linux/rbtree.h> 33#include <linux/rbtree.h>
35#include <linux/sched.h> 34#include <linux/sched.h>
36#include <linux/slab.h> 35#include <linux/slab.h>
@@ -55,11 +54,17 @@ MODULE_PARM_DESC(disable_hugepages,
55 "Disable VFIO IOMMU support for IOMMU hugepages."); 54 "Disable VFIO IOMMU support for IOMMU hugepages.");
56 55
57struct vfio_iommu { 56struct vfio_iommu {
58 struct iommu_domain *domain; 57 struct list_head domain_list;
59 struct mutex lock; 58 struct mutex lock;
60 struct rb_root dma_list; 59 struct rb_root dma_list;
60 bool v2;
61};
62
63struct vfio_domain {
64 struct iommu_domain *domain;
65 struct list_head next;
61 struct list_head group_list; 66 struct list_head group_list;
62 bool cache; 67 int prot; /* IOMMU_CACHE */
63}; 68};
64 69
65struct vfio_dma { 70struct vfio_dma {
@@ -99,7 +104,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
99 return NULL; 104 return NULL;
100} 105}
101 106
102static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) 107static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
103{ 108{
104 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; 109 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
105 struct vfio_dma *dma; 110 struct vfio_dma *dma;
@@ -118,7 +123,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
118 rb_insert_color(&new->node, &iommu->dma_list); 123 rb_insert_color(&new->node, &iommu->dma_list);
119} 124}
120 125
121static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) 126static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
122{ 127{
123 rb_erase(&old->node, &iommu->dma_list); 128 rb_erase(&old->node, &iommu->dma_list);
124} 129}
@@ -322,32 +327,39 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
322 return unlocked; 327 return unlocked;
323} 328}
324 329
325static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, 330static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
326 dma_addr_t iova, size_t *size)
327{ 331{
328 dma_addr_t start = iova, end = iova + *size; 332 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
333 struct vfio_domain *domain, *d;
329 long unlocked = 0; 334 long unlocked = 0;
330 335
336 if (!dma->size)
337 return;
338 /*
339 * We use the IOMMU to track the physical addresses, otherwise we'd
340 * need a much more complicated tracking system. Unfortunately that
341 * means we need to use one of the iommu domains to figure out the
342 * pfns to unpin. The rest need to be unmapped in advance so we have
343 * no iommu translations remaining when the pages are unpinned.
344 */
345 domain = d = list_first_entry(&iommu->domain_list,
346 struct vfio_domain, next);
347
348 list_for_each_entry_continue(d, &iommu->domain_list, next)
349 iommu_unmap(d->domain, dma->iova, dma->size);
350
331 while (iova < end) { 351 while (iova < end) {
332 size_t unmapped; 352 size_t unmapped;
333 phys_addr_t phys; 353 phys_addr_t phys;
334 354
335 /* 355 phys = iommu_iova_to_phys(domain->domain, iova);
336 * We use the IOMMU to track the physical address. This
337 * saves us from having a lot more entries in our mapping
338 * tree. The downside is that we don't track the size
339 * used to do the mapping. We request unmap of a single
340 * page, but expect IOMMUs that support large pages to
341 * unmap a larger chunk.
342 */
343 phys = iommu_iova_to_phys(iommu->domain, iova);
344 if (WARN_ON(!phys)) { 356 if (WARN_ON(!phys)) {
345 iova += PAGE_SIZE; 357 iova += PAGE_SIZE;
346 continue; 358 continue;
347 } 359 }
348 360
349 unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); 361 unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
350 if (!unmapped) 362 if (WARN_ON(!unmapped))
351 break; 363 break;
352 364
353 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, 365 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
@@ -357,119 +369,26 @@ static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
357 } 369 }
358 370
359 vfio_lock_acct(-unlocked); 371 vfio_lock_acct(-unlocked);
360
361 *size = iova - start;
362
363 return 0;
364} 372}
365 373
366static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, 374static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
367 size_t *size, struct vfio_dma *dma)
368{ 375{
369 size_t offset, overlap, tmp; 376 vfio_unmap_unpin(iommu, dma);
370 struct vfio_dma *split; 377 vfio_unlink_dma(iommu, dma);
371 int ret; 378 kfree(dma);
372 379}
373 if (!*size)
374 return 0;
375
376 /*
377 * Existing dma region is completely covered, unmap all. This is
378 * the likely case since userspace tends to map and unmap buffers
379 * in one shot rather than multiple mappings within a buffer.
380 */
381 if (likely(start <= dma->iova &&
382 start + *size >= dma->iova + dma->size)) {
383 *size = dma->size;
384 ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
385 if (ret)
386 return ret;
387
388 /*
389 * Did we remove more than we have? Should never happen
390 * since a vfio_dma is contiguous in iova and vaddr.
391 */
392 WARN_ON(*size != dma->size);
393
394 vfio_remove_dma(iommu, dma);
395 kfree(dma);
396 return 0;
397 }
398
399 /* Overlap low address of existing range */
400 if (start <= dma->iova) {
401 overlap = start + *size - dma->iova;
402 ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
403 if (ret)
404 return ret;
405
406 vfio_remove_dma(iommu, dma);
407
408 /*
409 * Check, we may have removed to whole vfio_dma. If not
410 * fixup and re-insert.
411 */
412 if (overlap < dma->size) {
413 dma->iova += overlap;
414 dma->vaddr += overlap;
415 dma->size -= overlap;
416 vfio_insert_dma(iommu, dma);
417 } else
418 kfree(dma);
419
420 *size = overlap;
421 return 0;
422 }
423
424 /* Overlap high address of existing range */
425 if (start + *size >= dma->iova + dma->size) {
426 offset = start - dma->iova;
427 overlap = dma->size - offset;
428
429 ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
430 if (ret)
431 return ret;
432
433 dma->size -= overlap;
434 *size = overlap;
435 return 0;
436 }
437
438 /* Split existing */
439
440 /*
441 * Allocate our tracking structure early even though it may not
442 * be used. An Allocation failure later loses track of pages and
443 * is more difficult to unwind.
444 */
445 split = kzalloc(sizeof(*split), GFP_KERNEL);
446 if (!split)
447 return -ENOMEM;
448
449 offset = start - dma->iova;
450
451 ret = vfio_unmap_unpin(iommu, dma, start, size);
452 if (ret || !*size) {
453 kfree(split);
454 return ret;
455 }
456
457 tmp = dma->size;
458 380
459 /* Resize the lower vfio_dma in place, before the below insert */ 381static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
460 dma->size = offset; 382{
383 struct vfio_domain *domain;
384 unsigned long bitmap = PAGE_MASK;
461 385
462 /* Insert new for remainder, assuming it didn't all get unmapped */ 386 mutex_lock(&iommu->lock);
463 if (likely(offset + *size < tmp)) { 387 list_for_each_entry(domain, &iommu->domain_list, next)
464 split->size = tmp - offset - *size; 388 bitmap &= domain->domain->ops->pgsize_bitmap;
465 split->iova = dma->iova + offset + *size; 389 mutex_unlock(&iommu->lock);
466 split->vaddr = dma->vaddr + offset + *size;
467 split->prot = dma->prot;
468 vfio_insert_dma(iommu, split);
469 } else
470 kfree(split);
471 390
472 return 0; 391 return bitmap;
473} 392}
474 393
475static int vfio_dma_do_unmap(struct vfio_iommu *iommu, 394static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -477,10 +396,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
477{ 396{
478 uint64_t mask; 397 uint64_t mask;
479 struct vfio_dma *dma; 398 struct vfio_dma *dma;
480 size_t unmapped = 0, size; 399 size_t unmapped = 0;
481 int ret = 0; 400 int ret = 0;
482 401
483 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; 402 mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
484 403
485 if (unmap->iova & mask) 404 if (unmap->iova & mask)
486 return -EINVAL; 405 return -EINVAL;
@@ -491,20 +410,61 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
491 410
492 mutex_lock(&iommu->lock); 411 mutex_lock(&iommu->lock);
493 412
413 /*
414 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
415 * avoid tracking individual mappings. This means that the granularity
416 * of the original mapping was lost and the user was allowed to attempt
417 * to unmap any range. Depending on the contiguousness of physical
418 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
419 * or may not have worked. We only guaranteed unmap granularity
420 * matching the original mapping; even though it was untracked here,
421 * the original mappings are reflected in IOMMU mappings. This
422 * resulted in a couple of unusual behaviors. First, if a range is not
423 * able to be unmapped, ex. a set of 4k pages that was mapped as a
424 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
425 * a zero sized unmap. Also, if an unmap request overlaps the first
426 * address of a hugepage, the IOMMU will unmap the entire hugepage.
427 * This also returns success and the returned unmap size reflects the
428 * actual size unmapped.
429 *
430 * We attempt to maintain compatibility with this "v1" interface, but
431 * we take control out of the hands of the IOMMU. Therefore, an unmap
432 * request offset from the beginning of the original mapping will
433 * return success with zero sized unmap. And an unmap request covering
434 * the first iova of mapping will unmap the entire range.
435 *
436 * The v2 version of this interface intends to be more deterministic.
437 * Unmap requests must fully cover previous mappings. Multiple
438 * mappings may still be unmapped by specifying large ranges, but there
439 * must not be any previous mappings bisected by the range. An error
440 * will be returned if these conditions are not met. The v2 interface
441 * will only return success and a size of zero if there were no
442 * mappings within the range.
443 */
444 if (iommu->v2) {
445 dma = vfio_find_dma(iommu, unmap->iova, 0);
446 if (dma && dma->iova != unmap->iova) {
447 ret = -EINVAL;
448 goto unlock;
449 }
450 dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
451 if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
452 ret = -EINVAL;
453 goto unlock;
454 }
455 }
456
494 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { 457 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
495 size = unmap->size; 458 if (!iommu->v2 && unmap->iova > dma->iova)
496 ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
497 if (ret || !size)
498 break; 459 break;
499 unmapped += size; 460 unmapped += dma->size;
461 vfio_remove_dma(iommu, dma);
500 } 462 }
501 463
464unlock:
502 mutex_unlock(&iommu->lock); 465 mutex_unlock(&iommu->lock);
503 466
504 /* 467 /* Report how much was unmapped */
505 * We may unmap more than requested, update the unmap struct so
506 * userspace can know.
507 */
508 unmap->size = unmapped; 468 unmap->size = unmapped;
509 469
510 return ret; 470 return ret;
@@ -516,22 +476,47 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
516 * soon, so this is just a temporary workaround to break mappings down into 476 * soon, so this is just a temporary workaround to break mappings down into
517 * PAGE_SIZE. Better to map smaller pages than nothing. 477 * PAGE_SIZE. Better to map smaller pages than nothing.
518 */ 478 */
519static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, 479static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
520 unsigned long pfn, long npage, int prot) 480 unsigned long pfn, long npage, int prot)
521{ 481{
522 long i; 482 long i;
523 int ret; 483 int ret;
524 484
525 for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { 485 for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
526 ret = iommu_map(iommu->domain, iova, 486 ret = iommu_map(domain->domain, iova,
527 (phys_addr_t)pfn << PAGE_SHIFT, 487 (phys_addr_t)pfn << PAGE_SHIFT,
528 PAGE_SIZE, prot); 488 PAGE_SIZE, prot | domain->prot);
529 if (ret) 489 if (ret)
530 break; 490 break;
531 } 491 }
532 492
533 for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) 493 for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
534 iommu_unmap(iommu->domain, iova, PAGE_SIZE); 494 iommu_unmap(domain->domain, iova, PAGE_SIZE);
495
496 return ret;
497}
498
499static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
500 unsigned long pfn, long npage, int prot)
501{
502 struct vfio_domain *d;
503 int ret;
504
505 list_for_each_entry(d, &iommu->domain_list, next) {
506 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
507 npage << PAGE_SHIFT, prot | d->prot);
508 if (ret) {
509 if (ret != -EBUSY ||
510 map_try_harder(d, iova, pfn, npage, prot))
511 goto unwind;
512 }
513 }
514
515 return 0;
516
517unwind:
518 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
519 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
535 520
536 return ret; 521 return ret;
537} 522}
@@ -545,12 +530,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
545 long npage; 530 long npage;
546 int ret = 0, prot = 0; 531 int ret = 0, prot = 0;
547 uint64_t mask; 532 uint64_t mask;
548 struct vfio_dma *dma = NULL; 533 struct vfio_dma *dma;
549 unsigned long pfn; 534 unsigned long pfn;
550 535
551 end = map->iova + map->size; 536 end = map->iova + map->size;
552 537
553 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; 538 mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
554 539
555 /* READ/WRITE from device perspective */ 540 /* READ/WRITE from device perspective */
556 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) 541 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
@@ -561,9 +546,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
561 if (!prot) 546 if (!prot)
562 return -EINVAL; /* No READ/WRITE? */ 547 return -EINVAL; /* No READ/WRITE? */
563 548
564 if (iommu->cache)
565 prot |= IOMMU_CACHE;
566
567 if (vaddr & mask) 549 if (vaddr & mask)
568 return -EINVAL; 550 return -EINVAL;
569 if (map->iova & mask) 551 if (map->iova & mask)
@@ -588,180 +570,257 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
588 return -EEXIST; 570 return -EEXIST;
589 } 571 }
590 572
591 for (iova = map->iova; iova < end; iova += size, vaddr += size) { 573 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
592 long i; 574 if (!dma) {
575 mutex_unlock(&iommu->lock);
576 return -ENOMEM;
577 }
578
579 dma->iova = map->iova;
580 dma->vaddr = map->vaddr;
581 dma->prot = prot;
593 582
583 /* Insert zero-sized and grow as we map chunks of it */
584 vfio_link_dma(iommu, dma);
585
586 for (iova = map->iova; iova < end; iova += size, vaddr += size) {
594 /* Pin a contiguous chunk of memory */ 587 /* Pin a contiguous chunk of memory */
595 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, 588 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
596 prot, &pfn); 589 prot, &pfn);
597 if (npage <= 0) { 590 if (npage <= 0) {
598 WARN_ON(!npage); 591 WARN_ON(!npage);
599 ret = (int)npage; 592 ret = (int)npage;
600 goto out; 593 break;
601 }
602
603 /* Verify pages are not already mapped */
604 for (i = 0; i < npage; i++) {
605 if (iommu_iova_to_phys(iommu->domain,
606 iova + (i << PAGE_SHIFT))) {
607 ret = -EBUSY;
608 goto out_unpin;
609 }
610 } 594 }
611 595
612 ret = iommu_map(iommu->domain, iova, 596 /* Map it! */
613 (phys_addr_t)pfn << PAGE_SHIFT, 597 ret = vfio_iommu_map(iommu, iova, pfn, npage, prot);
614 npage << PAGE_SHIFT, prot);
615 if (ret) { 598 if (ret) {
616 if (ret != -EBUSY || 599 vfio_unpin_pages(pfn, npage, prot, true);
617 map_try_harder(iommu, iova, pfn, npage, prot)) { 600 break;
618 goto out_unpin;
619 }
620 } 601 }
621 602
622 size = npage << PAGE_SHIFT; 603 size = npage << PAGE_SHIFT;
604 dma->size += size;
605 }
623 606
624 /* 607 if (ret)
625 * Check if we abut a region below - nothing below 0. 608 vfio_remove_dma(iommu, dma);
626 * This is the most likely case when mapping chunks of
627 * physically contiguous regions within a virtual address
628 * range. Update the abutting entry in place since iova
629 * doesn't change.
630 */
631 if (likely(iova)) {
632 struct vfio_dma *tmp;
633 tmp = vfio_find_dma(iommu, iova - 1, 1);
634 if (tmp && tmp->prot == prot &&
635 tmp->vaddr + tmp->size == vaddr) {
636 tmp->size += size;
637 iova = tmp->iova;
638 size = tmp->size;
639 vaddr = tmp->vaddr;
640 dma = tmp;
641 }
642 }
643 609
644 /* 610 mutex_unlock(&iommu->lock);
645 * Check if we abut a region above - nothing above ~0 + 1. 611 return ret;
646 * If we abut above and below, remove and free. If only 612}
647 * abut above, remove, modify, reinsert. 613
648 */ 614static int vfio_bus_type(struct device *dev, void *data)
649 if (likely(iova + size)) { 615{
650 struct vfio_dma *tmp; 616 struct bus_type **bus = data;
651 tmp = vfio_find_dma(iommu, iova + size, 1); 617
652 if (tmp && tmp->prot == prot && 618 if (*bus && *bus != dev->bus)
653 tmp->vaddr == vaddr + size) { 619 return -EINVAL;
654 vfio_remove_dma(iommu, tmp); 620
655 if (dma) { 621 *bus = dev->bus;
656 dma->size += tmp->size; 622
657 kfree(tmp); 623 return 0;
658 } else { 624}
659 size += tmp->size; 625
660 tmp->size = size; 626static int vfio_iommu_replay(struct vfio_iommu *iommu,
661 tmp->iova = iova; 627 struct vfio_domain *domain)
662 tmp->vaddr = vaddr; 628{
663 vfio_insert_dma(iommu, tmp); 629 struct vfio_domain *d;
664 dma = tmp; 630 struct rb_node *n;
665 } 631 int ret;
666 } 632
667 } 633 /* Arbitrarily pick the first domain in the list for lookups */
634 d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
635 n = rb_first(&iommu->dma_list);
636
637 /* If there's not a domain, there better not be any mappings */
638 if (WARN_ON(n && !d))
639 return -EINVAL;
640
641 for (; n; n = rb_next(n)) {
642 struct vfio_dma *dma;
643 dma_addr_t iova;
644
645 dma = rb_entry(n, struct vfio_dma, node);
646 iova = dma->iova;
647
648 while (iova < dma->iova + dma->size) {
649 phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
650 size_t size;
668 651
669 if (!dma) { 652 if (WARN_ON(!phys)) {
670 dma = kzalloc(sizeof(*dma), GFP_KERNEL); 653 iova += PAGE_SIZE;
671 if (!dma) { 654 continue;
672 iommu_unmap(iommu->domain, iova, size);
673 ret = -ENOMEM;
674 goto out_unpin;
675 } 655 }
676 656
677 dma->size = size; 657 size = PAGE_SIZE;
678 dma->iova = iova;
679 dma->vaddr = vaddr;
680 dma->prot = prot;
681 vfio_insert_dma(iommu, dma);
682 }
683 }
684 658
685 WARN_ON(ret); 659 while (iova + size < dma->iova + dma->size &&
686 mutex_unlock(&iommu->lock); 660 phys + size == iommu_iova_to_phys(d->domain,
687 return ret; 661 iova + size))
662 size += PAGE_SIZE;
688 663
689out_unpin: 664 ret = iommu_map(domain->domain, iova, phys,
690 vfio_unpin_pages(pfn, npage, prot, true); 665 size, dma->prot | domain->prot);
666 if (ret)
667 return ret;
691 668
692out: 669 iova += size;
693 iova = map->iova; 670 }
694 size = map->size;
695 while ((dma = vfio_find_dma(iommu, iova, size))) {
696 int r = vfio_remove_dma_overlap(iommu, iova,
697 &size, dma);
698 if (WARN_ON(r || !size))
699 break;
700 } 671 }
701 672
702 mutex_unlock(&iommu->lock); 673 return 0;
703 return ret;
704} 674}
705 675
706static int vfio_iommu_type1_attach_group(void *iommu_data, 676static int vfio_iommu_type1_attach_group(void *iommu_data,
707 struct iommu_group *iommu_group) 677 struct iommu_group *iommu_group)
708{ 678{
709 struct vfio_iommu *iommu = iommu_data; 679 struct vfio_iommu *iommu = iommu_data;
710 struct vfio_group *group, *tmp; 680 struct vfio_group *group, *g;
681 struct vfio_domain *domain, *d;
682 struct bus_type *bus = NULL;
711 int ret; 683 int ret;
712 684
713 group = kzalloc(sizeof(*group), GFP_KERNEL);
714 if (!group)
715 return -ENOMEM;
716
717 mutex_lock(&iommu->lock); 685 mutex_lock(&iommu->lock);
718 686
719 list_for_each_entry(tmp, &iommu->group_list, next) { 687 list_for_each_entry(d, &iommu->domain_list, next) {
720 if (tmp->iommu_group == iommu_group) { 688 list_for_each_entry(g, &d->group_list, next) {
689 if (g->iommu_group != iommu_group)
690 continue;
691
721 mutex_unlock(&iommu->lock); 692 mutex_unlock(&iommu->lock);
722 kfree(group);
723 return -EINVAL; 693 return -EINVAL;
724 } 694 }
725 } 695 }
726 696
697 group = kzalloc(sizeof(*group), GFP_KERNEL);
698 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
699 if (!group || !domain) {
700 ret = -ENOMEM;
701 goto out_free;
702 }
703
704 group->iommu_group = iommu_group;
705
706 /* Determine bus_type in order to allocate a domain */
707 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
708 if (ret)
709 goto out_free;
710
711 domain->domain = iommu_domain_alloc(bus);
712 if (!domain->domain) {
713 ret = -EIO;
714 goto out_free;
715 }
716
717 ret = iommu_attach_group(domain->domain, iommu_group);
718 if (ret)
719 goto out_domain;
720
721 INIT_LIST_HEAD(&domain->group_list);
722 list_add(&group->next, &domain->group_list);
723
724 if (!allow_unsafe_interrupts &&
725 !iommu_domain_has_cap(domain->domain, IOMMU_CAP_INTR_REMAP)) {
726 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
727 __func__);
728 ret = -EPERM;
729 goto out_detach;
730 }
731
732 if (iommu_domain_has_cap(domain->domain, IOMMU_CAP_CACHE_COHERENCY))
733 domain->prot |= IOMMU_CACHE;
734
727 /* 735 /*
728 * TODO: Domain have capabilities that might change as we add 736 * Try to match an existing compatible domain. We don't want to
729 * groups (see iommu->cache, currently never set). Check for 737 * preclude an IOMMU driver supporting multiple bus_types and being
730 * them and potentially disallow groups to be attached when it 738 * able to include different bus_types in the same IOMMU domain, so
731 * would change capabilities (ugh). 739 * we test whether the domains use the same iommu_ops rather than
740 * testing if they're on the same bus_type.
732 */ 741 */
733 ret = iommu_attach_group(iommu->domain, iommu_group); 742 list_for_each_entry(d, &iommu->domain_list, next) {
734 if (ret) { 743 if (d->domain->ops == domain->domain->ops &&
735 mutex_unlock(&iommu->lock); 744 d->prot == domain->prot) {
736 kfree(group); 745 iommu_detach_group(domain->domain, iommu_group);
737 return ret; 746 if (!iommu_attach_group(d->domain, iommu_group)) {
747 list_add(&group->next, &d->group_list);
748 iommu_domain_free(domain->domain);
749 kfree(domain);
750 mutex_unlock(&iommu->lock);
751 return 0;
752 }
753
754 ret = iommu_attach_group(domain->domain, iommu_group);
755 if (ret)
756 goto out_domain;
757 }
738 } 758 }
739 759
740 group->iommu_group = iommu_group; 760 /* replay mappings on new domains */
741 list_add(&group->next, &iommu->group_list); 761 ret = vfio_iommu_replay(iommu, domain);
762 if (ret)
763 goto out_detach;
764
765 list_add(&domain->next, &iommu->domain_list);
742 766
743 mutex_unlock(&iommu->lock); 767 mutex_unlock(&iommu->lock);
744 768
745 return 0; 769 return 0;
770
771out_detach:
772 iommu_detach_group(domain->domain, iommu_group);
773out_domain:
774 iommu_domain_free(domain->domain);
775out_free:
776 kfree(domain);
777 kfree(group);
778 mutex_unlock(&iommu->lock);
779 return ret;
780}
781
782static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
783{
784 struct rb_node *node;
785
786 while ((node = rb_first(&iommu->dma_list)))
787 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
746} 788}
747 789
748static void vfio_iommu_type1_detach_group(void *iommu_data, 790static void vfio_iommu_type1_detach_group(void *iommu_data,
749 struct iommu_group *iommu_group) 791 struct iommu_group *iommu_group)
750{ 792{
751 struct vfio_iommu *iommu = iommu_data; 793 struct vfio_iommu *iommu = iommu_data;
794 struct vfio_domain *domain;
752 struct vfio_group *group; 795 struct vfio_group *group;
753 796
754 mutex_lock(&iommu->lock); 797 mutex_lock(&iommu->lock);
755 798
756 list_for_each_entry(group, &iommu->group_list, next) { 799 list_for_each_entry(domain, &iommu->domain_list, next) {
757 if (group->iommu_group == iommu_group) { 800 list_for_each_entry(group, &domain->group_list, next) {
758 iommu_detach_group(iommu->domain, iommu_group); 801 if (group->iommu_group != iommu_group)
802 continue;
803
804 iommu_detach_group(domain->domain, iommu_group);
759 list_del(&group->next); 805 list_del(&group->next);
760 kfree(group); 806 kfree(group);
761 break; 807 /*
808 * Group ownership provides privilege, if the group
809 * list is empty, the domain goes away. If it's the
810 * last domain, then all the mappings go away too.
811 */
812 if (list_empty(&domain->group_list)) {
813 if (list_is_singular(&iommu->domain_list))
814 vfio_iommu_unmap_unpin_all(iommu);
815 iommu_domain_free(domain->domain);
816 list_del(&domain->next);
817 kfree(domain);
818 }
819 goto done;
762 } 820 }
763 } 821 }
764 822
823done:
765 mutex_unlock(&iommu->lock); 824 mutex_unlock(&iommu->lock);
766} 825}
767 826
@@ -769,40 +828,17 @@ static void *vfio_iommu_type1_open(unsigned long arg)
769{ 828{
770 struct vfio_iommu *iommu; 829 struct vfio_iommu *iommu;
771 830
772 if (arg != VFIO_TYPE1_IOMMU) 831 if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU)
773 return ERR_PTR(-EINVAL); 832 return ERR_PTR(-EINVAL);
774 833
775 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); 834 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
776 if (!iommu) 835 if (!iommu)
777 return ERR_PTR(-ENOMEM); 836 return ERR_PTR(-ENOMEM);
778 837
779 INIT_LIST_HEAD(&iommu->group_list); 838 INIT_LIST_HEAD(&iommu->domain_list);
780 iommu->dma_list = RB_ROOT; 839 iommu->dma_list = RB_ROOT;
781 mutex_init(&iommu->lock); 840 mutex_init(&iommu->lock);
782 841 iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU);
783 /*
784 * Wish we didn't have to know about bus_type here.
785 */
786 iommu->domain = iommu_domain_alloc(&pci_bus_type);
787 if (!iommu->domain) {
788 kfree(iommu);
789 return ERR_PTR(-EIO);
790 }
791
792 /*
793 * Wish we could specify required capabilities rather than create
794 * a domain, see what comes out and hope it doesn't change along
795 * the way. Fortunately we know interrupt remapping is global for
796 * our iommus.
797 */
798 if (!allow_unsafe_interrupts &&
799 !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
800 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
801 __func__);
802 iommu_domain_free(iommu->domain);
803 kfree(iommu);
804 return ERR_PTR(-EPERM);
805 }
806 842
807 return iommu; 843 return iommu;
808} 844}
@@ -810,25 +846,24 @@ static void *vfio_iommu_type1_open(unsigned long arg)
810static void vfio_iommu_type1_release(void *iommu_data) 846static void vfio_iommu_type1_release(void *iommu_data)
811{ 847{
812 struct vfio_iommu *iommu = iommu_data; 848 struct vfio_iommu *iommu = iommu_data;
849 struct vfio_domain *domain, *domain_tmp;
813 struct vfio_group *group, *group_tmp; 850 struct vfio_group *group, *group_tmp;
814 struct rb_node *node;
815 851
816 list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { 852 vfio_iommu_unmap_unpin_all(iommu);
817 iommu_detach_group(iommu->domain, group->iommu_group);
818 list_del(&group->next);
819 kfree(group);
820 }
821 853
822 while ((node = rb_first(&iommu->dma_list))) { 854 list_for_each_entry_safe(domain, domain_tmp,
823 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 855 &iommu->domain_list, next) {
824 size_t size = dma->size; 856 list_for_each_entry_safe(group, group_tmp,
825 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); 857 &domain->group_list, next) {
826 if (WARN_ON(!size)) 858 iommu_detach_group(domain->domain, group->iommu_group);
827 break; 859 list_del(&group->next);
860 kfree(group);
861 }
862 iommu_domain_free(domain->domain);
863 list_del(&domain->next);
864 kfree(domain);
828 } 865 }
829 866
830 iommu_domain_free(iommu->domain);
831 iommu->domain = NULL;
832 kfree(iommu); 867 kfree(iommu);
833} 868}
834 869
@@ -841,6 +876,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
841 if (cmd == VFIO_CHECK_EXTENSION) { 876 if (cmd == VFIO_CHECK_EXTENSION) {
842 switch (arg) { 877 switch (arg) {
843 case VFIO_TYPE1_IOMMU: 878 case VFIO_TYPE1_IOMMU:
879 case VFIO_TYPE1v2_IOMMU:
844 return 1; 880 return 1;
845 default: 881 default:
846 return 0; 882 return 0;
@@ -858,7 +894,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
858 894
859 info.flags = 0; 895 info.flags = 0;
860 896
861 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; 897 info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
862 898
863 return copy_to_user((void __user *)arg, &info, minsz); 899 return copy_to_user((void __user *)arg, &info, minsz);
864 900
@@ -911,9 +947,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
911 947
912static int __init vfio_iommu_type1_init(void) 948static int __init vfio_iommu_type1_init(void)
913{ 949{
914 if (!iommu_present(&pci_bus_type))
915 return -ENODEV;
916
917 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); 950 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
918} 951}
919 952
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0fd47f5bc146..460fdf2e26f1 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -23,6 +23,7 @@
23 23
24#define VFIO_TYPE1_IOMMU 1 24#define VFIO_TYPE1_IOMMU 1
25#define VFIO_SPAPR_TCE_IOMMU 2 25#define VFIO_SPAPR_TCE_IOMMU 2
26#define VFIO_TYPE1v2_IOMMU 3
26 27
27/* 28/*
28 * The IOCTL interface is designed for extensibility by embedding the 29 * The IOCTL interface is designed for extensibility by embedding the