aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Alex Williamson <alex.williamson@redhat.com> 2014-02-26 13:38:36 -0500
committer Alex Williamson <alex.williamson@redhat.com> 2014-02-26 13:38:36 -0500
commit 1ef3e2bc04223ff956dc62abaf2dff1f3322a431 (patch)
tree ff3d2b15264d6a8fec4b7780d80fc8ca79a997f4
parent cfbf8d4857c26a8a307fb7cd258074c9dcd8c691 (diff)
vfio/iommu_type1: Multi-IOMMU domain support
We currently have a problem that we cannot support advanced features of an IOMMU domain (ex. IOMMU_CACHE), because we have no guarantee that those features will be supported by all of the hardware units involved with the domain over its lifetime. For instance, the Intel VT-d architecture does not require that all DRHDs support snoop control. If we create a domain based on a device behind a DRHD that does support snoop control and enable SNP support via the IOMMU_CACHE mapping option, we cannot then add a device behind a DRHD which does not support snoop control or we'll get reserved bit faults from the SNP bit in the pagetables. To add to the complexity, we can't know the properties of a domain until a device is attached. We could pass this problem off to userspace and require that a separate vfio container be used, but we don't know how to handle page accounting in that case. How do we know that a page pinned in one container is the same page as one pinned in a different container, and how do we avoid double billing the user for the page? The solution is therefore to support multiple IOMMU domains per container. In the majority of cases, only one domain will be required since hardware is typically consistent within a system. However, this provides us the ability to validate compatibility of domains and support mixed environments where page table flags can be different between domains. To do this, our DMA tracking needs to change. We currently try to coalesce user mappings into as few tracking entries as possible. The problem then becomes that we lose granularity of user mappings. We've never guaranteed that a user is able to unmap at a finer granularity than the original mapping, but we must honor the granularity of the original mapping. This coalescing code is therefore removed, allowing only unmaps covering complete maps. The change in accounting is fairly small here, a typical QEMU VM will start out with roughly a dozen entries, so it's arguable if this coalescing was ever needed. 
We also move IOMMU domain creation to the point where a group is attached to the container. An interesting side-effect of this is that we now have access to the device at the time of domain creation and can probe the devices within the group to determine the bus_type. This finally makes vfio_iommu_type1 completely device/bus agnostic. In fact, each IOMMU domain can host devices on different buses managed by different physical IOMMUs, and present a single DMA mapping interface to the user. When a new domain is created, mappings are replayed to bring the IOMMU pagetables up to the state of the current container. And of course, DMA mapping and unmapping automatically traverse all of the configured IOMMU domains. Signed-off-by: Alex Williamson <alex.williamson@redhat.com> Cc: Varun Sethi <Varun.Sethi@freescale.com>
-rw-r--r-- drivers/vfio/vfio_iommu_type1.c 637
-rw-r--r-- include/uapi/linux/vfio.h 1
2 files changed, 336 insertions, 302 deletions
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4fb7a8f83c8a..8c7bb9befdab 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -30,7 +30,6 @@
30#include <linux/iommu.h> 30#include <linux/iommu.h>
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/pci.h> /* pci_bus_type */
34#include <linux/rbtree.h> 33#include <linux/rbtree.h>
35#include <linux/sched.h> 34#include <linux/sched.h>
36#include <linux/slab.h> 35#include <linux/slab.h>
@@ -55,11 +54,17 @@ MODULE_PARM_DESC(disable_hugepages,
55 "Disable VFIO IOMMU support for IOMMU hugepages."); 54 "Disable VFIO IOMMU support for IOMMU hugepages.");
56 55
57struct vfio_iommu { 56struct vfio_iommu {
58 struct iommu_domain *domain; 57 struct list_head domain_list;
59 struct mutex lock; 58 struct mutex lock;
60 struct rb_root dma_list; 59 struct rb_root dma_list;
60 bool v2;
61};
62
63struct vfio_domain {
64 struct iommu_domain *domain;
65 struct list_head next;
61 struct list_head group_list; 66 struct list_head group_list;
62 bool cache; 67 int prot; /* IOMMU_CACHE */
63}; 68};
64 69
65struct vfio_dma { 70struct vfio_dma {
@@ -99,7 +104,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
99 return NULL; 104 return NULL;
100} 105}
101 106
102static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) 107static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
103{ 108{
104 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; 109 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
105 struct vfio_dma *dma; 110 struct vfio_dma *dma;
@@ -118,7 +123,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
118 rb_insert_color(&new->node, &iommu->dma_list); 123 rb_insert_color(&new->node, &iommu->dma_list);
119} 124}
120 125
121static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) 126static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
122{ 127{
123 rb_erase(&old->node, &iommu->dma_list); 128 rb_erase(&old->node, &iommu->dma_list);
124} 129}
@@ -322,32 +327,39 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
322 return unlocked; 327 return unlocked;
323} 328}
324 329
325static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, 330static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
326 dma_addr_t iova, size_t *size)
327{ 331{
328 dma_addr_t start = iova, end = iova + *size; 332 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
333 struct vfio_domain *domain, *d;
329 long unlocked = 0; 334 long unlocked = 0;
330 335
336 if (!dma->size)
337 return;
338 /*
339 * We use the IOMMU to track the physical addresses, otherwise we'd
340 * need a much more complicated tracking system. Unfortunately that
341 * means we need to use one of the iommu domains to figure out the
342 * pfns to unpin. The rest need to be unmapped in advance so we have
343 * no iommu translations remaining when the pages are unpinned.
344 */
345 domain = d = list_first_entry(&iommu->domain_list,
346 struct vfio_domain, next);
347
348 list_for_each_entry_continue(d, &iommu->domain_list, next)
349 iommu_unmap(d->domain, dma->iova, dma->size);
350
331 while (iova < end) { 351 while (iova < end) {
332 size_t unmapped; 352 size_t unmapped;
333 phys_addr_t phys; 353 phys_addr_t phys;
334 354
335 /* 355 phys = iommu_iova_to_phys(domain->domain, iova);
336 * We use the IOMMU to track the physical address. This
337 * saves us from having a lot more entries in our mapping
338 * tree. The downside is that we don't track the size
339 * used to do the mapping. We request unmap of a single
340 * page, but expect IOMMUs that support large pages to
341 * unmap a larger chunk.
342 */
343 phys = iommu_iova_to_phys(iommu->domain, iova);
344 if (WARN_ON(!phys)) { 356 if (WARN_ON(!phys)) {
345 iova += PAGE_SIZE; 357 iova += PAGE_SIZE;
346 continue; 358 continue;
347 } 359 }
348 360
349 unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); 361 unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
350 if (!unmapped) 362 if (WARN_ON(!unmapped))
351 break; 363 break;
352 364
353 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, 365 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
@@ -357,119 +369,26 @@ static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
357 } 369 }
358 370
359 vfio_lock_acct(-unlocked); 371 vfio_lock_acct(-unlocked);
360
361 *size = iova - start;
362
363 return 0;
364} 372}
365 373
366static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, 374static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
367 size_t *size, struct vfio_dma *dma)
368{ 375{
369 size_t offset, overlap, tmp; 376 vfio_unmap_unpin(iommu, dma);
370 struct vfio_dma *split; 377 vfio_unlink_dma(iommu, dma);
371 int ret; 378 kfree(dma);
372 379}
373 if (!*size)
374 return 0;
375
376 /*
377 * Existing dma region is completely covered, unmap all. This is
378 * the likely case since userspace tends to map and unmap buffers
379 * in one shot rather than multiple mappings within a buffer.
380 */
381 if (likely(start <= dma->iova &&
382 start + *size >= dma->iova + dma->size)) {
383 *size = dma->size;
384 ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
385 if (ret)
386 return ret;
387
388 /*
389 * Did we remove more than we have? Should never happen
390 * since a vfio_dma is contiguous in iova and vaddr.
391 */
392 WARN_ON(*size != dma->size);
393
394 vfio_remove_dma(iommu, dma);
395 kfree(dma);
396 return 0;
397 }
398
399 /* Overlap low address of existing range */
400 if (start <= dma->iova) {
401 overlap = start + *size - dma->iova;
402 ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
403 if (ret)
404 return ret;
405
406 vfio_remove_dma(iommu, dma);
407
408 /*
409 * Check, we may have removed to whole vfio_dma. If not
410 * fixup and re-insert.
411 */
412 if (overlap < dma->size) {
413 dma->iova += overlap;
414 dma->vaddr += overlap;
415 dma->size -= overlap;
416 vfio_insert_dma(iommu, dma);
417 } else
418 kfree(dma);
419
420 *size = overlap;
421 return 0;
422 }
423
424 /* Overlap high address of existing range */
425 if (start + *size >= dma->iova + dma->size) {
426 offset = start - dma->iova;
427 overlap = dma->size - offset;
428
429 ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
430 if (ret)
431 return ret;
432
433 dma->size -= overlap;
434 *size = overlap;
435 return 0;
436 }
437
438 /* Split existing */
439
440 /*
441 * Allocate our tracking structure early even though it may not
442 * be used. An Allocation failure later loses track of pages and
443 * is more difficult to unwind.
444 */
445 split = kzalloc(sizeof(*split), GFP_KERNEL);
446 if (!split)
447 return -ENOMEM;
448
449 offset = start - dma->iova;
450
451 ret = vfio_unmap_unpin(iommu, dma, start, size);
452 if (ret || !*size) {
453 kfree(split);
454 return ret;
455 }
456
457 tmp = dma->size;
458 380
459 /* Resize the lower vfio_dma in place, before the below insert */ 381static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
460 dma->size = offset; 382{
383 struct vfio_domain *domain;
384 unsigned long bitmap = PAGE_MASK;
461 385
462 /* Insert new for remainder, assuming it didn't all get unmapped */ 386 mutex_lock(&iommu->lock);
463 if (likely(offset + *size < tmp)) { 387 list_for_each_entry(domain, &iommu->domain_list, next)
464 split->size = tmp - offset - *size; 388 bitmap &= domain->domain->ops->pgsize_bitmap;
465 split->iova = dma->iova + offset + *size; 389 mutex_unlock(&iommu->lock);
466 split->vaddr = dma->vaddr + offset + *size;
467 split->prot = dma->prot;
468 vfio_insert_dma(iommu, split);
469 } else
470 kfree(split);
471 390
472 return 0; 391 return bitmap;
473} 392}
474 393
475static int vfio_dma_do_unmap(struct vfio_iommu *iommu, 394static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -477,10 +396,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
477{ 396{
478 uint64_t mask; 397 uint64_t mask;
479 struct vfio_dma *dma; 398 struct vfio_dma *dma;
480 size_t unmapped = 0, size; 399 size_t unmapped = 0;
481 int ret = 0; 400 int ret = 0;
482 401
483 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; 402 mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
484 403
485 if (unmap->iova & mask) 404 if (unmap->iova & mask)
486 return -EINVAL; 405 return -EINVAL;
@@ -491,20 +410,61 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
491 410
492 mutex_lock(&iommu->lock); 411 mutex_lock(&iommu->lock);
493 412
413 /*
414 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
415 * avoid tracking individual mappings. This means that the granularity
416 * of the original mapping was lost and the user was allowed to attempt
417 * to unmap any range. Depending on the contiguousness of physical
418 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
419 * or may not have worked. We only guaranteed unmap granularity
420 * matching the original mapping; even though it was untracked here,
421 * the original mappings are reflected in IOMMU mappings. This
422 * resulted in a couple unusual behaviors. First, if a range is not
423 * able to be unmapped, ex. a set of 4k pages that was mapped as a
424 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
425 * a zero sized unmap. Also, if an unmap request overlaps the first
426 * address of a hugepage, the IOMMU will unmap the entire hugepage.
427 * This also returns success and the returned unmap size reflects the
428 * actual size unmapped.
429 *
430 * We attempt to maintain compatibility with this "v1" interface, but
431 * we take control out of the hands of the IOMMU. Therefore, an unmap
432 * request offset from the beginning of the original mapping will
433 * return success with zero sized unmap. And an unmap request covering
434 * the first iova of mapping will unmap the entire range.
435 *
436 * The v2 version of this interface intends to be more deterministic.
437 * Unmap requests must fully cover previous mappings. Multiple
438 * mappings may still be unmaped by specifying large ranges, but there
439 * must not be any previous mappings bisected by the range. An error
440 * will be returned if these conditions are not met. The v2 interface
441 * will only return success and a size of zero if there were no
442 * mappings within the range.
443 */
444 if (iommu->v2) {
445 dma = vfio_find_dma(iommu, unmap->iova, 0);
446 if (dma && dma->iova != unmap->iova) {
447 ret = -EINVAL;
448 goto unlock;
449 }
450 dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
451 if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
452 ret = -EINVAL;
453 goto unlock;
454 }
455 }
456
494 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { 457 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
495 size = unmap->size; 458 if (!iommu->v2 && unmap->iova > dma->iova)
496 ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
497 if (ret || !size)
498 break; 459 break;
499 unmapped += size; 460 unmapped += dma->size;
461 vfio_remove_dma(iommu, dma);
500 } 462 }
501 463
464unlock:
502 mutex_unlock(&iommu->lock); 465 mutex_unlock(&iommu->lock);
503 466
504 /* 467 /* Report how much was unmapped */
505 * We may unmap more than requested, update the unmap struct so
506 * userspace can know.
507 */
508 unmap->size = unmapped; 468 unmap->size = unmapped;
509 469
510 return ret; 470 return ret;
@@ -516,22 +476,47 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
516 * soon, so this is just a temporary workaround to break mappings down into 476 * soon, so this is just a temporary workaround to break mappings down into
517 * PAGE_SIZE. Better to map smaller pages than nothing. 477 * PAGE_SIZE. Better to map smaller pages than nothing.
518 */ 478 */
519static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, 479static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
520 unsigned long pfn, long npage, int prot) 480 unsigned long pfn, long npage, int prot)
521{ 481{
522 long i; 482 long i;
523 int ret; 483 int ret;
524 484
525 for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { 485 for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
526 ret = iommu_map(iommu->domain, iova, 486 ret = iommu_map(domain->domain, iova,
527 (phys_addr_t)pfn << PAGE_SHIFT, 487 (phys_addr_t)pfn << PAGE_SHIFT,
528 PAGE_SIZE, prot); 488 PAGE_SIZE, prot | domain->prot);
529 if (ret) 489 if (ret)
530 break; 490 break;
531 } 491 }
532 492
533 for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) 493 for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
534 iommu_unmap(iommu->domain, iova, PAGE_SIZE); 494 iommu_unmap(domain->domain, iova, PAGE_SIZE);
495
496 return ret;
497}
498
499static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
500 unsigned long pfn, long npage, int prot)
501{
502 struct vfio_domain *d;
503 int ret;
504
505 list_for_each_entry(d, &iommu->domain_list, next) {
506 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
507 npage << PAGE_SHIFT, prot | d->prot);
508 if (ret) {
509 if (ret != -EBUSY ||
510 map_try_harder(d, iova, pfn, npage, prot))
511 goto unwind;
512 }
513 }
514
515 return 0;
516
517unwind:
518 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
519 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
535 520
536 return ret; 521 return ret;
537} 522}
@@ -545,12 +530,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
545 long npage; 530 long npage;
546 int ret = 0, prot = 0; 531 int ret = 0, prot = 0;
547 uint64_t mask; 532 uint64_t mask;
548 struct vfio_dma *dma = NULL; 533 struct vfio_dma *dma;
549 unsigned long pfn; 534 unsigned long pfn;
550 535
551 end = map->iova + map->size; 536 end = map->iova + map->size;
552 537
553 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; 538 mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
554 539
555 /* READ/WRITE from device perspective */ 540 /* READ/WRITE from device perspective */
556 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) 541 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
@@ -561,9 +546,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
561 if (!prot) 546 if (!prot)
562 return -EINVAL; /* No READ/WRITE? */ 547 return -EINVAL; /* No READ/WRITE? */
563 548
564 if (iommu->cache)
565 prot |= IOMMU_CACHE;
566
567 if (vaddr & mask) 549 if (vaddr & mask)
568 return -EINVAL; 550 return -EINVAL;
569 if (map->iova & mask) 551 if (map->iova & mask)
@@ -588,180 +570,257 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
588 return -EEXIST; 570 return -EEXIST;
589 } 571 }
590 572
591 for (iova = map->iova; iova < end; iova += size, vaddr += size) { 573 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
592 long i; 574 if (!dma) {
575 mutex_unlock(&iommu->lock);
576 return -ENOMEM;
577 }
578
579 dma->iova = map->iova;
580 dma->vaddr = map->vaddr;
581 dma->prot = prot;
593 582
583 /* Insert zero-sized and grow as we map chunks of it */
584 vfio_link_dma(iommu, dma);
585
586 for (iova = map->iova; iova < end; iova += size, vaddr += size) {
594 /* Pin a contiguous chunk of memory */ 587 /* Pin a contiguous chunk of memory */
595 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, 588 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
596 prot, &pfn); 589 prot, &pfn);
597 if (npage <= 0) { 590 if (npage <= 0) {
598 WARN_ON(!npage); 591 WARN_ON(!npage);
599 ret = (int)npage; 592 ret = (int)npage;
600 goto out; 593 break;
601 }
602
603 /* Verify pages are not already mapped */
604 for (i = 0; i < npage; i++) {
605 if (iommu_iova_to_phys(iommu->domain,
606 iova + (i << PAGE_SHIFT))) {
607 ret = -EBUSY;
608 goto out_unpin;
609 }
610 } 594 }
611 595
612 ret = iommu_map(iommu->domain, iova, 596 /* Map it! */
613 (phys_addr_t)pfn << PAGE_SHIFT, 597 ret = vfio_iommu_map(iommu, iova, pfn, npage, prot);
614 npage << PAGE_SHIFT, prot);
615 if (ret) { 598 if (ret) {
616 if (ret != -EBUSY || 599 vfio_unpin_pages(pfn, npage, prot, true);
617 map_try_harder(iommu, iova, pfn, npage, prot)) { 600 break;
618 goto out_unpin;
619 }
620 } 601 }
621 602
622 size = npage << PAGE_SHIFT; 603 size = npage << PAGE_SHIFT;
604 dma->size += size;
605 }
623 606
624 /* 607 if (ret)
625 * Check if we abut a region below - nothing below 0. 608 vfio_remove_dma(iommu, dma);
626 * This is the most likely case when mapping chunks of
627 * physically contiguous regions within a virtual address
628 * range. Update the abutting entry in place since iova
629 * doesn't change.
630 */
631 if (likely(iova)) {
632 struct vfio_dma *tmp;
633 tmp = vfio_find_dma(iommu, iova - 1, 1);
634 if (tmp && tmp->prot == prot &&
635 tmp->vaddr + tmp->size == vaddr) {
636 tmp->size += size;
637 iova = tmp->iova;
638 size = tmp->size;
639 vaddr = tmp->vaddr;
640 dma = tmp;
641 }
642 }
643 609
644 /* 610 mutex_unlock(&iommu->lock);
645 * Check if we abut a region above - nothing above ~0 + 1. 611 return ret;
646 * If we abut above and below, remove and free. If only 612}
647 * abut above, remove, modify, reinsert. 613
648 */ 614static int vfio_bus_type(struct device *dev, void *data)
649 if (likely(iova + size)) { 615{
650 struct vfio_dma *tmp; 616 struct bus_type **bus = data;
651 tmp = vfio_find_dma(iommu, iova + size, 1); 617
652 if (tmp && tmp->prot == prot && 618 if (*bus && *bus != dev->bus)
653 tmp->vaddr == vaddr + size) { 619 return -EINVAL;
654 vfio_remove_dma(iommu, tmp); 620
655 if (dma) { 621 *bus = dev->bus;
656 dma->size += tmp->size; 622
657 kfree(tmp); 623 return 0;
658 } else { 624}
659 size += tmp->size; 625
660 tmp->size = size; 626static int vfio_iommu_replay(struct vfio_iommu *iommu,
661 tmp->iova = iova; 627 struct vfio_domain *domain)
662 tmp->vaddr = vaddr; 628{
663 vfio_insert_dma(iommu, tmp); 629 struct vfio_domain *d;
664 dma = tmp; 630 struct rb_node *n;
665 } 631 int ret;
666 } 632
667 } 633 /* Arbitrarily pick the first domain in the list for lookups */
634 d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
635 n = rb_first(&iommu->dma_list);
636
637 /* If there's not a domain, there better not be any mappings */
638 if (WARN_ON(n && !d))
639 return -EINVAL;
640
641 for (; n; n = rb_next(n)) {
642 struct vfio_dma *dma;
643 dma_addr_t iova;
644
645 dma = rb_entry(n, struct vfio_dma, node);
646 iova = dma->iova;
647
648 while (iova < dma->iova + dma->size) {
649 phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
650 size_t size;
668 651
669 if (!dma) { 652 if (WARN_ON(!phys)) {
670 dma = kzalloc(sizeof(*dma), GFP_KERNEL); 653 iova += PAGE_SIZE;
671 if (!dma) { 654 continue;
672 iommu_unmap(iommu->domain, iova, size);
673 ret = -ENOMEM;
674 goto out_unpin;
675 } 655 }
676 656
677 dma->size = size; 657 size = PAGE_SIZE;
678 dma->iova = iova;
679 dma->vaddr = vaddr;
680 dma->prot = prot;
681 vfio_insert_dma(iommu, dma);
682 }
683 }
684 658
685 WARN_ON(ret); 659 while (iova + size < dma->iova + dma->size &&
686 mutex_unlock(&iommu->lock); 660 phys + size == iommu_iova_to_phys(d->domain,
687 return ret; 661 iova + size))
662 size += PAGE_SIZE;
688 663
689out_unpin: 664 ret = iommu_map(domain->domain, iova, phys,
690 vfio_unpin_pages(pfn, npage, prot, true); 665 size, dma->prot | domain->prot);
666 if (ret)
667 return ret;
691 668
692out: 669 iova += size;
693 iova = map->iova; 670 }
694 size = map->size;
695 while ((dma = vfio_find_dma(iommu, iova, size))) {
696 int r = vfio_remove_dma_overlap(iommu, iova,
697 &size, dma);
698 if (WARN_ON(r || !size))
699 break;
700 } 671 }
701 672
702 mutex_unlock(&iommu->lock); 673 return 0;
703 return ret;
704} 674}
705 675
706static int vfio_iommu_type1_attach_group(void *iommu_data, 676static int vfio_iommu_type1_attach_group(void *iommu_data,
707 struct iommu_group *iommu_group) 677 struct iommu_group *iommu_group)
708{ 678{
709 struct vfio_iommu *iommu = iommu_data; 679 struct vfio_iommu *iommu = iommu_data;
710 struct vfio_group *group, *tmp; 680 struct vfio_group *group, *g;
681 struct vfio_domain *domain, *d;
682 struct bus_type *bus = NULL;
711 int ret; 683 int ret;
712 684
713 group = kzalloc(sizeof(*group), GFP_KERNEL);
714 if (!group)
715 return -ENOMEM;
716
717 mutex_lock(&iommu->lock); 685 mutex_lock(&iommu->lock);
718 686
719 list_for_each_entry(tmp, &iommu->group_list, next) { 687 list_for_each_entry(d, &iommu->domain_list, next) {
720 if (tmp->iommu_group == iommu_group) { 688 list_for_each_entry(g, &d->group_list, next) {
689 if (g->iommu_group != iommu_group)
690 continue;
691
721 mutex_unlock(&iommu->lock); 692 mutex_unlock(&iommu->lock);
722 kfree(group);
723 return -EINVAL; 693 return -EINVAL;
724 } 694 }
725 } 695 }
726 696
697 group = kzalloc(sizeof(*group), GFP_KERNEL);
698 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
699 if (!group || !domain) {
700 ret = -ENOMEM;
701 goto out_free;
702 }
703
704 group->iommu_group = iommu_group;
705
706 /* Determine bus_type in order to allocate a domain */
707 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
708 if (ret)
709 goto out_free;
710
711 domain->domain = iommu_domain_alloc(bus);
712 if (!domain->domain) {
713 ret = -EIO;
714 goto out_free;
715 }
716
717 ret = iommu_attach_group(domain->domain, iommu_group);
718 if (ret)
719 goto out_domain;
720
721 INIT_LIST_HEAD(&domain->group_list);
722 list_add(&group->next, &domain->group_list);
723
724 if (!allow_unsafe_interrupts &&
725 !iommu_domain_has_cap(domain->domain, IOMMU_CAP_INTR_REMAP)) {
726 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
727 __func__);
728 ret = -EPERM;
729 goto out_detach;
730 }
731
732 if (iommu_domain_has_cap(domain->domain, IOMMU_CAP_CACHE_COHERENCY))
733 domain->prot |= IOMMU_CACHE;
734
727 /* 735 /*
728 * TODO: Domain have capabilities that might change as we add 736 * Try to match an existing compatible domain. We don't want to
729 * groups (see iommu->cache, currently never set). Check for 737 * preclude an IOMMU driver supporting multiple bus_types and being
730 * them and potentially disallow groups to be attached when it 738 * able to include different bus_types in the same IOMMU domain, so
731 * would change capabilities (ugh). 739 * we test whether the domains use the same iommu_ops rather than
740 * testing if they're on the same bus_type.
732 */ 741 */
733 ret = iommu_attach_group(iommu->domain, iommu_group); 742 list_for_each_entry(d, &iommu->domain_list, next) {
734 if (ret) { 743 if (d->domain->ops == domain->domain->ops &&
735 mutex_unlock(&iommu->lock); 744 d->prot == domain->prot) {
736 kfree(group); 745 iommu_detach_group(domain->domain, iommu_group);
737 return ret; 746 if (!iommu_attach_group(d->domain, iommu_group)) {
747 list_add(&group->next, &d->group_list);
748 iommu_domain_free(domain->domain);
749 kfree(domain);
750 mutex_unlock(&iommu->lock);
751 return 0;
752 }
753
754 ret = iommu_attach_group(domain->domain, iommu_group);
755 if (ret)
756 goto out_domain;
757 }
738 } 758 }
739 759
740 group->iommu_group = iommu_group; 760 /* replay mappings on new domains */
741 list_add(&group->next, &iommu->group_list); 761 ret = vfio_iommu_replay(iommu, domain);
762 if (ret)
763 goto out_detach;
764
765 list_add(&domain->next, &iommu->domain_list);
742 766
743 mutex_unlock(&iommu->lock); 767 mutex_unlock(&iommu->lock);
744 768
745 return 0; 769 return 0;
770
771out_detach:
772 iommu_detach_group(domain->domain, iommu_group);
773out_domain:
774 iommu_domain_free(domain->domain);
775out_free:
776 kfree(domain);
777 kfree(group);
778 mutex_unlock(&iommu->lock);
779 return ret;
780}
781
782static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
783{
784 struct rb_node *node;
785
786 while ((node = rb_first(&iommu->dma_list)))
787 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
746} 788}
747 789
748static void vfio_iommu_type1_detach_group(void *iommu_data, 790static void vfio_iommu_type1_detach_group(void *iommu_data,
749 struct iommu_group *iommu_group) 791 struct iommu_group *iommu_group)
750{ 792{
751 struct vfio_iommu *iommu = iommu_data; 793 struct vfio_iommu *iommu = iommu_data;
794 struct vfio_domain *domain;
752 struct vfio_group *group; 795 struct vfio_group *group;
753 796
754 mutex_lock(&iommu->lock); 797 mutex_lock(&iommu->lock);
755 798
756 list_for_each_entry(group, &iommu->group_list, next) { 799 list_for_each_entry(domain, &iommu->domain_list, next) {
757 if (group->iommu_group == iommu_group) { 800 list_for_each_entry(group, &domain->group_list, next) {
758 iommu_detach_group(iommu->domain, iommu_group); 801 if (group->iommu_group != iommu_group)
802 continue;
803
804 iommu_detach_group(domain->domain, iommu_group);
759 list_del(&group->next); 805 list_del(&group->next);
760 kfree(group); 806 kfree(group);
761 break; 807 /*
808 * Group ownership provides privilege, if the group
809 * list is empty, the domain goes away. If it's the
810 * last domain, then all the mappings go away too.
811 */
812 if (list_empty(&domain->group_list)) {
813 if (list_is_singular(&iommu->domain_list))
814 vfio_iommu_unmap_unpin_all(iommu);
815 iommu_domain_free(domain->domain);
816 list_del(&domain->next);
817 kfree(domain);
818 }
819 goto done;
762 } 820 }
763 } 821 }
764 822
823done:
765 mutex_unlock(&iommu->lock); 824 mutex_unlock(&iommu->lock);
766} 825}
767 826
@@ -769,40 +828,17 @@ static void *vfio_iommu_type1_open(unsigned long arg)
769{ 828{
770 struct vfio_iommu *iommu; 829 struct vfio_iommu *iommu;
771 830
772 if (arg != VFIO_TYPE1_IOMMU) 831 if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU)
773 return ERR_PTR(-EINVAL); 832 return ERR_PTR(-EINVAL);
774 833
775 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); 834 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
776 if (!iommu) 835 if (!iommu)
777 return ERR_PTR(-ENOMEM); 836 return ERR_PTR(-ENOMEM);
778 837
779 INIT_LIST_HEAD(&iommu->group_list); 838 INIT_LIST_HEAD(&iommu->domain_list);
780 iommu->dma_list = RB_ROOT; 839 iommu->dma_list = RB_ROOT;
781 mutex_init(&iommu->lock); 840 mutex_init(&iommu->lock);
782 841 iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU);
783 /*
784 * Wish we didn't have to know about bus_type here.
785 */
786 iommu->domain = iommu_domain_alloc(&pci_bus_type);
787 if (!iommu->domain) {
788 kfree(iommu);
789 return ERR_PTR(-EIO);
790 }
791
792 /*
793 * Wish we could specify required capabilities rather than create
794 * a domain, see what comes out and hope it doesn't change along
795 * the way. Fortunately we know interrupt remapping is global for
796 * our iommus.
797 */
798 if (!allow_unsafe_interrupts &&
799 !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
800 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
801 __func__);
802 iommu_domain_free(iommu->domain);
803 kfree(iommu);
804 return ERR_PTR(-EPERM);
805 }
806 842
807 return iommu; 843 return iommu;
808} 844}
@@ -810,25 +846,24 @@ static void *vfio_iommu_type1_open(unsigned long arg)
810static void vfio_iommu_type1_release(void *iommu_data) 846static void vfio_iommu_type1_release(void *iommu_data)
811{ 847{
812 struct vfio_iommu *iommu = iommu_data; 848 struct vfio_iommu *iommu = iommu_data;
849 struct vfio_domain *domain, *domain_tmp;
813 struct vfio_group *group, *group_tmp; 850 struct vfio_group *group, *group_tmp;
814 struct rb_node *node;
815 851
816 list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { 852 vfio_iommu_unmap_unpin_all(iommu);
817 iommu_detach_group(iommu->domain, group->iommu_group);
818 list_del(&group->next);
819 kfree(group);
820 }
821 853
822 while ((node = rb_first(&iommu->dma_list))) { 854 list_for_each_entry_safe(domain, domain_tmp,
823 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 855 &iommu->domain_list, next) {
824 size_t size = dma->size; 856 list_for_each_entry_safe(group, group_tmp,
825 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); 857 &domain->group_list, next) {
826 if (WARN_ON(!size)) 858 iommu_detach_group(domain->domain, group->iommu_group);
827 break; 859 list_del(&group->next);
860 kfree(group);
861 }
862 iommu_domain_free(domain->domain);
863 list_del(&domain->next);
864 kfree(domain);
828 } 865 }
829 866
830 iommu_domain_free(iommu->domain);
831 iommu->domain = NULL;
832 kfree(iommu); 867 kfree(iommu);
833} 868}
834 869
@@ -841,6 +876,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
841 if (cmd == VFIO_CHECK_EXTENSION) { 876 if (cmd == VFIO_CHECK_EXTENSION) {
842 switch (arg) { 877 switch (arg) {
843 case VFIO_TYPE1_IOMMU: 878 case VFIO_TYPE1_IOMMU:
879 case VFIO_TYPE1v2_IOMMU:
844 return 1; 880 return 1;
845 default: 881 default:
846 return 0; 882 return 0;
@@ -858,7 +894,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
858 894
859 info.flags = 0; 895 info.flags = 0;
860 896
861 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; 897 info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
862 898
863 return copy_to_user((void __user *)arg, &info, minsz); 899 return copy_to_user((void __user *)arg, &info, minsz);
864 900
@@ -911,9 +947,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
911 947
912static int __init vfio_iommu_type1_init(void) 948static int __init vfio_iommu_type1_init(void)
913{ 949{
914 if (!iommu_present(&pci_bus_type))
915 return -ENODEV;
916
917 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); 950 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
918} 951}
919 952
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0fd47f5bc146..460fdf2e26f1 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -23,6 +23,7 @@
23 23
24#define VFIO_TYPE1_IOMMU 1 24#define VFIO_TYPE1_IOMMU 1
25#define VFIO_SPAPR_TCE_IOMMU 2 25#define VFIO_SPAPR_TCE_IOMMU 2
26#define VFIO_TYPE1v2_IOMMU 3
26 27
27/* 28/*
28 * The IOCTL interface is designed for extensibility by embedding the 29 * The IOCTL interface is designed for extensibility by embedding the