diff options
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 637 | ||||
-rw-r--r-- | include/uapi/linux/vfio.h | 1 |
2 files changed, 336 insertions, 302 deletions
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 4fb7a8f83c8a..8c7bb9befdab 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/iommu.h> | 30 | #include <linux/iommu.h> |
31 | #include <linux/module.h> | 31 | #include <linux/module.h> |
32 | #include <linux/mm.h> | 32 | #include <linux/mm.h> |
33 | #include <linux/pci.h> /* pci_bus_type */ | ||
34 | #include <linux/rbtree.h> | 33 | #include <linux/rbtree.h> |
35 | #include <linux/sched.h> | 34 | #include <linux/sched.h> |
36 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
@@ -55,11 +54,17 @@ MODULE_PARM_DESC(disable_hugepages, | |||
55 | "Disable VFIO IOMMU support for IOMMU hugepages."); | 54 | "Disable VFIO IOMMU support for IOMMU hugepages."); |
56 | 55 | ||
57 | struct vfio_iommu { | 56 | struct vfio_iommu { |
58 | struct iommu_domain *domain; | 57 | struct list_head domain_list; |
59 | struct mutex lock; | 58 | struct mutex lock; |
60 | struct rb_root dma_list; | 59 | struct rb_root dma_list; |
60 | bool v2; | ||
61 | }; | ||
62 | |||
63 | struct vfio_domain { | ||
64 | struct iommu_domain *domain; | ||
65 | struct list_head next; | ||
61 | struct list_head group_list; | 66 | struct list_head group_list; |
62 | bool cache; | 67 | int prot; /* IOMMU_CACHE */ |
63 | }; | 68 | }; |
64 | 69 | ||
65 | struct vfio_dma { | 70 | struct vfio_dma { |
@@ -99,7 +104,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, | |||
99 | return NULL; | 104 | return NULL; |
100 | } | 105 | } |
101 | 106 | ||
102 | static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) | 107 | static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) |
103 | { | 108 | { |
104 | struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; | 109 | struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; |
105 | struct vfio_dma *dma; | 110 | struct vfio_dma *dma; |
@@ -118,7 +123,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) | |||
118 | rb_insert_color(&new->node, &iommu->dma_list); | 123 | rb_insert_color(&new->node, &iommu->dma_list); |
119 | } | 124 | } |
120 | 125 | ||
121 | static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) | 126 | static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) |
122 | { | 127 | { |
123 | rb_erase(&old->node, &iommu->dma_list); | 128 | rb_erase(&old->node, &iommu->dma_list); |
124 | } | 129 | } |
@@ -322,32 +327,39 @@ static long vfio_unpin_pages(unsigned long pfn, long npage, | |||
322 | return unlocked; | 327 | return unlocked; |
323 | } | 328 | } |
324 | 329 | ||
325 | static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, | 330 | static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) |
326 | dma_addr_t iova, size_t *size) | ||
327 | { | 331 | { |
328 | dma_addr_t start = iova, end = iova + *size; | 332 | dma_addr_t iova = dma->iova, end = dma->iova + dma->size; |
333 | struct vfio_domain *domain, *d; | ||
329 | long unlocked = 0; | 334 | long unlocked = 0; |
330 | 335 | ||
336 | if (!dma->size) | ||
337 | return; | ||
338 | /* | ||
339 | * We use the IOMMU to track the physical addresses, otherwise we'd | ||
340 | * need a much more complicated tracking system. Unfortunately that | ||
341 | * means we need to use one of the iommu domains to figure out the | ||
342 | * pfns to unpin. The rest need to be unmapped in advance so we have | ||
343 | * no iommu translations remaining when the pages are unpinned. | ||
344 | */ | ||
345 | domain = d = list_first_entry(&iommu->domain_list, | ||
346 | struct vfio_domain, next); | ||
347 | |||
348 | list_for_each_entry_continue(d, &iommu->domain_list, next) | ||
349 | iommu_unmap(d->domain, dma->iova, dma->size); | ||
350 | |||
331 | while (iova < end) { | 351 | while (iova < end) { |
332 | size_t unmapped; | 352 | size_t unmapped; |
333 | phys_addr_t phys; | 353 | phys_addr_t phys; |
334 | 354 | ||
335 | /* | 355 | phys = iommu_iova_to_phys(domain->domain, iova); |
336 | * We use the IOMMU to track the physical address. This | ||
337 | * saves us from having a lot more entries in our mapping | ||
338 | * tree. The downside is that we don't track the size | ||
339 | * used to do the mapping. We request unmap of a single | ||
340 | * page, but expect IOMMUs that support large pages to | ||
341 | * unmap a larger chunk. | ||
342 | */ | ||
343 | phys = iommu_iova_to_phys(iommu->domain, iova); | ||
344 | if (WARN_ON(!phys)) { | 356 | if (WARN_ON(!phys)) { |
345 | iova += PAGE_SIZE; | 357 | iova += PAGE_SIZE; |
346 | continue; | 358 | continue; |
347 | } | 359 | } |
348 | 360 | ||
349 | unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); | 361 | unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE); |
350 | if (!unmapped) | 362 | if (WARN_ON(!unmapped)) |
351 | break; | 363 | break; |
352 | 364 | ||
353 | unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, | 365 | unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, |
@@ -357,119 +369,26 @@ static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, | |||
357 | } | 369 | } |
358 | 370 | ||
359 | vfio_lock_acct(-unlocked); | 371 | vfio_lock_acct(-unlocked); |
360 | |||
361 | *size = iova - start; | ||
362 | |||
363 | return 0; | ||
364 | } | 372 | } |
365 | 373 | ||
366 | static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, | 374 | static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) |
367 | size_t *size, struct vfio_dma *dma) | ||
368 | { | 375 | { |
369 | size_t offset, overlap, tmp; | 376 | vfio_unmap_unpin(iommu, dma); |
370 | struct vfio_dma *split; | 377 | vfio_unlink_dma(iommu, dma); |
371 | int ret; | 378 | kfree(dma); |
372 | 379 | } | |
373 | if (!*size) | ||
374 | return 0; | ||
375 | |||
376 | /* | ||
377 | * Existing dma region is completely covered, unmap all. This is | ||
378 | * the likely case since userspace tends to map and unmap buffers | ||
379 | * in one shot rather than multiple mappings within a buffer. | ||
380 | */ | ||
381 | if (likely(start <= dma->iova && | ||
382 | start + *size >= dma->iova + dma->size)) { | ||
383 | *size = dma->size; | ||
384 | ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); | ||
385 | if (ret) | ||
386 | return ret; | ||
387 | |||
388 | /* | ||
389 | * Did we remove more than we have? Should never happen | ||
390 | * since a vfio_dma is contiguous in iova and vaddr. | ||
391 | */ | ||
392 | WARN_ON(*size != dma->size); | ||
393 | |||
394 | vfio_remove_dma(iommu, dma); | ||
395 | kfree(dma); | ||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | /* Overlap low address of existing range */ | ||
400 | if (start <= dma->iova) { | ||
401 | overlap = start + *size - dma->iova; | ||
402 | ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); | ||
403 | if (ret) | ||
404 | return ret; | ||
405 | |||
406 | vfio_remove_dma(iommu, dma); | ||
407 | |||
408 | /* | ||
409 | * Check, we may have removed the whole vfio_dma. If not | |
410 | * fixup and re-insert. | ||
411 | */ | ||
412 | if (overlap < dma->size) { | ||
413 | dma->iova += overlap; | ||
414 | dma->vaddr += overlap; | ||
415 | dma->size -= overlap; | ||
416 | vfio_insert_dma(iommu, dma); | ||
417 | } else | ||
418 | kfree(dma); | ||
419 | |||
420 | *size = overlap; | ||
421 | return 0; | ||
422 | } | ||
423 | |||
424 | /* Overlap high address of existing range */ | ||
425 | if (start + *size >= dma->iova + dma->size) { | ||
426 | offset = start - dma->iova; | ||
427 | overlap = dma->size - offset; | ||
428 | |||
429 | ret = vfio_unmap_unpin(iommu, dma, start, &overlap); | ||
430 | if (ret) | ||
431 | return ret; | ||
432 | |||
433 | dma->size -= overlap; | ||
434 | *size = overlap; | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | /* Split existing */ | ||
439 | |||
440 | /* | ||
441 | * Allocate our tracking structure early even though it may not | ||
442 | * be used. An allocation failure later loses track of pages and | |
443 | * is more difficult to unwind. | ||
444 | */ | ||
445 | split = kzalloc(sizeof(*split), GFP_KERNEL); | ||
446 | if (!split) | ||
447 | return -ENOMEM; | ||
448 | |||
449 | offset = start - dma->iova; | ||
450 | |||
451 | ret = vfio_unmap_unpin(iommu, dma, start, size); | ||
452 | if (ret || !*size) { | ||
453 | kfree(split); | ||
454 | return ret; | ||
455 | } | ||
456 | |||
457 | tmp = dma->size; | ||
458 | 380 | ||
459 | /* Resize the lower vfio_dma in place, before the below insert */ | 381 | static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) |
460 | dma->size = offset; | 382 | { |
383 | struct vfio_domain *domain; | ||
384 | unsigned long bitmap = PAGE_MASK; | ||
461 | 385 | ||
462 | /* Insert new for remainder, assuming it didn't all get unmapped */ | 386 | mutex_lock(&iommu->lock); |
463 | if (likely(offset + *size < tmp)) { | 387 | list_for_each_entry(domain, &iommu->domain_list, next) |
464 | split->size = tmp - offset - *size; | 388 | bitmap &= domain->domain->ops->pgsize_bitmap; |
465 | split->iova = dma->iova + offset + *size; | 389 | mutex_unlock(&iommu->lock); |
466 | split->vaddr = dma->vaddr + offset + *size; | ||
467 | split->prot = dma->prot; | ||
468 | vfio_insert_dma(iommu, split); | ||
469 | } else | ||
470 | kfree(split); | ||
471 | 390 | ||
472 | return 0; | 391 | return bitmap; |
473 | } | 392 | } |
474 | 393 | ||
475 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | 394 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, |
@@ -477,10 +396,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | |||
477 | { | 396 | { |
478 | uint64_t mask; | 397 | uint64_t mask; |
479 | struct vfio_dma *dma; | 398 | struct vfio_dma *dma; |
480 | size_t unmapped = 0, size; | 399 | size_t unmapped = 0; |
481 | int ret = 0; | 400 | int ret = 0; |
482 | 401 | ||
483 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | 402 | mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; |
484 | 403 | ||
485 | if (unmap->iova & mask) | 404 | if (unmap->iova & mask) |
486 | return -EINVAL; | 405 | return -EINVAL; |
@@ -491,20 +410,61 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | |||
491 | 410 | ||
492 | mutex_lock(&iommu->lock); | 411 | mutex_lock(&iommu->lock); |
493 | 412 | ||
413 | /* | ||
414 | * vfio-iommu-type1 (v1) - User mappings were coalesced together to | ||
415 | * avoid tracking individual mappings. This means that the granularity | ||
416 | * of the original mapping was lost and the user was allowed to attempt | ||
417 | * to unmap any range. Depending on the contiguousness of physical | ||
418 | * memory and page sizes supported by the IOMMU, arbitrary unmaps may | ||
419 | * or may not have worked. We only guaranteed unmap granularity | ||
420 | * matching the original mapping; even though it was untracked here, | ||
421 | * the original mappings are reflected in IOMMU mappings. This | ||
422 | * resulted in a couple unusual behaviors. First, if a range is not | ||
423 | * able to be unmapped, ex. a set of 4k pages that was mapped as a | ||
424 | * 2M hugepage into the IOMMU, the unmap ioctl returns success but with | ||
425 | * a zero sized unmap. Also, if an unmap request overlaps the first | ||
426 | * address of a hugepage, the IOMMU will unmap the entire hugepage. | ||
427 | * This also returns success and the returned unmap size reflects the | ||
428 | * actual size unmapped. | ||
429 | * | ||
430 | * We attempt to maintain compatibility with this "v1" interface, but | ||
431 | * we take control out of the hands of the IOMMU. Therefore, an unmap | ||
432 | * request offset from the beginning of the original mapping will | ||
433 | * return success with zero sized unmap. And an unmap request covering | ||
434 | * the first iova of mapping will unmap the entire range. | ||
435 | * | ||
436 | * The v2 version of this interface intends to be more deterministic. | ||
437 | * Unmap requests must fully cover previous mappings. Multiple | ||
438 | * mappings may still be unmapped by specifying large ranges, but there | |
439 | * must not be any previous mappings bisected by the range. An error | ||
440 | * will be returned if these conditions are not met. The v2 interface | ||
441 | * will only return success and a size of zero if there were no | ||
442 | * mappings within the range. | ||
443 | */ | ||
444 | if (iommu->v2) { | ||
445 | dma = vfio_find_dma(iommu, unmap->iova, 0); | ||
446 | if (dma && dma->iova != unmap->iova) { | ||
447 | ret = -EINVAL; | ||
448 | goto unlock; | ||
449 | } | ||
450 | dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0); | ||
451 | if (dma && dma->iova + dma->size != unmap->iova + unmap->size) { | ||
452 | ret = -EINVAL; | ||
453 | goto unlock; | ||
454 | } | ||
455 | } | ||
456 | |||
494 | while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { | 457 | while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { |
495 | size = unmap->size; | 458 | if (!iommu->v2 && unmap->iova > dma->iova) |
496 | ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); | ||
497 | if (ret || !size) | ||
498 | break; | 459 | break; |
499 | unmapped += size; | 460 | unmapped += dma->size; |
461 | vfio_remove_dma(iommu, dma); | ||
500 | } | 462 | } |
501 | 463 | ||
464 | unlock: | ||
502 | mutex_unlock(&iommu->lock); | 465 | mutex_unlock(&iommu->lock); |
503 | 466 | ||
504 | /* | 467 | /* Report how much was unmapped */ |
505 | * We may unmap more than requested, update the unmap struct so | ||
506 | * userspace can know. | ||
507 | */ | ||
508 | unmap->size = unmapped; | 468 | unmap->size = unmapped; |
509 | 469 | ||
510 | return ret; | 470 | return ret; |
@@ -516,22 +476,47 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | |||
516 | * soon, so this is just a temporary workaround to break mappings down into | 476 | * soon, so this is just a temporary workaround to break mappings down into |
517 | * PAGE_SIZE. Better to map smaller pages than nothing. | 477 | * PAGE_SIZE. Better to map smaller pages than nothing. |
518 | */ | 478 | */ |
519 | static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, | 479 | static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova, |
520 | unsigned long pfn, long npage, int prot) | 480 | unsigned long pfn, long npage, int prot) |
521 | { | 481 | { |
522 | long i; | 482 | long i; |
523 | int ret; | 483 | int ret; |
524 | 484 | ||
525 | for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { | 485 | for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { |
526 | ret = iommu_map(iommu->domain, iova, | 486 | ret = iommu_map(domain->domain, iova, |
527 | (phys_addr_t)pfn << PAGE_SHIFT, | 487 | (phys_addr_t)pfn << PAGE_SHIFT, |
528 | PAGE_SIZE, prot); | 488 | PAGE_SIZE, prot | domain->prot); |
529 | if (ret) | 489 | if (ret) |
530 | break; | 490 | break; |
531 | } | 491 | } |
532 | 492 | ||
533 | for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) | 493 | for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) |
534 | iommu_unmap(iommu->domain, iova, PAGE_SIZE); | 494 | iommu_unmap(domain->domain, iova, PAGE_SIZE); |
495 | |||
496 | return ret; | ||
497 | } | ||
498 | |||
499 | static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, | ||
500 | unsigned long pfn, long npage, int prot) | ||
501 | { | ||
502 | struct vfio_domain *d; | ||
503 | int ret; | ||
504 | |||
505 | list_for_each_entry(d, &iommu->domain_list, next) { | ||
506 | ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, | ||
507 | npage << PAGE_SHIFT, prot | d->prot); | ||
508 | if (ret) { | ||
509 | if (ret != -EBUSY || | ||
510 | map_try_harder(d, iova, pfn, npage, prot)) | ||
511 | goto unwind; | ||
512 | } | ||
513 | } | ||
514 | |||
515 | return 0; | ||
516 | |||
517 | unwind: | ||
518 | list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) | ||
519 | iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); | ||
535 | 520 | ||
536 | return ret; | 521 | return ret; |
537 | } | 522 | } |
@@ -545,12 +530,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, | |||
545 | long npage; | 530 | long npage; |
546 | int ret = 0, prot = 0; | 531 | int ret = 0, prot = 0; |
547 | uint64_t mask; | 532 | uint64_t mask; |
548 | struct vfio_dma *dma = NULL; | 533 | struct vfio_dma *dma; |
549 | unsigned long pfn; | 534 | unsigned long pfn; |
550 | 535 | ||
551 | end = map->iova + map->size; | 536 | end = map->iova + map->size; |
552 | 537 | ||
553 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | 538 | mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; |
554 | 539 | ||
555 | /* READ/WRITE from device perspective */ | 540 | /* READ/WRITE from device perspective */ |
556 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) | 541 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) |
@@ -561,9 +546,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, | |||
561 | if (!prot) | 546 | if (!prot) |
562 | return -EINVAL; /* No READ/WRITE? */ | 547 | return -EINVAL; /* No READ/WRITE? */ |
563 | 548 | ||
564 | if (iommu->cache) | ||
565 | prot |= IOMMU_CACHE; | ||
566 | |||
567 | if (vaddr & mask) | 549 | if (vaddr & mask) |
568 | return -EINVAL; | 550 | return -EINVAL; |
569 | if (map->iova & mask) | 551 | if (map->iova & mask) |
@@ -588,180 +570,257 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, | |||
588 | return -EEXIST; | 570 | return -EEXIST; |
589 | } | 571 | } |
590 | 572 | ||
591 | for (iova = map->iova; iova < end; iova += size, vaddr += size) { | 573 | dma = kzalloc(sizeof(*dma), GFP_KERNEL); |
592 | long i; | 574 | if (!dma) { |
575 | mutex_unlock(&iommu->lock); | ||
576 | return -ENOMEM; | ||
577 | } | ||
578 | |||
579 | dma->iova = map->iova; | ||
580 | dma->vaddr = map->vaddr; | ||
581 | dma->prot = prot; | ||
593 | 582 | ||
583 | /* Insert zero-sized and grow as we map chunks of it */ | ||
584 | vfio_link_dma(iommu, dma); | ||
585 | |||
586 | for (iova = map->iova; iova < end; iova += size, vaddr += size) { | ||
594 | /* Pin a contiguous chunk of memory */ | 587 | /* Pin a contiguous chunk of memory */ |
595 | npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, | 588 | npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, |
596 | prot, &pfn); | 589 | prot, &pfn); |
597 | if (npage <= 0) { | 590 | if (npage <= 0) { |
598 | WARN_ON(!npage); | 591 | WARN_ON(!npage); |
599 | ret = (int)npage; | 592 | ret = (int)npage; |
600 | goto out; | 593 | break; |
601 | } | ||
602 | |||
603 | /* Verify pages are not already mapped */ | ||
604 | for (i = 0; i < npage; i++) { | ||
605 | if (iommu_iova_to_phys(iommu->domain, | ||
606 | iova + (i << PAGE_SHIFT))) { | ||
607 | ret = -EBUSY; | ||
608 | goto out_unpin; | ||
609 | } | ||
610 | } | 594 | } |
611 | 595 | ||
612 | ret = iommu_map(iommu->domain, iova, | 596 | /* Map it! */ |
613 | (phys_addr_t)pfn << PAGE_SHIFT, | 597 | ret = vfio_iommu_map(iommu, iova, pfn, npage, prot); |
614 | npage << PAGE_SHIFT, prot); | ||
615 | if (ret) { | 598 | if (ret) { |
616 | if (ret != -EBUSY || | 599 | vfio_unpin_pages(pfn, npage, prot, true); |
617 | map_try_harder(iommu, iova, pfn, npage, prot)) { | 600 | break; |
618 | goto out_unpin; | ||
619 | } | ||
620 | } | 601 | } |
621 | 602 | ||
622 | size = npage << PAGE_SHIFT; | 603 | size = npage << PAGE_SHIFT; |
604 | dma->size += size; | ||
605 | } | ||
623 | 606 | ||
624 | /* | 607 | if (ret) |
625 | * Check if we abut a region below - nothing below 0. | 608 | vfio_remove_dma(iommu, dma); |
626 | * This is the most likely case when mapping chunks of | ||
627 | * physically contiguous regions within a virtual address | ||
628 | * range. Update the abutting entry in place since iova | ||
629 | * doesn't change. | ||
630 | */ | ||
631 | if (likely(iova)) { | ||
632 | struct vfio_dma *tmp; | ||
633 | tmp = vfio_find_dma(iommu, iova - 1, 1); | ||
634 | if (tmp && tmp->prot == prot && | ||
635 | tmp->vaddr + tmp->size == vaddr) { | ||
636 | tmp->size += size; | ||
637 | iova = tmp->iova; | ||
638 | size = tmp->size; | ||
639 | vaddr = tmp->vaddr; | ||
640 | dma = tmp; | ||
641 | } | ||
642 | } | ||
643 | 609 | ||
644 | /* | 610 | mutex_unlock(&iommu->lock); |
645 | * Check if we abut a region above - nothing above ~0 + 1. | 611 | return ret; |
646 | * If we abut above and below, remove and free. If only | 612 | } |
647 | * abut above, remove, modify, reinsert. | 613 | |
648 | */ | 614 | static int vfio_bus_type(struct device *dev, void *data) |
649 | if (likely(iova + size)) { | 615 | { |
650 | struct vfio_dma *tmp; | 616 | struct bus_type **bus = data; |
651 | tmp = vfio_find_dma(iommu, iova + size, 1); | 617 | |
652 | if (tmp && tmp->prot == prot && | 618 | if (*bus && *bus != dev->bus) |
653 | tmp->vaddr == vaddr + size) { | 619 | return -EINVAL; |
654 | vfio_remove_dma(iommu, tmp); | 620 | |
655 | if (dma) { | 621 | *bus = dev->bus; |
656 | dma->size += tmp->size; | 622 | |
657 | kfree(tmp); | 623 | return 0; |
658 | } else { | 624 | } |
659 | size += tmp->size; | 625 | |
660 | tmp->size = size; | 626 | static int vfio_iommu_replay(struct vfio_iommu *iommu, |
661 | tmp->iova = iova; | 627 | struct vfio_domain *domain) |
662 | tmp->vaddr = vaddr; | 628 | { |
663 | vfio_insert_dma(iommu, tmp); | 629 | struct vfio_domain *d; |
664 | dma = tmp; | 630 | struct rb_node *n; |
665 | } | 631 | int ret; |
666 | } | 632 | |
667 | } | 633 | /* Arbitrarily pick the first domain in the list for lookups */ |
634 | d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); | ||
635 | n = rb_first(&iommu->dma_list); | ||
636 | |||
637 | /* If there's not a domain, there better not be any mappings */ | ||
638 | if (WARN_ON(n && !d)) | ||
639 | return -EINVAL; | ||
640 | |||
641 | for (; n; n = rb_next(n)) { | ||
642 | struct vfio_dma *dma; | ||
643 | dma_addr_t iova; | ||
644 | |||
645 | dma = rb_entry(n, struct vfio_dma, node); | ||
646 | iova = dma->iova; | ||
647 | |||
648 | while (iova < dma->iova + dma->size) { | ||
649 | phys_addr_t phys = iommu_iova_to_phys(d->domain, iova); | ||
650 | size_t size; | ||
668 | 651 | ||
669 | if (!dma) { | 652 | if (WARN_ON(!phys)) { |
670 | dma = kzalloc(sizeof(*dma), GFP_KERNEL); | 653 | iova += PAGE_SIZE; |
671 | if (!dma) { | 654 | continue; |
672 | iommu_unmap(iommu->domain, iova, size); | ||
673 | ret = -ENOMEM; | ||
674 | goto out_unpin; | ||
675 | } | 655 | } |
676 | 656 | ||
677 | dma->size = size; | 657 | size = PAGE_SIZE; |
678 | dma->iova = iova; | ||
679 | dma->vaddr = vaddr; | ||
680 | dma->prot = prot; | ||
681 | vfio_insert_dma(iommu, dma); | ||
682 | } | ||
683 | } | ||
684 | 658 | ||
685 | WARN_ON(ret); | 659 | while (iova + size < dma->iova + dma->size && |
686 | mutex_unlock(&iommu->lock); | 660 | phys + size == iommu_iova_to_phys(d->domain, |
687 | return ret; | 661 | iova + size)) |
662 | size += PAGE_SIZE; | ||
688 | 663 | ||
689 | out_unpin: | 664 | ret = iommu_map(domain->domain, iova, phys, |
690 | vfio_unpin_pages(pfn, npage, prot, true); | 665 | size, dma->prot | domain->prot); |
666 | if (ret) | ||
667 | return ret; | ||
691 | 668 | ||
692 | out: | 669 | iova += size; |
693 | iova = map->iova; | 670 | } |
694 | size = map->size; | ||
695 | while ((dma = vfio_find_dma(iommu, iova, size))) { | ||
696 | int r = vfio_remove_dma_overlap(iommu, iova, | ||
697 | &size, dma); | ||
698 | if (WARN_ON(r || !size)) | ||
699 | break; | ||
700 | } | 671 | } |
701 | 672 | ||
702 | mutex_unlock(&iommu->lock); | 673 | return 0; |
703 | return ret; | ||
704 | } | 674 | } |
705 | 675 | ||
706 | static int vfio_iommu_type1_attach_group(void *iommu_data, | 676 | static int vfio_iommu_type1_attach_group(void *iommu_data, |
707 | struct iommu_group *iommu_group) | 677 | struct iommu_group *iommu_group) |
708 | { | 678 | { |
709 | struct vfio_iommu *iommu = iommu_data; | 679 | struct vfio_iommu *iommu = iommu_data; |
710 | struct vfio_group *group, *tmp; | 680 | struct vfio_group *group, *g; |
681 | struct vfio_domain *domain, *d; | ||
682 | struct bus_type *bus = NULL; | ||
711 | int ret; | 683 | int ret; |
712 | 684 | ||
713 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
714 | if (!group) | ||
715 | return -ENOMEM; | ||
716 | |||
717 | mutex_lock(&iommu->lock); | 685 | mutex_lock(&iommu->lock); |
718 | 686 | ||
719 | list_for_each_entry(tmp, &iommu->group_list, next) { | 687 | list_for_each_entry(d, &iommu->domain_list, next) { |
720 | if (tmp->iommu_group == iommu_group) { | 688 | list_for_each_entry(g, &d->group_list, next) { |
689 | if (g->iommu_group != iommu_group) | ||
690 | continue; | ||
691 | |||
721 | mutex_unlock(&iommu->lock); | 692 | mutex_unlock(&iommu->lock); |
722 | kfree(group); | ||
723 | return -EINVAL; | 693 | return -EINVAL; |
724 | } | 694 | } |
725 | } | 695 | } |
726 | 696 | ||
697 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
698 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); | ||
699 | if (!group || !domain) { | ||
700 | ret = -ENOMEM; | ||
701 | goto out_free; | ||
702 | } | ||
703 | |||
704 | group->iommu_group = iommu_group; | ||
705 | |||
706 | /* Determine bus_type in order to allocate a domain */ | ||
707 | ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type); | ||
708 | if (ret) | ||
709 | goto out_free; | ||
710 | |||
711 | domain->domain = iommu_domain_alloc(bus); | ||
712 | if (!domain->domain) { | ||
713 | ret = -EIO; | ||
714 | goto out_free; | ||
715 | } | ||
716 | |||
717 | ret = iommu_attach_group(domain->domain, iommu_group); | ||
718 | if (ret) | ||
719 | goto out_domain; | ||
720 | |||
721 | INIT_LIST_HEAD(&domain->group_list); | ||
722 | list_add(&group->next, &domain->group_list); | ||
723 | |||
724 | if (!allow_unsafe_interrupts && | ||
725 | !iommu_domain_has_cap(domain->domain, IOMMU_CAP_INTR_REMAP)) { | ||
726 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | ||
727 | __func__); | ||
728 | ret = -EPERM; | ||
729 | goto out_detach; | ||
730 | } | ||
731 | |||
732 | if (iommu_domain_has_cap(domain->domain, IOMMU_CAP_CACHE_COHERENCY)) | ||
733 | domain->prot |= IOMMU_CACHE; | ||
734 | |||
727 | /* | 735 | /* |
728 | * TODO: Domains have capabilities that might change as we add | 736 | * Try to match an existing compatible domain. We don't want to |
729 | * groups (see iommu->cache, currently never set). Check for | 737 | * preclude an IOMMU driver supporting multiple bus_types and being |
730 | * them and potentially disallow groups to be attached when it | 738 | * able to include different bus_types in the same IOMMU domain, so |
731 | * would change capabilities (ugh). | 739 | * we test whether the domains use the same iommu_ops rather than |
740 | * testing if they're on the same bus_type. | ||
732 | */ | 741 | */ |
733 | ret = iommu_attach_group(iommu->domain, iommu_group); | 742 | list_for_each_entry(d, &iommu->domain_list, next) { |
734 | if (ret) { | 743 | if (d->domain->ops == domain->domain->ops && |
735 | mutex_unlock(&iommu->lock); | 744 | d->prot == domain->prot) { |
736 | kfree(group); | 745 | iommu_detach_group(domain->domain, iommu_group); |
737 | return ret; | 746 | if (!iommu_attach_group(d->domain, iommu_group)) { |
747 | list_add(&group->next, &d->group_list); | ||
748 | iommu_domain_free(domain->domain); | ||
749 | kfree(domain); | ||
750 | mutex_unlock(&iommu->lock); | ||
751 | return 0; | ||
752 | } | ||
753 | |||
754 | ret = iommu_attach_group(domain->domain, iommu_group); | ||
755 | if (ret) | ||
756 | goto out_domain; | ||
757 | } | ||
738 | } | 758 | } |
739 | 759 | ||
740 | group->iommu_group = iommu_group; | 760 | /* replay mappings on new domains */ |
741 | list_add(&group->next, &iommu->group_list); | 761 | ret = vfio_iommu_replay(iommu, domain); |
762 | if (ret) | ||
763 | goto out_detach; | ||
764 | |||
765 | list_add(&domain->next, &iommu->domain_list); | ||
742 | 766 | ||
743 | mutex_unlock(&iommu->lock); | 767 | mutex_unlock(&iommu->lock); |
744 | 768 | ||
745 | return 0; | 769 | return 0; |
770 | |||
771 | out_detach: | ||
772 | iommu_detach_group(domain->domain, iommu_group); | ||
773 | out_domain: | ||
774 | iommu_domain_free(domain->domain); | ||
775 | out_free: | ||
776 | kfree(domain); | ||
777 | kfree(group); | ||
778 | mutex_unlock(&iommu->lock); | ||
779 | return ret; | ||
780 | } | ||
781 | |||
782 | static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) | ||
783 | { | ||
784 | struct rb_node *node; | ||
785 | |||
786 | while ((node = rb_first(&iommu->dma_list))) | ||
787 | vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); | ||
746 | } | 788 | } |
747 | 789 | ||
748 | static void vfio_iommu_type1_detach_group(void *iommu_data, | 790 | static void vfio_iommu_type1_detach_group(void *iommu_data, |
749 | struct iommu_group *iommu_group) | 791 | struct iommu_group *iommu_group) |
750 | { | 792 | { |
751 | struct vfio_iommu *iommu = iommu_data; | 793 | struct vfio_iommu *iommu = iommu_data; |
794 | struct vfio_domain *domain; | ||
752 | struct vfio_group *group; | 795 | struct vfio_group *group; |
753 | 796 | ||
754 | mutex_lock(&iommu->lock); | 797 | mutex_lock(&iommu->lock); |
755 | 798 | ||
756 | list_for_each_entry(group, &iommu->group_list, next) { | 799 | list_for_each_entry(domain, &iommu->domain_list, next) { |
757 | if (group->iommu_group == iommu_group) { | 800 | list_for_each_entry(group, &domain->group_list, next) { |
758 | iommu_detach_group(iommu->domain, iommu_group); | 801 | if (group->iommu_group != iommu_group) |
802 | continue; | ||
803 | |||
804 | iommu_detach_group(domain->domain, iommu_group); | ||
759 | list_del(&group->next); | 805 | list_del(&group->next); |
760 | kfree(group); | 806 | kfree(group); |
761 | break; | 807 | /* |
808 | * Group ownership provides privilege, if the group | ||
809 | * list is empty, the domain goes away. If it's the | ||
810 | * last domain, then all the mappings go away too. | ||
811 | */ | ||
812 | if (list_empty(&domain->group_list)) { | ||
813 | if (list_is_singular(&iommu->domain_list)) | ||
814 | vfio_iommu_unmap_unpin_all(iommu); | ||
815 | iommu_domain_free(domain->domain); | ||
816 | list_del(&domain->next); | ||
817 | kfree(domain); | ||
818 | } | ||
819 | goto done; | ||
762 | } | 820 | } |
763 | } | 821 | } |
764 | 822 | ||
823 | done: | ||
765 | mutex_unlock(&iommu->lock); | 824 | mutex_unlock(&iommu->lock); |
766 | } | 825 | } |
767 | 826 | ||
@@ -769,40 +828,17 @@ static void *vfio_iommu_type1_open(unsigned long arg) | |||
769 | { | 828 | { |
770 | struct vfio_iommu *iommu; | 829 | struct vfio_iommu *iommu; |
771 | 830 | ||
772 | if (arg != VFIO_TYPE1_IOMMU) | 831 | if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU) |
773 | return ERR_PTR(-EINVAL); | 832 | return ERR_PTR(-EINVAL); |
774 | 833 | ||
775 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); | 834 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); |
776 | if (!iommu) | 835 | if (!iommu) |
777 | return ERR_PTR(-ENOMEM); | 836 | return ERR_PTR(-ENOMEM); |
778 | 837 | ||
779 | INIT_LIST_HEAD(&iommu->group_list); | 838 | INIT_LIST_HEAD(&iommu->domain_list); |
780 | iommu->dma_list = RB_ROOT; | 839 | iommu->dma_list = RB_ROOT; |
781 | mutex_init(&iommu->lock); | 840 | mutex_init(&iommu->lock); |
782 | 841 | iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU); | |
783 | /* | ||
784 | * Wish we didn't have to know about bus_type here. | ||
785 | */ | ||
786 | iommu->domain = iommu_domain_alloc(&pci_bus_type); | ||
787 | if (!iommu->domain) { | ||
788 | kfree(iommu); | ||
789 | return ERR_PTR(-EIO); | ||
790 | } | ||
791 | |||
792 | /* | ||
793 | * Wish we could specify required capabilities rather than create | ||
794 | * a domain, see what comes out and hope it doesn't change along | ||
795 | * the way. Fortunately we know interrupt remapping is global for | ||
796 | * our iommus. | ||
797 | */ | ||
798 | if (!allow_unsafe_interrupts && | ||
799 | !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { | ||
800 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | ||
801 | __func__); | ||
802 | iommu_domain_free(iommu->domain); | ||
803 | kfree(iommu); | ||
804 | return ERR_PTR(-EPERM); | ||
805 | } | ||
806 | 842 | ||
807 | return iommu; | 843 | return iommu; |
808 | } | 844 | } |
@@ -810,25 +846,24 @@ static void *vfio_iommu_type1_open(unsigned long arg) | |||
810 | static void vfio_iommu_type1_release(void *iommu_data) | 846 | static void vfio_iommu_type1_release(void *iommu_data) |
811 | { | 847 | { |
812 | struct vfio_iommu *iommu = iommu_data; | 848 | struct vfio_iommu *iommu = iommu_data; |
849 | struct vfio_domain *domain, *domain_tmp; | ||
813 | struct vfio_group *group, *group_tmp; | 850 | struct vfio_group *group, *group_tmp; |
814 | struct rb_node *node; | ||
815 | 851 | ||
816 | list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { | 852 | vfio_iommu_unmap_unpin_all(iommu); |
817 | iommu_detach_group(iommu->domain, group->iommu_group); | ||
818 | list_del(&group->next); | ||
819 | kfree(group); | ||
820 | } | ||
821 | 853 | ||
822 | while ((node = rb_first(&iommu->dma_list))) { | 854 | list_for_each_entry_safe(domain, domain_tmp, |
823 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); | 855 | &iommu->domain_list, next) { |
824 | size_t size = dma->size; | 856 | list_for_each_entry_safe(group, group_tmp, |
825 | vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); | 857 | &domain->group_list, next) { |
826 | if (WARN_ON(!size)) | 858 | iommu_detach_group(domain->domain, group->iommu_group); |
827 | break; | 859 | list_del(&group->next); |
860 | kfree(group); | ||
861 | } | ||
862 | iommu_domain_free(domain->domain); | ||
863 | list_del(&domain->next); | ||
864 | kfree(domain); | ||
828 | } | 865 | } |
829 | 866 | ||
830 | iommu_domain_free(iommu->domain); | ||
831 | iommu->domain = NULL; | ||
832 | kfree(iommu); | 867 | kfree(iommu); |
833 | } | 868 | } |
834 | 869 | ||
@@ -841,6 +876,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, | |||
841 | if (cmd == VFIO_CHECK_EXTENSION) { | 876 | if (cmd == VFIO_CHECK_EXTENSION) { |
842 | switch (arg) { | 877 | switch (arg) { |
843 | case VFIO_TYPE1_IOMMU: | 878 | case VFIO_TYPE1_IOMMU: |
879 | case VFIO_TYPE1v2_IOMMU: | ||
844 | return 1; | 880 | return 1; |
845 | default: | 881 | default: |
846 | return 0; | 882 | return 0; |
@@ -858,7 +894,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, | |||
858 | 894 | ||
859 | info.flags = 0; | 895 | info.flags = 0; |
860 | 896 | ||
861 | info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; | 897 | info.iova_pgsizes = vfio_pgsize_bitmap(iommu); |
862 | 898 | ||
863 | return copy_to_user((void __user *)arg, &info, minsz); | 899 | return copy_to_user((void __user *)arg, &info, minsz); |
864 | 900 | ||
@@ -911,9 +947,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { | |||
911 | 947 | ||
912 | static int __init vfio_iommu_type1_init(void) | 948 | static int __init vfio_iommu_type1_init(void) |
913 | { | 949 | { |
914 | if (!iommu_present(&pci_bus_type)) | ||
915 | return -ENODEV; | ||
916 | |||
917 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); | 950 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); |
918 | } | 951 | } |
919 | 952 | ||
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0fd47f5bc146..460fdf2e26f1 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h | |||
@@ -23,6 +23,7 @@ | |||
23 | 23 | ||
24 | #define VFIO_TYPE1_IOMMU 1 | 24 | #define VFIO_TYPE1_IOMMU 1 |
25 | #define VFIO_SPAPR_TCE_IOMMU 2 | 25 | #define VFIO_SPAPR_TCE_IOMMU 2 |
26 | #define VFIO_TYPE1v2_IOMMU 3 | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * The IOCTL interface is designed for extensibility by embedding the | 29 | * The IOCTL interface is designed for extensibility by embedding the |