| author | Alex Williamson <alex.williamson@redhat.com> | 2012-07-31 10:16:23 -0400 |
|---|---|---|
| committer | Alex Williamson <alex.williamson@redhat.com> | 2012-07-31 10:16:23 -0400 |
| commit | 73fa0d10d077d9521ee2dace2307ae2c9a965336 (patch) | |
| tree | 2c820b194dd8ea00f23d85c382e86ea6c3beb498 | |
| parent | 4a5b2a20ec87384eeb19e70991e7e15a00cad87b (diff) | |
vfio: Type1 IOMMU implementation
This VFIO IOMMU backend is designed primarily for AMD-Vi and Intel
VT-d hardware, but is potentially usable by anything supporting
similar mapping functionality. We arbitrarily call this a Type1
backend for lack of a better name. This backend has no IOVA
or host memory mapping restrictions for the user and is optimized
for relatively static mappings. Mapped areas are pinned into system
memory.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
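For orientation, here is a minimal, illustrative userspace sketch (not part of this patch) of how the Type1 backend is selected for a container, following the flow documented in Documentation/vfio.txt. The group number 26 is an arbitrary example and error handling is mostly omitted:

```c
/*
 * Illustrative only: select the Type1 IOMMU backend for a VFIO
 * container.  The container/group ioctls come from the existing VFIO
 * core; "/dev/vfio/26" is a hypothetical group.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* example group */

	/* The Type1 backend advertises itself as an extension */
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) != 1) {
		fprintf(stderr, "No Type1 IOMMU support\n");
		return 1;
	}

	/* A group must be added to the container before SET_IOMMU */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	return 0;
}
```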
```
-rw-r--r--   drivers/vfio/Kconfig              6
-rw-r--r--   drivers/vfio/Makefile             2
-rw-r--r--   drivers/vfio/vfio.c               7
-rw-r--r--   drivers/vfio/vfio_iommu_type1.c 753
-rw-r--r--   include/linux/vfio.h             54
5 files changed, 821 insertions(+), 1 deletion(-)
```
```diff
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 9acb1e729bd6..128b97910b8e 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,6 +1,12 @@
+config VFIO_IOMMU_TYPE1
+        tristate
+        depends on VFIO
+        default n
+
 menuconfig VFIO
         tristate "VFIO Non-Privileged userspace driver framework"
         depends on IOMMU_API
+        select VFIO_IOMMU_TYPE1 if X86
         help
           VFIO provides a framework for secure userspace device drivers.
           See Documentation/vfio.txt for more details.
```
```diff
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 7500a67a42a0..2398d4a0e38b 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1 +1,3 @@
 obj-$(CONFIG_VFIO) += vfio.o
+obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_PCI) += pci/
```
```diff
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 052e310aed72..9591e2b509d7 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1376,6 +1376,13 @@ static int __init vfio_init(void)
 
         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
+        /*
+         * Attempt to load known iommu-drivers.  This gives us a working
+         * environment without the user needing to explicitly load iommu
+         * drivers.
+         */
+        request_module_nowait("vfio_iommu_type1");
+
         return 0;
 
 err_groups_cdev:
```
```diff
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
new file mode 100644
index 000000000000..6f3fbc48a6c7
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -0,0 +1,753 @@
+/*
+ * VFIO: IOMMU DMA mapping support for Type1 IOMMU
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * We arbitrarily define a Type1 IOMMU as one matching the below code.
+ * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
+ * VT-d, but that makes it harder to re-use as theoretically anyone
+ * implementing a similar IOMMU could make use of this.  We expect the
+ * IOMMU to support the IOMMU API and have few to no restrictions around
+ * the IOVA range that can be mapped.  The Type1 IOMMU is currently
+ * optimized for relatively static mappings of a userspace process with
+ * userspace pages pinned into memory.  We also assume devices and IOMMU
+ * domains are PCI based as the IOMMU API is still centered around a
+ * device/bus interface rather than a group interface.
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pci.h>          /* pci_bus_type */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/workqueue.h>
+
+#define DRIVER_VERSION  "0.2"
+#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
+
+static bool allow_unsafe_interrupts;
+module_param_named(allow_unsafe_interrupts,
+                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(allow_unsafe_interrupts,
+                 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
+
+struct vfio_iommu {
+        struct iommu_domain     *domain;
+        struct mutex            lock;
+        struct list_head        dma_list;
+        struct list_head        group_list;
+        bool                    cache;
+};
+
+struct vfio_dma {
+        struct list_head        next;
+        dma_addr_t              iova;           /* Device address */
+        unsigned long           vaddr;          /* Process virtual addr */
+        long                    npage;          /* Number of pages */
+        int                     prot;           /* IOMMU_READ/WRITE */
+};
+
+struct vfio_group {
+        struct iommu_group      *iommu_group;
+        struct list_head        next;
+};
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)    ((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+        struct mm_struct        *mm;
+        long                    npage;
+        struct work_struct      work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void vfio_lock_acct_bg(struct work_struct *work)
+{
+        struct vwork *vwork = container_of(work, struct vwork, work);
+        struct mm_struct *mm;
+
+        mm = vwork->mm;
+        down_write(&mm->mmap_sem);
+        mm->locked_vm += vwork->npage;
+        up_write(&mm->mmap_sem);
+        mmput(mm);
+        kfree(vwork);
+}
+
+static void vfio_lock_acct(long npage)
+{
+        struct vwork *vwork;
+        struct mm_struct *mm;
+
+        if (!current->mm)
+                return; /* process exited */
+
+        if (down_write_trylock(&current->mm->mmap_sem)) {
+                current->mm->locked_vm += npage;
+                up_write(&current->mm->mmap_sem);
+                return;
+        }
+
+        /*
+         * Couldn't get mmap_sem lock, so must setup to update
+         * mm->locked_vm later.  If locked_vm were atomic, we
+         * wouldn't need this silliness.
+         */
+        vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+        if (!vwork)
+                return;
+        mm = get_task_mm(current);
+        if (!mm) {
+                kfree(vwork);
+                return;
+        }
+        INIT_WORK(&vwork->work, vfio_lock_acct_bg);
+        vwork->mm = mm;
+        vwork->npage = npage;
+        schedule_work(&vwork->work);
+}
+
+/*
+ * Some mappings aren't backed by a struct page, for example an mmap'd
+ * MMIO range for our own or another device.  These use a different
+ * pfn conversion and shouldn't be tracked as locked pages.
+ */
+static bool is_invalid_reserved_pfn(unsigned long pfn)
+{
+        if (pfn_valid(pfn)) {
+                bool reserved;
+                struct page *tail = pfn_to_page(pfn);
+                struct page *head = compound_trans_head(tail);
+                reserved = !!(PageReserved(head));
+                if (head != tail) {
+                        /*
+                         * "head" is not a dangling pointer
+                         * (compound_trans_head takes care of that)
+                         * but the hugepage may have been split
+                         * from under us (and we may not hold a
+                         * reference count on the head page so it can
+                         * be reused before we run PageReferenced), so
+                         * we have to check PageTail before returning
+                         * what we just read.
+                         */
+                        smp_rmb();
+                        if (PageTail(tail))
+                                return reserved;
+                }
+                return PageReserved(tail);
+        }
+
+        return true;
+}
+
+static int put_pfn(unsigned long pfn, int prot)
+{
+        if (!is_invalid_reserved_pfn(pfn)) {
+                struct page *page = pfn_to_page(pfn);
+                if (prot & IOMMU_WRITE)
+                        SetPageDirty(page);
+                put_page(page);
+                return 1;
+        }
+        return 0;
+}
+
+/* Unmap DMA region */
+static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
+                                long npage, int prot)
+{
+        long i, unlocked = 0;
+
+        for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
+                unsigned long pfn;
+
+                pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
+                if (pfn) {
+                        iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+                        unlocked += put_pfn(pfn, prot);
+                }
+        }
+        return unlocked;
+}
+
+static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
+                           long npage, int prot)
+{
+        long unlocked;
+
+        unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
+        vfio_lock_acct(-unlocked);
+}
+
+static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+{
+        struct page *page[1];
+        struct vm_area_struct *vma;
+        int ret = -EFAULT;
+
+        if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+                *pfn = page_to_pfn(page[0]);
+                return 0;
+        }
+
+        down_read(&current->mm->mmap_sem);
+
+        vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+
+        if (vma && vma->vm_flags & VM_PFNMAP) {
+                *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+                if (is_invalid_reserved_pfn(*pfn))
+                        ret = 0;
+        }
+
+        up_read(&current->mm->mmap_sem);
+
+        return ret;
+}
+
+/* Map DMA region */
+static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
+                          unsigned long vaddr, long npage, int prot)
+{
+        dma_addr_t start = iova;
+        long i, locked = 0;
+        int ret;
+
+        /* Verify that pages are not already mapped */
+        for (i = 0; i < npage; i++, iova += PAGE_SIZE)
+                if (iommu_iova_to_phys(iommu->domain, iova))
+                        return -EBUSY;
+
+        iova = start;
+
+        if (iommu->cache)
+                prot |= IOMMU_CACHE;
+
+        /*
+         * XXX We break mappings into pages and use get_user_pages_fast to
+         * pin the pages in memory.  It's been suggested that mlock might
+         * provide a more efficient mechanism, but nothing prevents the
+         * user from munlocking the pages, which could then allow the user
+         * access to random host memory.  We also have no guarantee from the
+         * IOMMU API that the iommu driver can unmap sub-pages of previous
+         * mappings.  This means we might lose an entire range if a single
+         * page within it is unmapped.  Single page mappings are inefficient,
+         * but provide the most flexibility for now.
+         */
+        for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
+                unsigned long pfn = 0;
+
+                ret = vaddr_get_pfn(vaddr, prot, &pfn);
+                if (ret) {
+                        __vfio_dma_do_unmap(iommu, start, i, prot);
+                        return ret;
+                }
+
+                /*
+                 * Only add actual locked pages to accounting
+                 * XXX We're effectively marking a page locked for every
+                 * IOVA page even though it's possible the user could be
+                 * backing multiple IOVAs with the same vaddr.  This over-
+                 * penalizes the user process, but we currently have no
+                 * easy way to do this properly.
+                 */
+                if (!is_invalid_reserved_pfn(pfn))
+                        locked++;
+
+                ret = iommu_map(iommu->domain, iova,
+                                (phys_addr_t)pfn << PAGE_SHIFT,
+                                PAGE_SIZE, prot);
+                if (ret) {
+                        /* Back out mappings on error */
+                        put_pfn(pfn, prot);
+                        __vfio_dma_do_unmap(iommu, start, i, prot);
+                        return ret;
+                }
+        }
+        vfio_lock_acct(locked);
+        return 0;
+}
+
+static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
+                                  dma_addr_t start2, size_t size2)
+{
+        if (start1 < start2)
+                return (start2 - start1 < size1);
+        else if (start2 < start1)
+                return (start1 - start2 < size2);
+        return (size1 > 0 && size2 > 0);
+}
+
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+                                      dma_addr_t start, size_t size)
+{
+        struct vfio_dma *dma;
+
+        list_for_each_entry(dma, &iommu->dma_list, next) {
+                if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
+                                   start, size))
+                        return dma;
+        }
+        return NULL;
+}
+
+static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+                                    size_t size, struct vfio_dma *dma)
+{
+        struct vfio_dma *split;
+        long npage_lo, npage_hi;
+
+        /* Existing dma region is completely covered, unmap all */
+        if (start <= dma->iova &&
+            start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
+                vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
+                list_del(&dma->next);
+                npage_lo = dma->npage;
+                kfree(dma);
+                return npage_lo;
+        }
+
+        /* Overlap low address of existing range */
+        if (start <= dma->iova) {
+                size_t overlap;
+
+                overlap = start + size - dma->iova;
+                npage_lo = overlap >> PAGE_SHIFT;
+
+                vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
+                dma->iova += overlap;
+                dma->vaddr += overlap;
+                dma->npage -= npage_lo;
+                return npage_lo;
+        }
+
+        /* Overlap high address of existing range */
+        if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
+                size_t overlap;
+
+                overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
+                npage_hi = overlap >> PAGE_SHIFT;
+
+                vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
+                dma->npage -= npage_hi;
+                return npage_hi;
+        }
+
+        /* Split existing */
+        npage_lo = (start - dma->iova) >> PAGE_SHIFT;
+        npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
+
+        split = kzalloc(sizeof *split, GFP_KERNEL);
+        if (!split)
+                return -ENOMEM;
+
+        vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
+
+        dma->npage = npage_lo;
+
+        split->npage = npage_hi;
+        split->iova = start + size;
+        split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
+        split->prot = dma->prot;
+        list_add(&split->next, &iommu->dma_list);
+        return size >> PAGE_SHIFT;
+}
+
+static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
+                             struct vfio_iommu_type1_dma_unmap *unmap)
+{
+        long ret = 0, npage = unmap->size >> PAGE_SHIFT;
+        struct vfio_dma *dma, *tmp;
+        uint64_t mask;
+
+        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
+
+        if (unmap->iova & mask)
+                return -EINVAL;
+        if (unmap->size & mask)
+                return -EINVAL;
+
+        /* XXX We still break these down into PAGE_SIZE */
+        WARN_ON(mask & PAGE_MASK);
+
+        mutex_lock(&iommu->lock);
+
+        list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) {
+                if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
+                                   unmap->iova, unmap->size)) {
+                        ret = vfio_remove_dma_overlap(iommu, unmap->iova,
+                                                      unmap->size, dma);
+                        if (ret > 0)
+                                npage -= ret;
+                        if (ret < 0 || npage == 0)
+                                break;
+                }
+        }
+        mutex_unlock(&iommu->lock);
+        return ret > 0 ? 0 : (int)ret;
+}
+
+static int vfio_dma_do_map(struct vfio_iommu *iommu,
+                           struct vfio_iommu_type1_dma_map *map)
+{
+        struct vfio_dma *dma, *pdma = NULL;
+        dma_addr_t iova = map->iova;
+        unsigned long locked, lock_limit, vaddr = map->vaddr;
+        size_t size = map->size;
+        int ret = 0, prot = 0;
+        uint64_t mask;
+        long npage;
+
+        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
+
+        /* READ/WRITE from device perspective */
+        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+                prot |= IOMMU_WRITE;
+        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
+                prot |= IOMMU_READ;
+
+        if (!prot)
+                return -EINVAL; /* No READ/WRITE? */
+
+        if (vaddr & mask)
+                return -EINVAL;
+        if (iova & mask)
+                return -EINVAL;
+        if (size & mask)
+                return -EINVAL;
+
+        /* XXX We still break these down into PAGE_SIZE */
+        WARN_ON(mask & PAGE_MASK);
+
+        /* Don't allow IOVA wrap */
+        if (iova + size && iova + size < iova)
+                return -EINVAL;
+
+        /* Don't allow virtual address wrap */
+        if (vaddr + size && vaddr + size < vaddr)
+                return -EINVAL;
+
+        npage = size >> PAGE_SHIFT;
+        if (!npage)
+                return -EINVAL;
+
+        mutex_lock(&iommu->lock);
+
+        if (vfio_find_dma(iommu, iova, size)) {
+                ret = -EBUSY;
+                goto out_lock;
+        }
+
+        /* account for locked pages */
+        locked = current->mm->locked_vm + npage;
+        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+        if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+                        __func__, rlimit(RLIMIT_MEMLOCK));
+                ret = -ENOMEM;
+                goto out_lock;
+        }
+
+        ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
+        if (ret)
+                goto out_lock;
+
+        /* Check if we abut a region below - nothing below 0 */
+        if (iova) {
+                dma = vfio_find_dma(iommu, iova - 1, 1);
+                if (dma && dma->prot == prot &&
+                    dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) {
+
+                        dma->npage += npage;
+                        iova = dma->iova;
+                        vaddr = dma->vaddr;
+                        npage = dma->npage;
+                        size = NPAGE_TO_SIZE(npage);
+
+                        pdma = dma;
+                }
+        }
+
+        /* Check if we abut a region above - nothing above ~0 + 1 */
+        if (iova + size) {
+                dma = vfio_find_dma(iommu, iova + size, 1);
+                if (dma && dma->prot == prot &&
+                    dma->vaddr == vaddr + size) {
+
+                        dma->npage += npage;
+                        dma->iova = iova;
+                        dma->vaddr = vaddr;
+
+                        /*
+                         * If merged above and below, remove previously
+                         * merged entry.  New entry covers it.
+                         */
+                        if (pdma) {
+                                list_del(&pdma->next);
+                                kfree(pdma);
+                        }
+                        pdma = dma;
+                }
+        }
+
+        /* Isolated, new region */
+        if (!pdma) {
+                dma = kzalloc(sizeof *dma, GFP_KERNEL);
+                if (!dma) {
+                        ret = -ENOMEM;
+                        vfio_dma_unmap(iommu, iova, npage, prot);
+                        goto out_lock;
+                }
+
+                dma->npage = npage;
+                dma->iova = iova;
+                dma->vaddr = vaddr;
+                dma->prot = prot;
+                list_add(&dma->next, &iommu->dma_list);
+        }
+
+out_lock:
+        mutex_unlock(&iommu->lock);
+        return ret;
+}
+
+static int vfio_iommu_type1_attach_group(void *iommu_data,
+                                         struct iommu_group *iommu_group)
+{
+        struct vfio_iommu *iommu = iommu_data;
+        struct vfio_group *group, *tmp;
+        int ret;
+
+        group = kzalloc(sizeof(*group), GFP_KERNEL);
+        if (!group)
+                return -ENOMEM;
+
+        mutex_lock(&iommu->lock);
+
+        list_for_each_entry(tmp, &iommu->group_list, next) {
+                if (tmp->iommu_group == iommu_group) {
+                        mutex_unlock(&iommu->lock);
+                        kfree(group);
+                        return -EINVAL;
+                }
+        }
+
+        /*
+         * TODO: Domains have capabilities that might change as we add
+         * groups (see iommu->cache, currently never set).  Check for
+         * them and potentially disallow groups to be attached when it
+         * would change capabilities (ugh).
+         */
+        ret = iommu_attach_group(iommu->domain, iommu_group);
+        if (ret) {
+                mutex_unlock(&iommu->lock);
+                kfree(group);
+                return ret;
+        }
+
+        group->iommu_group = iommu_group;
+        list_add(&group->next, &iommu->group_list);
+
+        mutex_unlock(&iommu->lock);
+
+        return 0;
+}
+
+static void vfio_iommu_type1_detach_group(void *iommu_data,
+                                          struct iommu_group *iommu_group)
+{
+        struct vfio_iommu *iommu = iommu_data;
+        struct vfio_group *group;
+
+        mutex_lock(&iommu->lock);
+
+        list_for_each_entry(group, &iommu->group_list, next) {
+                if (group->iommu_group == iommu_group) {
+                        iommu_detach_group(iommu->domain, iommu_group);
+                        list_del(&group->next);
+                        kfree(group);
+                        break;
+                }
+        }
+
+        mutex_unlock(&iommu->lock);
+}
+
+static void *vfio_iommu_type1_open(unsigned long arg)
+{
+        struct vfio_iommu *iommu;
+
+        if (arg != VFIO_TYPE1_IOMMU)
+                return ERR_PTR(-EINVAL);
+
+        iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+        if (!iommu)
+                return ERR_PTR(-ENOMEM);
+
+        INIT_LIST_HEAD(&iommu->group_list);
+        INIT_LIST_HEAD(&iommu->dma_list);
+        mutex_init(&iommu->lock);
+
+        /*
+         * Wish we didn't have to know about bus_type here.
+         */
+        iommu->domain = iommu_domain_alloc(&pci_bus_type);
+        if (!iommu->domain) {
+                kfree(iommu);
+                return ERR_PTR(-EIO);
+        }
+
+        /*
+         * Wish we could specify required capabilities rather than create
+         * a domain, see what comes out and hope it doesn't change along
+         * the way.  Fortunately we know interrupt remapping is global for
+         * our iommus.
+         */
+        if (!allow_unsafe_interrupts &&
+            !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
+                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
+                        __func__);
+                iommu_domain_free(iommu->domain);
+                kfree(iommu);
+                return ERR_PTR(-EPERM);
+        }
+
+        return iommu;
+}
+
+static void vfio_iommu_type1_release(void *iommu_data)
+{
+        struct vfio_iommu *iommu = iommu_data;
+        struct vfio_group *group, *group_tmp;
+        struct vfio_dma *dma, *dma_tmp;
+
+        list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
+                iommu_detach_group(iommu->domain, group->iommu_group);
+                list_del(&group->next);
+                kfree(group);
+        }
+
+        list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) {
+                vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
+                list_del(&dma->next);
+                kfree(dma);
+        }
+
+        iommu_domain_free(iommu->domain);
+        iommu->domain = NULL;
+        kfree(iommu);
+}
+
+static long vfio_iommu_type1_ioctl(void *iommu_data,
+                                   unsigned int cmd, unsigned long arg)
+{
+        struct vfio_iommu *iommu = iommu_data;
+        unsigned long minsz;
+
+        if (cmd == VFIO_CHECK_EXTENSION) {
+                switch (arg) {
+                case VFIO_TYPE1_IOMMU:
+                        return 1;
+                default:
+                        return 0;
+                }
+        } else if (cmd == VFIO_IOMMU_GET_INFO) {
+                struct vfio_iommu_type1_info info;
+
+                minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+
+                if (copy_from_user(&info, (void __user *)arg, minsz))
+                        return -EFAULT;
+
+                if (info.argsz < minsz)
+                        return -EINVAL;
+
+                info.flags = 0;
+
+                info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
+
+                return copy_to_user((void __user *)arg, &info, minsz);
+
+        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
+                struct vfio_iommu_type1_dma_map map;
+                uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
+                                VFIO_DMA_MAP_FLAG_WRITE;
+
+                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+                if (copy_from_user(&map, (void __user *)arg, minsz))
+                        return -EFAULT;
+
+                if (map.argsz < minsz || map.flags & ~mask)
+                        return -EINVAL;
+
+                return vfio_dma_do_map(iommu, &map);
+
+        } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
+                struct vfio_iommu_type1_dma_unmap unmap;
+
+                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+                if (copy_from_user(&unmap, (void __user *)arg, minsz))
+                        return -EFAULT;
+
+                if (unmap.argsz < minsz || unmap.flags)
+                        return -EINVAL;
+
+                return vfio_dma_do_unmap(iommu, &unmap);
+        }
+
+        return -ENOTTY;
+}
+
+static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
+        .name           = "vfio-iommu-type1",
+        .owner          = THIS_MODULE,
+        .open           = vfio_iommu_type1_open,
+        .release        = vfio_iommu_type1_release,
+        .ioctl          = vfio_iommu_type1_ioctl,
+        .attach_group   = vfio_iommu_type1_attach_group,
+        .detach_group   = vfio_iommu_type1_detach_group,
+};
+
+static int __init vfio_iommu_type1_init(void)
+{
+        if (!iommu_present(&pci_bus_type))
+                return -ENODEV;
+
+        return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
+}
+
+static void __exit vfio_iommu_type1_cleanup(void)
+{
+        vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
+}
+
+module_init(vfio_iommu_type1_init);
+module_exit(vfio_iommu_type1_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
```
```diff
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 03e56a5154b6..acb046fd5b70 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -98,7 +98,7 @@ extern void vfio_unregister_iommu_driver(
 
 /* Extensions */
 
-/* None yet */
+#define VFIO_TYPE1_IOMMU        1
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -364,4 +364,56 @@ struct vfio_irq_set {
  */
 #define VFIO_DEVICE_RESET       _IO(VFIO_TYPE, VFIO_BASE + 11)
 
+/* -------- API for Type1 VFIO IOMMU -------- */
+
+/**
+ * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_info)
+ *
+ * Retrieve information about the IOMMU object.  Fills in provided
+ * struct vfio_iommu_info.  Caller sets argsz.
+ *
+ * XXX Should we do these by CHECK_EXTENSION too?
+ */
+struct vfio_iommu_type1_info {
+        __u32   argsz;
+        __u32   flags;
+#define VFIO_IOMMU_INFO_PGSIZES (1 << 0)        /* supported page sizes info */
+        __u64   iova_pgsizes;           /* Bitmap of supported page sizes */
+};
+
+#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map)
+ *
+ * Map process virtual addresses to IO virtual addresses using the
+ * provided struct vfio_dma_map.  Caller sets argsz.  READ &/ WRITE required.
+ */
+struct vfio_iommu_type1_dma_map {
+        __u32   argsz;
+        __u32   flags;
+#define VFIO_DMA_MAP_FLAG_READ (1 << 0)         /* readable from device */
+#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)        /* writable from device */
+        __u64   vaddr;                          /* Process virtual address */
+        __u64   iova;                           /* IO virtual address */
+        __u64   size;                           /* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/**
+ * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
+ *
+ * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
+ * Caller sets argsz.
+ */
+struct vfio_iommu_type1_dma_unmap {
+        __u32   argsz;
+        __u32   flags;
+        __u64   iova;                           /* IO virtual address */
+        __u64   size;                           /* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
 #endif /* VFIO_H */
```
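To illustrate the new ioctls, here is a hedged userspace sketch of mapping and unmapping a buffer through a container that already has the Type1 backend selected (as in the earlier sketch). The IOVA, buffer size, and assumed 4 KiB page size are arbitrary choices for the example:

```c
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Map one anonymous page at IOVA 0 for device DMA, then unmap it. */
static int dma_map_example(int container)
{
	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
	struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap) };
	size_t size = 4096;		/* assumed page size for this sketch */
	void *buf;

	/* Supported IOVA page sizes come back as a bitmap */
	if (ioctl(container, VFIO_IOMMU_GET_INFO, &info) == 0)
		printf("iova_pgsizes: 0x%llx\n",
		       (unsigned long long)info.iova_pgsizes);

	/* mmap gives us the page-aligned buffer the mask checks require */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (uintptr_t)buf;	/* process virtual address */
	map.iova = 0;			/* device address */
	map.size = size;
	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
		return -1;

	/* ... program the device to DMA to IOVA 0 ... */

	unmap.iova = 0;
	unmap.size = size;
	return ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
}
```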
