diff options
author | Alex Williamson <alex.williamson@redhat.com> | 2012-07-31 10:16:23 -0400 |
---|---|---|
committer | Alex Williamson <alex.williamson@redhat.com> | 2012-07-31 10:16:23 -0400 |
commit | 73fa0d10d077d9521ee2dace2307ae2c9a965336 (patch) | |
tree | 2c820b194dd8ea00f23d85c382e86ea6c3beb498 | |
parent | 4a5b2a20ec87384eeb19e70991e7e15a00cad87b (diff) |
vfio: Type1 IOMMU implementation
This VFIO IOMMU backend is designed primarily for AMD-Vi and Intel
VT-d hardware, but is potentially usable by anything supporting
similar mapping functionality. We arbitrarily call this a Type1
backend for lack of a better name. This backend has no IOVA
or host memory mapping restrictions for the user and is optimized
for relatively static mappings. Mapped areas are pinned into system
memory.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
-rw-r--r-- | drivers/vfio/Kconfig | 6 | ||||
-rw-r--r-- | drivers/vfio/Makefile | 2 | ||||
-rw-r--r-- | drivers/vfio/vfio.c | 7 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 753 | ||||
-rw-r--r-- | include/linux/vfio.h | 54 |
5 files changed, 821 insertions, 1 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 9acb1e729bd6..128b97910b8e 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig | |||
@@ -1,6 +1,12 @@ | |||
1 | config VFIO_IOMMU_TYPE1 | ||
2 | tristate | ||
3 | depends on VFIO | ||
4 | default n | ||
5 | |||
1 | menuconfig VFIO | 6 | menuconfig VFIO |
2 | tristate "VFIO Non-Privileged userspace driver framework" | 7 | tristate "VFIO Non-Privileged userspace driver framework" |
3 | depends on IOMMU_API | 8 | depends on IOMMU_API |
9 | select VFIO_IOMMU_TYPE1 if X86 | ||
4 | help | 10 | help |
5 | VFIO provides a framework for secure userspace device drivers. | 11 | VFIO provides a framework for secure userspace device drivers. |
6 | See Documentation/vfio.txt for more details. | 12 | See Documentation/vfio.txt for more details. |
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 7500a67a42a0..2398d4a0e38b 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile | |||
@@ -1 +1,3 @@ | |||
1 | obj-$(CONFIG_VFIO) += vfio.o | 1 | obj-$(CONFIG_VFIO) += vfio.o |
2 | obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o | ||
3 | obj-$(CONFIG_VFIO_PCI) += pci/ | ||
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 052e310aed72..9591e2b509d7 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c | |||
@@ -1376,6 +1376,13 @@ static int __init vfio_init(void) | |||
1376 | 1376 | ||
1377 | pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); | 1377 | pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); |
1378 | 1378 | ||
1379 | /* | ||
1380 | * Attempt to load known iommu-drivers. This gives us a working | ||
1381 | * environment without the user needing to explicitly load iommu | ||
1382 | * drivers. | ||
1383 | */ | ||
1384 | request_module_nowait("vfio_iommu_type1"); | ||
1385 | |||
1379 | return 0; | 1386 | return 0; |
1380 | 1387 | ||
1381 | err_groups_cdev: | 1388 | err_groups_cdev: |
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c new file mode 100644 index 000000000000..6f3fbc48a6c7 --- /dev/null +++ b/drivers/vfio/vfio_iommu_type1.c | |||
@@ -0,0 +1,753 @@ | |||
1 | /* | ||
2 | * VFIO: IOMMU DMA mapping support for Type1 IOMMU | ||
3 | * | ||
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | ||
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * Derived from original vfio: | ||
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | ||
13 | * Author: Tom Lyon, pugs@cisco.com | ||
14 | * | ||
15 | * We arbitrarily define a Type1 IOMMU as one matching the below code. | ||
16 | * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel | ||
17 | * VT-d, but that makes it harder to re-use as theoretically anyone | ||
18 | * implementing a similar IOMMU could make use of this. We expect the | ||
19 | * IOMMU to support the IOMMU API and have few to no restrictions around | ||
20 | * the IOVA range that can be mapped. The Type1 IOMMU is currently | ||
21 | * optimized for relatively static mappings of a userspace process with | ||
22 | * userspace pages pinned into memory. We also assume devices and IOMMU | ||
23 | * domains are PCI based as the IOMMU API is still centered around a | ||
24 | * device/bus interface rather than a group interface. | ||
25 | */ | ||
26 | |||
27 | #include <linux/compat.h> | ||
28 | #include <linux/device.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/iommu.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/pci.h> /* pci_bus_type */ | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/uaccess.h> | ||
37 | #include <linux/vfio.h> | ||
38 | #include <linux/workqueue.h> | ||
39 | |||
40 | #define DRIVER_VERSION "0.2" | ||
41 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | ||
42 | #define DRIVER_DESC "Type1 IOMMU driver for VFIO" | ||
43 | |||
44 | static bool allow_unsafe_interrupts; | ||
45 | module_param_named(allow_unsafe_interrupts, | ||
46 | allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); | ||
47 | MODULE_PARM_DESC(allow_unsafe_interrupts, | ||
48 | "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); | ||
49 | |||
50 | struct vfio_iommu { | ||
51 | struct iommu_domain *domain; | ||
52 | struct mutex lock; | ||
53 | struct list_head dma_list; | ||
54 | struct list_head group_list; | ||
55 | bool cache; | ||
56 | }; | ||
57 | |||
58 | struct vfio_dma { | ||
59 | struct list_head next; | ||
60 | dma_addr_t iova; /* Device address */ | ||
61 | unsigned long vaddr; /* Process virtual addr */ | ||
62 | long npage; /* Number of pages */ | ||
63 | int prot; /* IOMMU_READ/WRITE */ | ||
64 | }; | ||
65 | |||
66 | struct vfio_group { | ||
67 | struct iommu_group *iommu_group; | ||
68 | struct list_head next; | ||
69 | }; | ||
70 | |||
71 | /* | ||
72 | * This code handles mapping and unmapping of user data buffers | ||
73 | * into DMA'ble space using the IOMMU | ||
74 | */ | ||
75 | |||
76 | #define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) | ||
77 | |||
78 | struct vwork { | ||
79 | struct mm_struct *mm; | ||
80 | long npage; | ||
81 | struct work_struct work; | ||
82 | }; | ||
83 | |||
84 | /* delayed decrement/increment for locked_vm */ | ||
85 | static void vfio_lock_acct_bg(struct work_struct *work) | ||
86 | { | ||
87 | struct vwork *vwork = container_of(work, struct vwork, work); | ||
88 | struct mm_struct *mm; | ||
89 | |||
90 | mm = vwork->mm; | ||
91 | down_write(&mm->mmap_sem); | ||
92 | mm->locked_vm += vwork->npage; | ||
93 | up_write(&mm->mmap_sem); | ||
94 | mmput(mm); | ||
95 | kfree(vwork); | ||
96 | } | ||
97 | |||
98 | static void vfio_lock_acct(long npage) | ||
99 | { | ||
100 | struct vwork *vwork; | ||
101 | struct mm_struct *mm; | ||
102 | |||
103 | if (!current->mm) | ||
104 | return; /* process exited */ | ||
105 | |||
106 | if (down_write_trylock(&current->mm->mmap_sem)) { | ||
107 | current->mm->locked_vm += npage; | ||
108 | up_write(&current->mm->mmap_sem); | ||
109 | return; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Couldn't get mmap_sem lock, so must setup to update | ||
114 | * mm->locked_vm later. If locked_vm were atomic, we | ||
115 | * wouldn't need this silliness | ||
116 | */ | ||
117 | vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); | ||
118 | if (!vwork) | ||
119 | return; | ||
120 | mm = get_task_mm(current); | ||
121 | if (!mm) { | ||
122 | kfree(vwork); | ||
123 | return; | ||
124 | } | ||
125 | INIT_WORK(&vwork->work, vfio_lock_acct_bg); | ||
126 | vwork->mm = mm; | ||
127 | vwork->npage = npage; | ||
128 | schedule_work(&vwork->work); | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Some mappings aren't backed by a struct page, for example an mmap'd | ||
133 | * MMIO range for our own or another device. These use a different | ||
134 | * pfn conversion and shouldn't be tracked as locked pages. | ||
135 | */ | ||
136 | static bool is_invalid_reserved_pfn(unsigned long pfn) | ||
137 | { | ||
138 | if (pfn_valid(pfn)) { | ||
139 | bool reserved; | ||
140 | struct page *tail = pfn_to_page(pfn); | ||
141 | struct page *head = compound_trans_head(tail); | ||
142 | reserved = !!(PageReserved(head)); | ||
143 | if (head != tail) { | ||
144 | /* | ||
145 | * "head" is not a dangling pointer | ||
146 | * (compound_trans_head takes care of that) | ||
147 | * but the hugepage may have been split | ||
148 | * from under us (and we may not hold a | ||
149 | * reference count on the head page so it can | ||
150 | * be reused before we run PageReserved), so | ||
151 | * we've to check PageTail before returning | ||
152 | * what we just read. | ||
153 | */ | ||
154 | smp_rmb(); | ||
155 | if (PageTail(tail)) | ||
156 | return reserved; | ||
157 | } | ||
158 | return PageReserved(tail); | ||
159 | } | ||
160 | |||
161 | return true; | ||
162 | } | ||
163 | |||
164 | static int put_pfn(unsigned long pfn, int prot) | ||
165 | { | ||
166 | if (!is_invalid_reserved_pfn(pfn)) { | ||
167 | struct page *page = pfn_to_page(pfn); | ||
168 | if (prot & IOMMU_WRITE) | ||
169 | SetPageDirty(page); | ||
170 | put_page(page); | ||
171 | return 1; | ||
172 | } | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | /* Unmap DMA region */ | ||
177 | static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, | ||
178 | long npage, int prot) | ||
179 | { | ||
180 | long i, unlocked = 0; | ||
181 | |||
182 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) { | ||
183 | unsigned long pfn; | ||
184 | |||
185 | pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; | ||
186 | if (pfn) { | ||
187 | iommu_unmap(iommu->domain, iova, PAGE_SIZE); | ||
188 | unlocked += put_pfn(pfn, prot); | ||
189 | } | ||
190 | } | ||
191 | return unlocked; | ||
192 | } | ||
193 | |||
194 | static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, | ||
195 | long npage, int prot) | ||
196 | { | ||
197 | long unlocked; | ||
198 | |||
199 | unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); | ||
200 | vfio_lock_acct(-unlocked); | ||
201 | } | ||
202 | |||
203 | static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) | ||
204 | { | ||
205 | struct page *page[1]; | ||
206 | struct vm_area_struct *vma; | ||
207 | int ret = -EFAULT; | ||
208 | |||
209 | if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { | ||
210 | *pfn = page_to_pfn(page[0]); | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | down_read(&current->mm->mmap_sem); | ||
215 | |||
216 | vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); | ||
217 | |||
218 | if (vma && vma->vm_flags & VM_PFNMAP) { | ||
219 | *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
220 | if (is_invalid_reserved_pfn(*pfn)) | ||
221 | ret = 0; | ||
222 | } | ||
223 | |||
224 | up_read(&current->mm->mmap_sem); | ||
225 | |||
226 | return ret; | ||
227 | } | ||
228 | |||
229 | /* Map DMA region */ | ||
230 | static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, | ||
231 | unsigned long vaddr, long npage, int prot) | ||
232 | { | ||
233 | dma_addr_t start = iova; | ||
234 | long i, locked = 0; | ||
235 | int ret; | ||
236 | |||
237 | /* Verify that pages are not already mapped */ | ||
238 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) | ||
239 | if (iommu_iova_to_phys(iommu->domain, iova)) | ||
240 | return -EBUSY; | ||
241 | |||
242 | iova = start; | ||
243 | |||
244 | if (iommu->cache) | ||
245 | prot |= IOMMU_CACHE; | ||
246 | |||
247 | /* | ||
248 | * XXX We break mappings into pages and use get_user_pages_fast to | ||
249 | * pin the pages in memory. It's been suggested that mlock might | ||
250 | * provide a more efficient mechanism, but nothing prevents the | ||
251 | * user from munlocking the pages, which could then allow the user | ||
252 | * access to random host memory. We also have no guarantee from the | ||
253 | * IOMMU API that the iommu driver can unmap sub-pages of previous | ||
254 | * mappings. This means we might lose an entire range if a single | ||
255 | * page within it is unmapped. Single page mappings are inefficient, | ||
256 | * but provide the most flexibility for now. | ||
257 | */ | ||
258 | for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { | ||
259 | unsigned long pfn = 0; | ||
260 | |||
261 | ret = vaddr_get_pfn(vaddr, prot, &pfn); | ||
262 | if (ret) { | ||
263 | __vfio_dma_do_unmap(iommu, start, i, prot); | ||
264 | return ret; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Only add actual locked pages to accounting | ||
269 | * XXX We're effectively marking a page locked for every | ||
270 | * IOVA page even though it's possible the user could be | ||
271 | * backing multiple IOVAs with the same vaddr. This over- | ||
272 | * penalizes the user process, but we currently have no | ||
273 | * easy way to do this properly. | ||
274 | */ | ||
275 | if (!is_invalid_reserved_pfn(pfn)) | ||
276 | locked++; | ||
277 | |||
278 | ret = iommu_map(iommu->domain, iova, | ||
279 | (phys_addr_t)pfn << PAGE_SHIFT, | ||
280 | PAGE_SIZE, prot); | ||
281 | if (ret) { | ||
282 | /* Back out mappings on error */ | ||
283 | put_pfn(pfn, prot); | ||
284 | __vfio_dma_do_unmap(iommu, start, i, prot); | ||
285 | return ret; | ||
286 | } | ||
287 | } | ||
288 | vfio_lock_acct(locked); | ||
289 | return 0; | ||
290 | } | ||
291 | |||
292 | static inline bool ranges_overlap(dma_addr_t start1, size_t size1, | ||
293 | dma_addr_t start2, size_t size2) | ||
294 | { | ||
295 | if (start1 < start2) | ||
296 | return (start2 - start1 < size1); | ||
297 | else if (start2 < start1) | ||
298 | return (start1 - start2 < size2); | ||
299 | return (size1 > 0 && size2 > 0); | ||
300 | } | ||
301 | |||
302 | static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, | ||
303 | dma_addr_t start, size_t size) | ||
304 | { | ||
305 | struct vfio_dma *dma; | ||
306 | |||
307 | list_for_each_entry(dma, &iommu->dma_list, next) { | ||
308 | if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), | ||
309 | start, size)) | ||
310 | return dma; | ||
311 | } | ||
312 | return NULL; | ||
313 | } | ||
314 | |||
315 | static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, | ||
316 | size_t size, struct vfio_dma *dma) | ||
317 | { | ||
318 | struct vfio_dma *split; | ||
319 | long npage_lo, npage_hi; | ||
320 | |||
321 | /* Existing dma region is completely covered, unmap all */ | ||
322 | if (start <= dma->iova && | ||
323 | start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { | ||
324 | vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); | ||
325 | list_del(&dma->next); | ||
326 | npage_lo = dma->npage; | ||
327 | kfree(dma); | ||
328 | return npage_lo; | ||
329 | } | ||
330 | |||
331 | /* Overlap low address of existing range */ | ||
332 | if (start <= dma->iova) { | ||
333 | size_t overlap; | ||
334 | |||
335 | overlap = start + size - dma->iova; | ||
336 | npage_lo = overlap >> PAGE_SHIFT; | ||
337 | |||
338 | vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); | ||
339 | dma->iova += overlap; | ||
340 | dma->vaddr += overlap; | ||
341 | dma->npage -= npage_lo; | ||
342 | return npage_lo; | ||
343 | } | ||
344 | |||
345 | /* Overlap high address of existing range */ | ||
346 | if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { | ||
347 | size_t overlap; | ||
348 | |||
349 | overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; | ||
350 | npage_hi = overlap >> PAGE_SHIFT; | ||
351 | |||
352 | vfio_dma_unmap(iommu, start, npage_hi, dma->prot); | ||
353 | dma->npage -= npage_hi; | ||
354 | return npage_hi; | ||
355 | } | ||
356 | |||
357 | /* Split existing */ | ||
358 | npage_lo = (start - dma->iova) >> PAGE_SHIFT; | ||
359 | npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; | ||
360 | |||
361 | split = kzalloc(sizeof *split, GFP_KERNEL); | ||
362 | if (!split) | ||
363 | return -ENOMEM; | ||
364 | |||
365 | vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); | ||
366 | |||
367 | dma->npage = npage_lo; | ||
368 | |||
369 | split->npage = npage_hi; | ||
370 | split->iova = start + size; | ||
371 | split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; | ||
372 | split->prot = dma->prot; | ||
373 | list_add(&split->next, &iommu->dma_list); | ||
374 | return size >> PAGE_SHIFT; | ||
375 | } | ||
376 | |||
377 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | ||
378 | struct vfio_iommu_type1_dma_unmap *unmap) | ||
379 | { | ||
380 | long ret = 0, npage = unmap->size >> PAGE_SHIFT; | ||
381 | struct vfio_dma *dma, *tmp; | ||
382 | uint64_t mask; | ||
383 | |||
384 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | ||
385 | |||
386 | if (unmap->iova & mask) | ||
387 | return -EINVAL; | ||
388 | if (unmap->size & mask) | ||
389 | return -EINVAL; | ||
390 | |||
391 | /* XXX We still break these down into PAGE_SIZE */ | ||
392 | WARN_ON(mask & PAGE_MASK); | ||
393 | |||
394 | mutex_lock(&iommu->lock); | ||
395 | |||
396 | list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) { | ||
397 | if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), | ||
398 | unmap->iova, unmap->size)) { | ||
399 | ret = vfio_remove_dma_overlap(iommu, unmap->iova, | ||
400 | unmap->size, dma); | ||
401 | if (ret > 0) | ||
402 | npage -= ret; | ||
403 | if (ret < 0 || npage == 0) | ||
404 | break; | ||
405 | } | ||
406 | } | ||
407 | mutex_unlock(&iommu->lock); | ||
408 | return ret > 0 ? 0 : (int)ret; | ||
409 | } | ||
410 | |||
411 | static int vfio_dma_do_map(struct vfio_iommu *iommu, | ||
412 | struct vfio_iommu_type1_dma_map *map) | ||
413 | { | ||
414 | struct vfio_dma *dma, *pdma = NULL; | ||
415 | dma_addr_t iova = map->iova; | ||
416 | unsigned long locked, lock_limit, vaddr = map->vaddr; | ||
417 | size_t size = map->size; | ||
418 | int ret = 0, prot = 0; | ||
419 | uint64_t mask; | ||
420 | long npage; | ||
421 | |||
422 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | ||
423 | |||
424 | /* READ/WRITE from device perspective */ | ||
425 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) | ||
426 | prot |= IOMMU_WRITE; | ||
427 | if (map->flags & VFIO_DMA_MAP_FLAG_READ) | ||
428 | prot |= IOMMU_READ; | ||
429 | |||
430 | if (!prot) | ||
431 | return -EINVAL; /* No READ/WRITE? */ | ||
432 | |||
433 | if (vaddr & mask) | ||
434 | return -EINVAL; | ||
435 | if (iova & mask) | ||
436 | return -EINVAL; | ||
437 | if (size & mask) | ||
438 | return -EINVAL; | ||
439 | |||
440 | /* XXX We still break these down into PAGE_SIZE */ | ||
441 | WARN_ON(mask & PAGE_MASK); | ||
442 | |||
443 | /* Don't allow IOVA wrap */ | ||
444 | if (iova + size && iova + size < iova) | ||
445 | return -EINVAL; | ||
446 | |||
447 | /* Don't allow virtual address wrap */ | ||
448 | if (vaddr + size && vaddr + size < vaddr) | ||
449 | return -EINVAL; | ||
450 | |||
451 | npage = size >> PAGE_SHIFT; | ||
452 | if (!npage) | ||
453 | return -EINVAL; | ||
454 | |||
455 | mutex_lock(&iommu->lock); | ||
456 | |||
457 | if (vfio_find_dma(iommu, iova, size)) { | ||
458 | ret = -EBUSY; | ||
459 | goto out_lock; | ||
460 | } | ||
461 | |||
462 | /* account for locked pages */ | ||
463 | locked = current->mm->locked_vm + npage; | ||
464 | lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
465 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { | ||
466 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", | ||
467 | __func__, rlimit(RLIMIT_MEMLOCK)); | ||
468 | ret = -ENOMEM; | ||
469 | goto out_lock; | ||
470 | } | ||
471 | |||
472 | ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); | ||
473 | if (ret) | ||
474 | goto out_lock; | ||
475 | |||
476 | /* Check if we abut a region below - nothing below 0 */ | ||
477 | if (iova) { | ||
478 | dma = vfio_find_dma(iommu, iova - 1, 1); | ||
479 | if (dma && dma->prot == prot && | ||
480 | dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) { | ||
481 | |||
482 | dma->npage += npage; | ||
483 | iova = dma->iova; | ||
484 | vaddr = dma->vaddr; | ||
485 | npage = dma->npage; | ||
486 | size = NPAGE_TO_SIZE(npage); | ||
487 | |||
488 | pdma = dma; | ||
489 | } | ||
490 | } | ||
491 | |||
492 | /* Check if we abut a region above - nothing above ~0 + 1 */ | ||
493 | if (iova + size) { | ||
494 | dma = vfio_find_dma(iommu, iova + size, 1); | ||
495 | if (dma && dma->prot == prot && | ||
496 | dma->vaddr == vaddr + size) { | ||
497 | |||
498 | dma->npage += npage; | ||
499 | dma->iova = iova; | ||
500 | dma->vaddr = vaddr; | ||
501 | |||
502 | /* | ||
503 | * If merged above and below, remove previously | ||
504 | * merged entry. New entry covers it. | ||
505 | */ | ||
506 | if (pdma) { | ||
507 | list_del(&pdma->next); | ||
508 | kfree(pdma); | ||
509 | } | ||
510 | pdma = dma; | ||
511 | } | ||
512 | } | ||
513 | |||
514 | /* Isolated, new region */ | ||
515 | if (!pdma) { | ||
516 | dma = kzalloc(sizeof *dma, GFP_KERNEL); | ||
517 | if (!dma) { | ||
518 | ret = -ENOMEM; | ||
519 | vfio_dma_unmap(iommu, iova, npage, prot); | ||
520 | goto out_lock; | ||
521 | } | ||
522 | |||
523 | dma->npage = npage; | ||
524 | dma->iova = iova; | ||
525 | dma->vaddr = vaddr; | ||
526 | dma->prot = prot; | ||
527 | list_add(&dma->next, &iommu->dma_list); | ||
528 | } | ||
529 | |||
530 | out_lock: | ||
531 | mutex_unlock(&iommu->lock); | ||
532 | return ret; | ||
533 | } | ||
534 | |||
535 | static int vfio_iommu_type1_attach_group(void *iommu_data, | ||
536 | struct iommu_group *iommu_group) | ||
537 | { | ||
538 | struct vfio_iommu *iommu = iommu_data; | ||
539 | struct vfio_group *group, *tmp; | ||
540 | int ret; | ||
541 | |||
542 | group = kzalloc(sizeof(*group), GFP_KERNEL); | ||
543 | if (!group) | ||
544 | return -ENOMEM; | ||
545 | |||
546 | mutex_lock(&iommu->lock); | ||
547 | |||
548 | list_for_each_entry(tmp, &iommu->group_list, next) { | ||
549 | if (tmp->iommu_group == iommu_group) { | ||
550 | mutex_unlock(&iommu->lock); | ||
551 | kfree(group); | ||
552 | return -EINVAL; | ||
553 | } | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * TODO: Domains have capabilities that might change as we add | ||
558 | * groups (see iommu->cache, currently never set). Check for | ||
559 | * them and potentially disallow groups to be attached when it | ||
560 | * would change capabilities (ugh). | ||
561 | */ | ||
562 | ret = iommu_attach_group(iommu->domain, iommu_group); | ||
563 | if (ret) { | ||
564 | mutex_unlock(&iommu->lock); | ||
565 | kfree(group); | ||
566 | return ret; | ||
567 | } | ||
568 | |||
569 | group->iommu_group = iommu_group; | ||
570 | list_add(&group->next, &iommu->group_list); | ||
571 | |||
572 | mutex_unlock(&iommu->lock); | ||
573 | |||
574 | return 0; | ||
575 | } | ||
576 | |||
577 | static void vfio_iommu_type1_detach_group(void *iommu_data, | ||
578 | struct iommu_group *iommu_group) | ||
579 | { | ||
580 | struct vfio_iommu *iommu = iommu_data; | ||
581 | struct vfio_group *group; | ||
582 | |||
583 | mutex_lock(&iommu->lock); | ||
584 | |||
585 | list_for_each_entry(group, &iommu->group_list, next) { | ||
586 | if (group->iommu_group == iommu_group) { | ||
587 | iommu_detach_group(iommu->domain, iommu_group); | ||
588 | list_del(&group->next); | ||
589 | kfree(group); | ||
590 | break; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | mutex_unlock(&iommu->lock); | ||
595 | } | ||
596 | |||
597 | static void *vfio_iommu_type1_open(unsigned long arg) | ||
598 | { | ||
599 | struct vfio_iommu *iommu; | ||
600 | |||
601 | if (arg != VFIO_TYPE1_IOMMU) | ||
602 | return ERR_PTR(-EINVAL); | ||
603 | |||
604 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); | ||
605 | if (!iommu) | ||
606 | return ERR_PTR(-ENOMEM); | ||
607 | |||
608 | INIT_LIST_HEAD(&iommu->group_list); | ||
609 | INIT_LIST_HEAD(&iommu->dma_list); | ||
610 | mutex_init(&iommu->lock); | ||
611 | |||
612 | /* | ||
613 | * Wish we didn't have to know about bus_type here. | ||
614 | */ | ||
615 | iommu->domain = iommu_domain_alloc(&pci_bus_type); | ||
616 | if (!iommu->domain) { | ||
617 | kfree(iommu); | ||
618 | return ERR_PTR(-EIO); | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Wish we could specify required capabilities rather than create | ||
623 | * a domain, see what comes out and hope it doesn't change along | ||
624 | * the way. Fortunately we know interrupt remapping is global for | ||
625 | * our iommus. | ||
626 | */ | ||
627 | if (!allow_unsafe_interrupts && | ||
628 | !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { | ||
629 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | ||
630 | __func__); | ||
631 | iommu_domain_free(iommu->domain); | ||
632 | kfree(iommu); | ||
633 | return ERR_PTR(-EPERM); | ||
634 | } | ||
635 | |||
636 | return iommu; | ||
637 | } | ||
638 | |||
639 | static void vfio_iommu_type1_release(void *iommu_data) | ||
640 | { | ||
641 | struct vfio_iommu *iommu = iommu_data; | ||
642 | struct vfio_group *group, *group_tmp; | ||
643 | struct vfio_dma *dma, *dma_tmp; | ||
644 | |||
645 | list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { | ||
646 | iommu_detach_group(iommu->domain, group->iommu_group); | ||
647 | list_del(&group->next); | ||
648 | kfree(group); | ||
649 | } | ||
650 | |||
651 | list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) { | ||
652 | vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); | ||
653 | list_del(&dma->next); | ||
654 | kfree(dma); | ||
655 | } | ||
656 | |||
657 | iommu_domain_free(iommu->domain); | ||
658 | iommu->domain = NULL; | ||
659 | kfree(iommu); | ||
660 | } | ||
661 | |||
662 | static long vfio_iommu_type1_ioctl(void *iommu_data, | ||
663 | unsigned int cmd, unsigned long arg) | ||
664 | { | ||
665 | struct vfio_iommu *iommu = iommu_data; | ||
666 | unsigned long minsz; | ||
667 | |||
668 | if (cmd == VFIO_CHECK_EXTENSION) { | ||
669 | switch (arg) { | ||
670 | case VFIO_TYPE1_IOMMU: | ||
671 | return 1; | ||
672 | default: | ||
673 | return 0; | ||
674 | } | ||
675 | } else if (cmd == VFIO_IOMMU_GET_INFO) { | ||
676 | struct vfio_iommu_type1_info info; | ||
677 | |||
678 | minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); | ||
679 | |||
680 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
681 | return -EFAULT; | ||
682 | |||
683 | if (info.argsz < minsz) | ||
684 | return -EINVAL; | ||
685 | |||
686 | info.flags = 0; | ||
687 | |||
688 | info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; | ||
689 | |||
690 | return copy_to_user((void __user *)arg, &info, minsz); | ||
691 | |||
692 | } else if (cmd == VFIO_IOMMU_MAP_DMA) { | ||
693 | struct vfio_iommu_type1_dma_map map; | ||
694 | uint32_t mask = VFIO_DMA_MAP_FLAG_READ | | ||
695 | VFIO_DMA_MAP_FLAG_WRITE; | ||
696 | |||
697 | minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); | ||
698 | |||
699 | if (copy_from_user(&map, (void __user *)arg, minsz)) | ||
700 | return -EFAULT; | ||
701 | |||
702 | if (map.argsz < minsz || map.flags & ~mask) | ||
703 | return -EINVAL; | ||
704 | |||
705 | return vfio_dma_do_map(iommu, &map); | ||
706 | |||
707 | } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { | ||
708 | struct vfio_iommu_type1_dma_unmap unmap; | ||
709 | |||
710 | minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); | ||
711 | |||
712 | if (copy_from_user(&unmap, (void __user *)arg, minsz)) | ||
713 | return -EFAULT; | ||
714 | |||
715 | if (unmap.argsz < minsz || unmap.flags) | ||
716 | return -EINVAL; | ||
717 | |||
718 | return vfio_dma_do_unmap(iommu, &unmap); | ||
719 | } | ||
720 | |||
721 | return -ENOTTY; | ||
722 | } | ||
723 | |||
724 | static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { | ||
725 | .name = "vfio-iommu-type1", | ||
726 | .owner = THIS_MODULE, | ||
727 | .open = vfio_iommu_type1_open, | ||
728 | .release = vfio_iommu_type1_release, | ||
729 | .ioctl = vfio_iommu_type1_ioctl, | ||
730 | .attach_group = vfio_iommu_type1_attach_group, | ||
731 | .detach_group = vfio_iommu_type1_detach_group, | ||
732 | }; | ||
733 | |||
734 | static int __init vfio_iommu_type1_init(void) | ||
735 | { | ||
736 | if (!iommu_present(&pci_bus_type)) | ||
737 | return -ENODEV; | ||
738 | |||
739 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); | ||
740 | } | ||
741 | |||
742 | static void __exit vfio_iommu_type1_cleanup(void) | ||
743 | { | ||
744 | vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); | ||
745 | } | ||
746 | |||
747 | module_init(vfio_iommu_type1_init); | ||
748 | module_exit(vfio_iommu_type1_cleanup); | ||
749 | |||
750 | MODULE_VERSION(DRIVER_VERSION); | ||
751 | MODULE_LICENSE("GPL v2"); | ||
752 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
753 | MODULE_DESCRIPTION(DRIVER_DESC); | ||
diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 03e56a5154b6..acb046fd5b70 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h | |||
@@ -98,7 +98,7 @@ extern void vfio_unregister_iommu_driver( | |||
98 | 98 | ||
99 | /* Extensions */ | 99 | /* Extensions */ |
100 | 100 | ||
101 | /* None yet */ | 101 | #define VFIO_TYPE1_IOMMU 1 |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * The IOCTL interface is designed for extensibility by embedding the | 104 | * The IOCTL interface is designed for extensibility by embedding the |
@@ -364,4 +364,56 @@ struct vfio_irq_set { | |||
364 | */ | 364 | */ |
365 | #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) | 365 | #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) |
366 | 366 | ||
367 | /* -------- API for Type1 VFIO IOMMU -------- */ | ||
368 | |||
369 | /** | ||
370 | * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_type1_info) | ||
371 | * | ||
372 | * Retrieve information about the IOMMU object. Fills in provided | ||
373 | * struct vfio_iommu_type1_info. Caller sets argsz. | ||
374 | * | ||
375 | * XXX Should we do these by CHECK_EXTENSION too? | ||
376 | */ | ||
377 | struct vfio_iommu_type1_info { | ||
378 | __u32 argsz; | ||
379 | __u32 flags; | ||
380 | #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ | ||
381 | __u64 iova_pgsizes; /* Bitmap of supported page sizes */ | ||
382 | }; | ||
383 | |||
384 | #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) | ||
385 | |||
386 | /** | ||
387 | * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_iommu_type1_dma_map) | ||
388 | * | ||
389 | * Map process virtual addresses to IO virtual addresses using the | ||
390 | * provided struct vfio_iommu_type1_dma_map. Caller sets argsz. READ and/or WRITE required. | ||
391 | */ | ||
392 | struct vfio_iommu_type1_dma_map { | ||
393 | __u32 argsz; | ||
394 | __u32 flags; | ||
395 | #define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ | ||
396 | #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ | ||
397 | __u64 vaddr; /* Process virtual address */ | ||
398 | __u64 iova; /* IO virtual address */ | ||
399 | __u64 size; /* Size of mapping (bytes) */ | ||
400 | }; | ||
401 | |||
402 | #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) | ||
403 | |||
404 | /** | ||
405 | * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_iommu_type1_dma_unmap) | ||
406 | * | ||
407 | * Unmap IO virtual addresses using the provided struct vfio_iommu_type1_dma_unmap. | ||
408 | * Caller sets argsz. | ||
409 | */ | ||
410 | struct vfio_iommu_type1_dma_unmap { | ||
411 | __u32 argsz; | ||
412 | __u32 flags; | ||
413 | __u64 iova; /* IO virtual address */ | ||
414 | __u64 size; /* Size of mapping (bytes) */ | ||
415 | }; | ||
416 | |||
417 | #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) | ||
418 | |||
367 | #endif /* VFIO_H */ | 419 | #endif /* VFIO_H */ |