author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-10 17:50:08 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-10 17:50:08 -0400
commit	15a49b9a90c86c6cb7f270a699d2ae7468862c28 (patch)
tree	218fc6bf3ae5beb989cddaede2e4ee0c2e679845 /drivers/vfio
parent	8d10aae2741ec9ffd53c8d214f7ada6d543b3a46 (diff)
parent	8d38ef1948bd415a5cb653a5c0ec16f3402aaca1 (diff)
Merge tag 'vfio-v3.11' of git://github.com/awilliam/linux-vfio

Pull vfio updates from Alex Williamson:
 "Largely hugepage support for vfio/type1 iommu and surrounding cleanups
  and fixes"

* tag 'vfio-v3.11' of git://github.com/awilliam/linux-vfio:
  vfio/type1: Fix leak on error path
  vfio: Limit group opens
  vfio/type1: Fix missed frees and zero sized removes
  vfio: fix documentation
  vfio: Provide module option to disable vfio_iommu_type1 hugepage support
  vfio: hugepage support for vfio_iommu_type1
  vfio: Convert type1 iommu to use rbtree
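For context before the diff: the type1 backend changed below is driven from userspace through container ioctls, and the hugepage work changes how a single large VFIO_IOMMU_MAP_DMA request is pinned and handed to the IOMMU. The following is a minimal, hypothetical userspace sketch, not part of this merge: it assumes a /dev/vfio/vfio container fd that already has a group attached and VFIO_TYPE1_IOMMU selected, the map_buffer() helper name is invented, and error handling is omitted.

/* Hypothetical sketch: map one large buffer through the type1 IOMMU.
 * Assumes 'container' already has a group attached and VFIO_TYPE1_IOMMU
 * selected; error handling is omitted for brevity. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static int map_buffer(int container, unsigned long long iova, size_t len)
{
	struct vfio_iommu_type1_dma_map map;
	void *buf;

	/* One large, page-aligned mapping lets the kernel pin runs of
	 * consecutive pfns and hand the IOMMU larger chunks. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (unsigned long long)(uintptr_t)buf;
	map.iova  = iova;
	map.size  = len;

	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}

Whether the IOMMU actually ends up using hugepages depends on the hardware and on the new disable_hugepages module parameter added in the diff below.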
Diffstat (limited to 'drivers/vfio')
-rw-r--r--	drivers/vfio/vfio.c	14
-rw-r--r--	drivers/vfio/vfio_iommu_type1.c	626
2 files changed, 415 insertions, 225 deletions
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 259ad282ae5d..c488da5db7c7 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -76,6 +76,7 @@ struct vfio_group {
 	struct notifier_block		nb;
 	struct list_head		vfio_next;
 	struct list_head		container_next;
+	atomic_t			opened;
 };
 
 struct vfio_device {
@@ -206,6 +207,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 	INIT_LIST_HEAD(&group->device_list);
 	mutex_init(&group->device_lock);
 	atomic_set(&group->container_users, 0);
+	atomic_set(&group->opened, 0);
 	group->iommu_group = iommu_group;
 
 	group->nb.notifier_call = vfio_iommu_group_notifier;
@@ -1236,12 +1238,22 @@ static long vfio_group_fops_compat_ioctl(struct file *filep,
 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
 {
 	struct vfio_group *group;
+	int opened;
 
 	group = vfio_group_get_from_minor(iminor(inode));
 	if (!group)
 		return -ENODEV;
 
+	/* Do we need multiple instances of the group open? Seems not. */
+	opened = atomic_cmpxchg(&group->opened, 0, 1);
+	if (opened) {
+		vfio_group_put(group);
+		return -EBUSY;
+	}
+
+	/* Is something still in use from a previous open? */
 	if (group->container) {
+		atomic_dec(&group->opened);
 		vfio_group_put(group);
 		return -EBUSY;
 	}
@@ -1259,6 +1271,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep)
 
 	vfio_group_try_dissolve_container(group);
 
+	atomic_dec(&group->opened);
+
 	vfio_group_put(group);
 
 	return 0;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f3fbc48a6c7..a9807dea3887 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/pci.h>		/* pci_bus_type */
+#include <linux/rbtree.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
@@ -47,19 +48,25 @@ module_param_named(allow_unsafe_interrupts,
 MODULE_PARM_DESC(allow_unsafe_interrupts,
 		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
 
+static bool disable_hugepages;
+module_param_named(disable_hugepages,
+		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable_hugepages,
+		 "Disable VFIO IOMMU support for IOMMU hugepages.");
+
 struct vfio_iommu {
 	struct iommu_domain	*domain;
 	struct mutex		lock;
-	struct list_head	dma_list;
+	struct rb_root		dma_list;
 	struct list_head	group_list;
 	bool			cache;
 };
 
 struct vfio_dma {
-	struct list_head	next;
+	struct rb_node		node;
 	dma_addr_t		iova;		/* Device address */
 	unsigned long		vaddr;		/* Process virtual addr */
-	long			npage;		/* Number of pages */
+	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
 };
 
@@ -73,7 +80,48 @@ struct vfio_group {
  * into DMA'ble space using the IOMMU
  */
 
-#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+				      dma_addr_t start, size_t size)
+{
+	struct rb_node *node = iommu->dma_list.rb_node;
+
+	while (node) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+		if (start + size <= dma->iova)
+			node = node->rb_left;
+		else if (start >= dma->iova + dma->size)
+			node = node->rb_right;
+		else
+			return dma;
+	}
+
+	return NULL;
+}
+
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+	struct vfio_dma *dma;
+
+	while (*link) {
+		parent = *link;
+		dma = rb_entry(parent, struct vfio_dma, node);
+
+		if (new->iova + new->size <= dma->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+	rb_erase(&old->node, &iommu->dma_list);
+}
 
 struct vwork {
 	struct mm_struct	*mm;
@@ -100,8 +148,8 @@ static void vfio_lock_acct(long npage)
 	struct vwork *vwork;
 	struct mm_struct *mm;
 
-	if (!current->mm)
-		return; /* process exited */
+	if (!current->mm || !npage)
+		return; /* process exited or nothing to do */
 
 	if (down_write_trylock(&current->mm->mmap_sem)) {
 		current->mm->locked_vm += npage;
@@ -173,33 +221,6 @@ static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-/* Unmap DMA region */
-static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-				long npage, int prot)
-{
-	long i, unlocked = 0;
-
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
-		unsigned long pfn;
-
-		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
-		if (pfn) {
-			iommu_unmap(iommu->domain, iova, PAGE_SIZE);
-			unlocked += put_pfn(pfn, prot);
-		}
-	}
-	return unlocked;
-}
-
-static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-			   long npage, int prot)
-{
-	long unlocked;
-
-	unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
-	vfio_lock_acct(-unlocked);
-}
-
 static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 {
 	struct page *page[1];
@@ -226,198 +247,306 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 	return ret;
 }
 
-/* Map DMA region */
-static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
-			  unsigned long vaddr, long npage, int prot)
+/*
+ * Attempt to pin pages. We really don't want to track all the pfns and
+ * the iommu can only map chunks of consecutive pfns anyway, so get the
+ * first page and all consecutive pages with the same locking.
+ */
+static long vfio_pin_pages(unsigned long vaddr, long npage,
+			   int prot, unsigned long *pfn_base)
 {
-	dma_addr_t start = iova;
-	long i, locked = 0;
-	int ret;
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = capable(CAP_IPC_LOCK);
+	long ret, i;
 
-	/* Verify that pages are not already mapped */
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
-		if (iommu_iova_to_phys(iommu->domain, iova))
-			return -EBUSY;
+	if (!current->mm)
+		return -ENODEV;
 
-	iova = start;
+	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	if (ret)
+		return ret;
 
-	if (iommu->cache)
-		prot |= IOMMU_CACHE;
+	if (is_invalid_reserved_pfn(*pfn_base))
+		return 1;
 
-	/*
-	 * XXX We break mappings into pages and use get_user_pages_fast to
-	 * pin the pages in memory. It's been suggested that mlock might
-	 * provide a more efficient mechanism, but nothing prevents the
-	 * user from munlocking the pages, which could then allow the user
-	 * access to random host memory. We also have no guarantee from the
-	 * IOMMU API that the iommu driver can unmap sub-pages of previous
-	 * mappings. This means we might lose an entire range if a single
-	 * page within it is unmapped. Single page mappings are inefficient,
-	 * but provide the most flexibility for now.
-	 */
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
+	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+			limit << PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	if (unlikely(disable_hugepages)) {
+		vfio_lock_acct(1);
+		return 1;
+	}
+
+	/* Lock all the consecutive pages from pfn_base */
+	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
 		ret = vaddr_get_pfn(vaddr, prot, &pfn);
-		if (ret) {
-			__vfio_dma_do_unmap(iommu, start, i, prot);
-			return ret;
-		}
+		if (ret)
+			break;
 
-		/*
-		 * Only add actual locked pages to accounting
-		 * XXX We're effectively marking a page locked for every
-		 * IOVA page even though it's possible the user could be
-		 * backing multiple IOVAs with the same vaddr. This over-
-		 * penalizes the user process, but we currently have no
-		 * easy way to do this properly.
-		 */
-		if (!is_invalid_reserved_pfn(pfn))
-			locked++;
+		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+			put_pfn(pfn, prot);
+			break;
+		}
 
-		ret = iommu_map(iommu->domain, iova,
-				(phys_addr_t)pfn << PAGE_SHIFT,
-				PAGE_SIZE, prot);
-		if (ret) {
-			/* Back out mappings on error */
+		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
 			put_pfn(pfn, prot);
-			__vfio_dma_do_unmap(iommu, start, i, prot);
-			return ret;
+			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+				__func__, limit << PAGE_SHIFT);
+			break;
 		}
 	}
-	vfio_lock_acct(locked);
-	return 0;
+
+	vfio_lock_acct(i);
+
+	return i;
 }
 
-static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
-				  dma_addr_t start2, size_t size2)
+static long vfio_unpin_pages(unsigned long pfn, long npage,
+			     int prot, bool do_accounting)
 {
-	if (start1 < start2)
-		return (start2 - start1 < size1);
-	else if (start2 < start1)
-		return (start1 - start2 < size2);
-	return (size1 > 0 && size2 > 0);
+	unsigned long unlocked = 0;
+	long i;
+
+	for (i = 0; i < npage; i++)
+		unlocked += put_pfn(pfn++, prot);
+
+	if (do_accounting)
+		vfio_lock_acct(-unlocked);
+
+	return unlocked;
 }
 
-static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
-				      dma_addr_t start, size_t size)
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    dma_addr_t iova, size_t *size)
 {
-	struct vfio_dma *dma;
+	dma_addr_t start = iova, end = iova + *size;
+	long unlocked = 0;
 
-	list_for_each_entry(dma, &iommu->dma_list, next) {
-		if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
-				   start, size))
-			return dma;
+	while (iova < end) {
+		size_t unmapped;
+		phys_addr_t phys;
+
+		/*
+		 * We use the IOMMU to track the physical address. This
+		 * saves us from having a lot more entries in our mapping
+		 * tree. The downside is that we don't track the size
+		 * used to do the mapping. We request unmap of a single
+		 * page, but expect IOMMUs that support large pages to
+		 * unmap a larger chunk.
+		 */
+		phys = iommu_iova_to_phys(iommu->domain, iova);
+		if (WARN_ON(!phys)) {
+			iova += PAGE_SIZE;
+			continue;
+		}
+
+		unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+		if (!unmapped)
+			break;
+
+		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
+					     unmapped >> PAGE_SHIFT,
+					     dma->prot, false);
+		iova += unmapped;
 	}
-	return NULL;
+
+	vfio_lock_acct(-unlocked);
+
+	*size = iova - start;
+
+	return 0;
 }
 
-static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
-				    size_t size, struct vfio_dma *dma)
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+				   size_t *size, struct vfio_dma *dma)
 {
+	size_t offset, overlap, tmp;
 	struct vfio_dma *split;
-	long npage_lo, npage_hi;
+	int ret;
 
-	/* Existing dma region is completely covered, unmap all */
-	if (start <= dma->iova &&
-	    start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
-		vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
-		list_del(&dma->next);
-		npage_lo = dma->npage;
+	if (!*size)
+		return 0;
+
+	/*
+	 * Existing dma region is completely covered, unmap all. This is
+	 * the likely case since userspace tends to map and unmap buffers
+	 * in one shot rather than multiple mappings within a buffer.
+	 */
+	if (likely(start <= dma->iova &&
+		   start + *size >= dma->iova + dma->size)) {
+		*size = dma->size;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+		if (ret)
+			return ret;
+
+		/*
+		 * Did we remove more than we have? Should never happen
+		 * since a vfio_dma is contiguous in iova and vaddr.
+		 */
+		WARN_ON(*size != dma->size);
+
+		vfio_remove_dma(iommu, dma);
 		kfree(dma);
-		return npage_lo;
+		return 0;
 	}
 
 	/* Overlap low address of existing range */
 	if (start <= dma->iova) {
-		size_t overlap;
+		overlap = start + *size - dma->iova;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+		if (ret)
+			return ret;
 
-		overlap = start + size - dma->iova;
-		npage_lo = overlap >> PAGE_SHIFT;
+		vfio_remove_dma(iommu, dma);
 
-		vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
-		dma->iova += overlap;
-		dma->vaddr += overlap;
-		dma->npage -= npage_lo;
-		return npage_lo;
+		/*
+		 * Check, we may have removed to whole vfio_dma. If not
+		 * fixup and re-insert.
+		 */
+		if (overlap < dma->size) {
+			dma->iova += overlap;
+			dma->vaddr += overlap;
+			dma->size -= overlap;
+			vfio_insert_dma(iommu, dma);
+		} else
+			kfree(dma);
+
+		*size = overlap;
+		return 0;
 	}
 
 	/* Overlap high address of existing range */
-	if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
-		size_t overlap;
+	if (start + *size >= dma->iova + dma->size) {
+		offset = start - dma->iova;
+		overlap = dma->size - offset;
 
-		overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
-		npage_hi = overlap >> PAGE_SHIFT;
+		ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+		if (ret)
+			return ret;
 
-		vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
-		dma->npage -= npage_hi;
-		return npage_hi;
+		dma->size -= overlap;
+		*size = overlap;
+		return 0;
 	}
 
 	/* Split existing */
-	npage_lo = (start - dma->iova) >> PAGE_SHIFT;
-	npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
 
-	split = kzalloc(sizeof *split, GFP_KERNEL);
+	/*
+	 * Allocate our tracking structure early even though it may not
+	 * be used. An Allocation failure later loses track of pages and
+	 * is more difficult to unwind.
+	 */
+	split = kzalloc(sizeof(*split), GFP_KERNEL);
 	if (!split)
 		return -ENOMEM;
 
-	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
+	offset = start - dma->iova;
+
+	ret = vfio_unmap_unpin(iommu, dma, start, size);
+	if (ret || !*size) {
+		kfree(split);
+		return ret;
+	}
+
+	tmp = dma->size;
 
-	dma->npage = npage_lo;
+	/* Resize the lower vfio_dma in place, before the below insert */
+	dma->size = offset;
 
-	split->npage = npage_hi;
-	split->iova = start + size;
-	split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
-	split->prot = dma->prot;
-	list_add(&split->next, &iommu->dma_list);
-	return size >> PAGE_SHIFT;
+	/* Insert new for remainder, assuming it didn't all get unmapped */
+	if (likely(offset + *size < tmp)) {
+		split->size = tmp - offset - *size;
+		split->iova = dma->iova + offset + *size;
+		split->vaddr = dma->vaddr + offset + *size;
+		split->prot = dma->prot;
+		vfio_insert_dma(iommu, split);
+	} else
+		kfree(split);
+
+	return 0;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
-	long ret = 0, npage = unmap->size >> PAGE_SHIFT;
-	struct vfio_dma *dma, *tmp;
 	uint64_t mask;
+	struct vfio_dma *dma;
+	size_t unmapped = 0, size;
+	int ret = 0;
 
 	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 
 	if (unmap->iova & mask)
 		return -EINVAL;
-	if (unmap->size & mask)
+	if (!unmap->size || unmap->size & mask)
 		return -EINVAL;
 
-	/* XXX We still break these down into PAGE_SIZE */
 	WARN_ON(mask & PAGE_MASK);
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) {
-		if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
-				   unmap->iova, unmap->size)) {
-			ret = vfio_remove_dma_overlap(iommu, unmap->iova,
-						      unmap->size, dma);
-			if (ret > 0)
-				npage -= ret;
-			if (ret < 0 || npage == 0)
-				break;
-		}
+	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
+		size = unmap->size;
+		ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
+		if (ret || !size)
+			break;
+		unmapped += size;
 	}
+
 	mutex_unlock(&iommu->lock);
-	return ret > 0 ? 0 : (int)ret;
+
+	/*
+	 * We may unmap more than requested, update the unmap struct so
+	 * userspace can know.
+	 */
+	unmap->size = unmapped;
+
+	return ret;
+}
+
+/*
+ * Turns out AMD IOMMU has a page table bug where it won't map large pages
+ * to a region that previously mapped smaller pages. This should be fixed
+ * soon, so this is just a temporary workaround to break mappings down into
+ * PAGE_SIZE. Better to map smaller pages than nothing.
+ */
+static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
+			  unsigned long pfn, long npage, int prot)
+{
+	long i;
+	int ret;
+
+	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
+		ret = iommu_map(iommu->domain, iova,
+				(phys_addr_t)pfn << PAGE_SHIFT,
+				PAGE_SIZE, prot);
+		if (ret)
+			break;
+	}
+
+	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
+		iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+
+	return ret;
 }
 
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
-	struct vfio_dma *dma, *pdma = NULL;
-	dma_addr_t iova = map->iova;
-	unsigned long locked, lock_limit, vaddr = map->vaddr;
+	dma_addr_t end, iova;
+	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
+	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
-	long npage;
+
+	end = map->iova + map->size;
 
 	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 
@@ -430,104 +559,144 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (!prot)
 		return -EINVAL; /* No READ/WRITE? */
 
+	if (iommu->cache)
+		prot |= IOMMU_CACHE;
+
 	if (vaddr & mask)
 		return -EINVAL;
-	if (iova & mask)
+	if (map->iova & mask)
 		return -EINVAL;
-	if (size & mask)
+	if (!map->size || map->size & mask)
 		return -EINVAL;
 
-	/* XXX We still break these down into PAGE_SIZE */
 	WARN_ON(mask & PAGE_MASK);
 
 	/* Don't allow IOVA wrap */
-	if (iova + size && iova + size < iova)
+	if (end && end < map->iova)
 		return -EINVAL;
 
 	/* Don't allow virtual address wrap */
-	if (vaddr + size && vaddr + size < vaddr)
-		return -EINVAL;
-
-	npage = size >> PAGE_SHIFT;
-	if (!npage)
+	if (vaddr + map->size && vaddr + map->size < vaddr)
 		return -EINVAL;
 
 	mutex_lock(&iommu->lock);
 
-	if (vfio_find_dma(iommu, iova, size)) {
-		ret = -EBUSY;
-		goto out_lock;
+	if (vfio_find_dma(iommu, map->iova, map->size)) {
+		mutex_unlock(&iommu->lock);
+		return -EEXIST;
 	}
 
-	/* account for locked pages */
-	locked = current->mm->locked_vm + npage;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
-			__func__, rlimit(RLIMIT_MEMLOCK));
-		ret = -ENOMEM;
-		goto out_lock;
-	}
+	for (iova = map->iova; iova < end; iova += size, vaddr += size) {
+		struct vfio_dma *dma = NULL;
+		unsigned long pfn;
+		long i;
+
+		/* Pin a contiguous chunk of memory */
+		npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
+				       prot, &pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
 
-	ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
-	if (ret)
-		goto out_lock;
+		/* Verify pages are not already mapped */
+		for (i = 0; i < npage; i++) {
+			if (iommu_iova_to_phys(iommu->domain,
+					       iova + (i << PAGE_SHIFT))) {
+				vfio_unpin_pages(pfn, npage, prot, true);
+				ret = -EBUSY;
+				break;
+			}
+		}
 
-	/* Check if we abut a region below - nothing below 0 */
-	if (iova) {
-		dma = vfio_find_dma(iommu, iova - 1, 1);
-		if (dma && dma->prot == prot &&
-		    dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) {
+		ret = iommu_map(iommu->domain, iova,
+				(phys_addr_t)pfn << PAGE_SHIFT,
+				npage << PAGE_SHIFT, prot);
+		if (ret) {
+			if (ret != -EBUSY ||
+			    map_try_harder(iommu, iova, pfn, npage, prot)) {
+				vfio_unpin_pages(pfn, npage, prot, true);
+				break;
+			}
+		}
 
-			dma->npage += npage;
-			iova = dma->iova;
-			vaddr = dma->vaddr;
-			npage = dma->npage;
-			size = NPAGE_TO_SIZE(npage);
+		size = npage << PAGE_SHIFT;
 
-			pdma = dma;
+		/*
+		 * Check if we abut a region below - nothing below 0.
+		 * This is the most likely case when mapping chunks of
+		 * physically contiguous regions within a virtual address
+		 * range. Update the abutting entry in place since iova
+		 * doesn't change.
+		 */
+		if (likely(iova)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iova - 1, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr + tmp->size == vaddr) {
+				tmp->size += size;
+				iova = tmp->iova;
+				size = tmp->size;
+				vaddr = tmp->vaddr;
+				dma = tmp;
+			}
+		}
+
+		/*
+		 * Check if we abut a region above - nothing above ~0 + 1.
+		 * If we abut above and below, remove and free. If only
+		 * abut above, remove, modify, reinsert.
+		 */
+		if (likely(iova + size)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iova + size, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr == vaddr + size) {
+				vfio_remove_dma(iommu, tmp);
+				if (dma) {
+					dma->size += tmp->size;
+					kfree(tmp);
+				} else {
+					size += tmp->size;
+					tmp->size = size;
+					tmp->iova = iova;
+					tmp->vaddr = vaddr;
+					vfio_insert_dma(iommu, tmp);
+					dma = tmp;
+				}
+			}
 		}
-	}
 
-	/* Check if we abut a region above - nothing above ~0 + 1 */
-	if (iova + size) {
-		dma = vfio_find_dma(iommu, iova + size, 1);
-		if (dma && dma->prot == prot &&
-		    dma->vaddr == vaddr + size) {
+		if (!dma) {
+			dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+			if (!dma) {
+				iommu_unmap(iommu->domain, iova, size);
+				vfio_unpin_pages(pfn, npage, prot, true);
+				ret = -ENOMEM;
+				break;
+			}
 
-			dma->npage += npage;
+			dma->size = size;
 			dma->iova = iova;
 			dma->vaddr = vaddr;
-
-			/*
-			 * If merged above and below, remove previously
-			 * merged entry. New entry covers it.
-			 */
-			if (pdma) {
-				list_del(&pdma->next);
-				kfree(pdma);
-			}
-			pdma = dma;
+			dma->prot = prot;
+			vfio_insert_dma(iommu, dma);
 		}
 	}
 
-	/* Isolated, new region */
-	if (!pdma) {
-		dma = kzalloc(sizeof *dma, GFP_KERNEL);
-		if (!dma) {
-			ret = -ENOMEM;
-			vfio_dma_unmap(iommu, iova, npage, prot);
-			goto out_lock;
+	if (ret) {
+		struct vfio_dma *tmp;
+		iova = map->iova;
+		size = map->size;
+		while ((tmp = vfio_find_dma(iommu, iova, size))) {
+			int r = vfio_remove_dma_overlap(iommu, iova,
+							&size, tmp);
+			if (WARN_ON(r || !size))
+				break;
 		}
-
-		dma->npage = npage;
-		dma->iova = iova;
-		dma->vaddr = vaddr;
-		dma->prot = prot;
-		list_add(&dma->next, &iommu->dma_list);
 	}
 
-out_lock:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
@@ -606,7 +775,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&iommu->group_list);
-	INIT_LIST_HEAD(&iommu->dma_list);
+	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
 
 	/*
@@ -640,7 +809,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_group *group, *group_tmp;
-	struct vfio_dma *dma, *dma_tmp;
+	struct rb_node *node;
 
 	list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
 		iommu_detach_group(iommu->domain, group->iommu_group);
@@ -648,10 +817,12 @@ static void vfio_iommu_type1_release(void *iommu_data)
 		kfree(group);
 	}
 
-	list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) {
-		vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
-		list_del(&dma->next);
-		kfree(dma);
+	while ((node = rb_first(&iommu->dma_list))) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+		size_t size = dma->size;
+		vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
+		if (WARN_ON(!size))
+			break;
 	}
 
 	iommu_domain_free(iommu->domain);
@@ -706,6 +877,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 		struct vfio_iommu_type1_dma_unmap unmap;
+		long ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
@@ -715,7 +887,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		if (unmap.argsz < minsz || unmap.flags)
 			return -EINVAL;
 
-		return vfio_dma_do_unmap(iommu, &unmap);
+		ret = vfio_dma_do_unmap(iommu, &unmap);
+		if (ret)
+			return ret;
+
+		return copy_to_user((void __user *)arg, &unmap, minsz);
 	}
 
 	return -ENOTTY;
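
A side note on the UNMAP_DMA change at the end of the diff: because the IOMMU may unmap a larger chunk than the single page requested, vfio_dma_do_unmap() now accumulates the bytes actually unmapped and the ioctl copies the struct back to userspace. The following is a hedged sketch of reading that back, under the same assumptions as the earlier example (configured container fd, invented helper name, no error handling):

/* Hypothetical sketch: unmap an IOVA range and report how much the type1
 * backend actually unmapped, now that unmap.size is written back. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void unmap_range(int container, unsigned long long iova,
			unsigned long long size)
{
	struct vfio_iommu_type1_dma_unmap unmap;

	memset(&unmap, 0, sizeof(unmap));
	unmap.argsz = sizeof(unmap);
	unmap.iova = iova;
	unmap.size = size;

	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap) == 0)
		printf("requested 0x%llx, unmapped 0x%llx bytes\n",
		       size, (unsigned long long)unmap.size);
}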