aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlex Williamson <alex.williamson@redhat.com>2013-06-21 11:38:02 -0400
committerAlex Williamson <alex.williamson@redhat.com>2013-06-21 11:38:02 -0400
commit166fd7d94afdac040b28c473e45241820ca522a2 (patch)
tree044cd4540cb2a949ed8a55949cc39471b05a73b3
parentcd9b22685e4ccd728550d51fbe108c473f89df4f (diff)
vfio: hugepage support for vfio_iommu_type1
We currently send all mappings to the iommu in PAGE_SIZE chunks, which prevents the iommu from enabling support for larger page sizes. We still need to pin pages, which means we step through them in PAGE_SIZE chunks, but we can batch up contiguous physical memory chunks to allow the iommu the opportunity to use larger pages. The approach here is a bit different than the one currently used for legacy KVM device assignment. Rather than looking at the vma page size and using that as the maximum size to pass to the iommu, we instead simply look at whether the next page is physically contiguous. This means we might ask the iommu to map a 4MB region, while legacy KVM might limit itself to a maximum of 2MB. Splitting our mapping path also allows us to be smarter about locked memory because we can more easily unwind if the user attempts to exceed the limit. Therefore, rather than assuming that a mapping will result in locked memory, we test each page as it is pinned to determine whether it locks RAM vs an mmap'd MMIO region. This should result in better locking granularity and fewer locked page fudge factors in userspace. The unmap path uses the same algorithm as legacy KVM. We don't want to track the pfn for each mapping ourselves, but we need the pfn in order to unpin pages. We therefore ask the iommu for the iova to physical address translation, ask it to unpin a page, and see how many pages were actually unpinned. IOMMUs supporting large pages will often return something bigger than a page here, which we know will be physically contiguous and we can unpin a batch of pfns. IOMMUs not supporting large mappings won't see an improvement in batching here as they only unmap a page at a time. With this change, we also make a clarification to the API for mapping and unmapping DMA. We can only guarantee unmaps at the same granularity as used for the original mapping. 
In other words, unmapping a subregion of a previous mapping is not guaranteed and may result in a larger or smaller unmapping than requested. The size field in the unmapping structure is updated to reflect this. Previously this was unmodified on unmapping, always returning the requested unmap size. This is now updated to return the actual unmap size on success, allowing userspace to appropriately track mappings. Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
-rw-r--r--drivers/vfio/vfio_iommu_type1.c523
-rw-r--r--include/uapi/linux/vfio.h8
2 files changed, 344 insertions, 187 deletions
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0e863b3ddcab..6654a7eb42d3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,7 +60,7 @@ struct vfio_dma {
60 struct rb_node node; 60 struct rb_node node;
61 dma_addr_t iova; /* Device address */ 61 dma_addr_t iova; /* Device address */
62 unsigned long vaddr; /* Process virtual addr */ 62 unsigned long vaddr; /* Process virtual addr */
63 long npage; /* Number of pages */ 63 size_t size; /* Map size (bytes) */
64 int prot; /* IOMMU_READ/WRITE */ 64 int prot; /* IOMMU_READ/WRITE */
65}; 65};
66 66
@@ -74,8 +74,6 @@ struct vfio_group {
74 * into DMA'ble space using the IOMMU 74 * into DMA'ble space using the IOMMU
75 */ 75 */
76 76
77#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
78
79static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, 77static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
80 dma_addr_t start, size_t size) 78 dma_addr_t start, size_t size)
81{ 79{
@@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
86 84
87 if (start + size <= dma->iova) 85 if (start + size <= dma->iova)
88 node = node->rb_left; 86 node = node->rb_left;
89 else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) 87 else if (start >= dma->iova + dma->size)
90 node = node->rb_right; 88 node = node->rb_right;
91 else 89 else
92 return dma; 90 return dma;
@@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
104 parent = *link; 102 parent = *link;
105 dma = rb_entry(parent, struct vfio_dma, node); 103 dma = rb_entry(parent, struct vfio_dma, node);
106 104
107 if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) 105 if (new->iova + new->size <= dma->iova)
108 link = &(*link)->rb_left; 106 link = &(*link)->rb_left;
109 else 107 else
110 link = &(*link)->rb_right; 108 link = &(*link)->rb_right;
@@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage)
144 struct vwork *vwork; 142 struct vwork *vwork;
145 struct mm_struct *mm; 143 struct mm_struct *mm;
146 144
147 if (!current->mm) 145 if (!current->mm || !npage)
148 return; /* process exited */ 146 return; /* process exited or nothing to do */
149 147
150 if (down_write_trylock(&current->mm->mmap_sem)) { 148 if (down_write_trylock(&current->mm->mmap_sem)) {
151 current->mm->locked_vm += npage; 149 current->mm->locked_vm += npage;
@@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot)
217 return 0; 215 return 0;
218} 216}
219 217
220/* Unmap DMA region */
221static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
222 long npage, int prot)
223{
224 long i, unlocked = 0;
225
226 for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
227 unsigned long pfn;
228
229 pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
230 if (pfn) {
231 iommu_unmap(iommu->domain, iova, PAGE_SIZE);
232 unlocked += put_pfn(pfn, prot);
233 }
234 }
235 return unlocked;
236}
237
238static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
239 long npage, int prot)
240{
241 long unlocked;
242
243 unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
244 vfio_lock_acct(-unlocked);
245}
246
247static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) 218static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
248{ 219{
249 struct page *page[1]; 220 struct page *page[1];
@@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
270 return ret; 241 return ret;
271} 242}
272 243
273/* Map DMA region */ 244/*
274static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, 245 * Attempt to pin pages. We really don't want to track all the pfns and
275 unsigned long vaddr, long npage, int prot) 246 * the iommu can only map chunks of consecutive pfns anyway, so get the
247 * first page and all consecutive pages with the same locking.
248 */
249static long vfio_pin_pages(unsigned long vaddr, long npage,
250 int prot, unsigned long *pfn_base)
276{ 251{
277 dma_addr_t start = iova; 252 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
278 long i, locked = 0; 253 bool lock_cap = capable(CAP_IPC_LOCK);
279 int ret; 254 long ret, i;
280 255
281 /* Verify that pages are not already mapped */ 256 if (!current->mm)
282 for (i = 0; i < npage; i++, iova += PAGE_SIZE) 257 return -ENODEV;
283 if (iommu_iova_to_phys(iommu->domain, iova))
284 return -EBUSY;
285 258
286 iova = start; 259 ret = vaddr_get_pfn(vaddr, prot, pfn_base);
260 if (ret)
261 return ret;
287 262
288 if (iommu->cache) 263 if (is_invalid_reserved_pfn(*pfn_base))
289 prot |= IOMMU_CACHE; 264 return 1;
290 265
291 /* 266 if (!lock_cap && current->mm->locked_vm + 1 > limit) {
292 * XXX We break mappings into pages and use get_user_pages_fast to 267 put_pfn(*pfn_base, prot);
293 * pin the pages in memory. It's been suggested that mlock might 268 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
294 * provide a more efficient mechanism, but nothing prevents the 269 limit << PAGE_SHIFT);
295 * user from munlocking the pages, which could then allow the user 270 return -ENOMEM;
296 * access to random host memory. We also have no guarantee from the 271 }
297 * IOMMU API that the iommu driver can unmap sub-pages of previous 272
298 * mappings. This means we might lose an entire range if a single 273 /* Lock all the consecutive pages from pfn_base */
299 * page within it is unmapped. Single page mappings are inefficient, 274 for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
300 * but provide the most flexibility for now.
301 */
302 for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
303 unsigned long pfn = 0; 275 unsigned long pfn = 0;
304 276
305 ret = vaddr_get_pfn(vaddr, prot, &pfn); 277 ret = vaddr_get_pfn(vaddr, prot, &pfn);
306 if (ret) { 278 if (ret)
307 __vfio_dma_do_unmap(iommu, start, i, prot); 279 break;
308 return ret; 280
281 if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
282 put_pfn(pfn, prot);
283 break;
309 } 284 }
310 285
286 if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
287 put_pfn(pfn, prot);
288 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
289 __func__, limit << PAGE_SHIFT);
290 break;
291 }
292 }
293
294 vfio_lock_acct(i);
295
296 return i;
297}
298
299static long vfio_unpin_pages(unsigned long pfn, long npage,
300 int prot, bool do_accounting)
301{
302 unsigned long unlocked = 0;
303 long i;
304
305 for (i = 0; i < npage; i++)
306 unlocked += put_pfn(pfn++, prot);
307
308 if (do_accounting)
309 vfio_lock_acct(-unlocked);
310
311 return unlocked;
312}
313
314static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
315 dma_addr_t iova, size_t *size)
316{
317 dma_addr_t start = iova, end = iova + *size;
318 long unlocked = 0;
319
320 while (iova < end) {
321 size_t unmapped;
322 phys_addr_t phys;
323
311 /* 324 /*
312 * Only add actual locked pages to accounting 325 * We use the IOMMU to track the physical address. This
313 * XXX We're effectively marking a page locked for every 326 * saves us from having a lot more entries in our mapping
314 * IOVA page even though it's possible the user could be 327 * tree. The downside is that we don't track the size
315 * backing multiple IOVAs with the same vaddr. This over- 328 * used to do the mapping. We request unmap of a single
316 * penalizes the user process, but we currently have no 329 * page, but expect IOMMUs that support large pages to
317 * easy way to do this properly. 330 * unmap a larger chunk.
318 */ 331 */
319 if (!is_invalid_reserved_pfn(pfn)) 332 phys = iommu_iova_to_phys(iommu->domain, iova);
320 locked++; 333 if (WARN_ON(!phys)) {
321 334 iova += PAGE_SIZE;
322 ret = iommu_map(iommu->domain, iova, 335 continue;
323 (phys_addr_t)pfn << PAGE_SHIFT,
324 PAGE_SIZE, prot);
325 if (ret) {
326 /* Back out mappings on error */
327 put_pfn(pfn, prot);
328 __vfio_dma_do_unmap(iommu, start, i, prot);
329 return ret;
330 } 336 }
337
338 unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
339 if (!unmapped)
340 break;
341
342 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
343 unmapped >> PAGE_SHIFT,
344 dma->prot, false);
345 iova += unmapped;
331 } 346 }
332 vfio_lock_acct(locked); 347
348 vfio_lock_acct(-unlocked);
349
350 *size = iova - start;
351
333 return 0; 352 return 0;
334} 353}
335 354
336static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, 355static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
337 size_t size, struct vfio_dma *dma) 356 size_t *size, struct vfio_dma *dma)
338{ 357{
358 size_t offset, overlap, tmp;
339 struct vfio_dma *split; 359 struct vfio_dma *split;
340 long npage_lo, npage_hi; 360 int ret;
361
362 /*
363 * Existing dma region is completely covered, unmap all. This is
364 * the likely case since userspace tends to map and unmap buffers
365 * in one shot rather than multiple mappings within a buffer.
366 */
367 if (likely(start <= dma->iova &&
368 start + *size >= dma->iova + dma->size)) {
369 *size = dma->size;
370 ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
371 if (ret)
372 return ret;
373
374 /*
375 * Did we remove more than we have? Should never happen
376 * since a vfio_dma is contiguous in iova and vaddr.
377 */
378 WARN_ON(*size != dma->size);
341 379
342 /* Existing dma region is completely covered, unmap all */
343 if (start <= dma->iova &&
344 start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
345 vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
346 vfio_remove_dma(iommu, dma); 380 vfio_remove_dma(iommu, dma);
347 kfree(dma); 381 kfree(dma);
348 return 0; 382 return 0;
@@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
350 384
351 /* Overlap low address of existing range */ 385 /* Overlap low address of existing range */
352 if (start <= dma->iova) { 386 if (start <= dma->iova) {
353 size_t overlap; 387 overlap = start + *size - dma->iova;
388 ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
389 if (ret)
390 return ret;
354 391
355 overlap = start + size - dma->iova; 392 vfio_remove_dma(iommu, dma);
356 npage_lo = overlap >> PAGE_SHIFT;
357 393
358 vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); 394 /*
359 dma->iova += overlap; 395 * Check, we may have removed to whole vfio_dma. If not
360 dma->vaddr += overlap; 396 * fixup and re-insert.
361 dma->npage -= npage_lo; 397 */
398 if (overlap < dma->size) {
399 dma->iova += overlap;
400 dma->vaddr += overlap;
401 dma->size -= overlap;
402 vfio_insert_dma(iommu, dma);
403 }
404 *size = overlap;
362 return 0; 405 return 0;
363 } 406 }
364 407
365 /* Overlap high address of existing range */ 408 /* Overlap high address of existing range */
366 if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { 409 if (start + *size >= dma->iova + dma->size) {
367 size_t overlap; 410 offset = start - dma->iova;
411 overlap = dma->size - offset;
368 412
369 overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; 413 ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
370 npage_hi = overlap >> PAGE_SHIFT; 414 if (ret)
415 return ret;
416
417 /*
418 * We may have unmapped the entire vfio_dma if the user is
419 * trying to unmap a sub-region of what was originally
420 * mapped. If anything left, we can resize in place since
421 * iova is unchanged.
422 */
423 if (overlap < dma->size)
424 dma->size -= overlap;
425 else
426 vfio_remove_dma(iommu, dma);
371 427
372 vfio_dma_unmap(iommu, start, npage_hi, dma->prot); 428 *size = overlap;
373 dma->npage -= npage_hi;
374 return 0; 429 return 0;
375 } 430 }
376 431
377 /* Split existing */ 432 /* Split existing */
378 npage_lo = (start - dma->iova) >> PAGE_SHIFT; 433 offset = start - dma->iova;
379 npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
380 434
381 split = kzalloc(sizeof *split, GFP_KERNEL); 435 ret = vfio_unmap_unpin(iommu, dma, start, size);
382 if (!split) 436 if (ret)
383 return -ENOMEM; 437 return ret;
384 438
385 vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); 439 WARN_ON(!*size);
440 tmp = dma->size;
386 441
387 dma->npage = npage_lo; 442 /*
443 * Resize the lower vfio_dma in place, insert new for remaining
444 * upper segment.
445 */
446 dma->size = offset;
447
448 if (offset + *size < tmp) {
449 split = kzalloc(sizeof(*split), GFP_KERNEL);
450 if (!split)
451 return -ENOMEM;
452
453 split->size = tmp - offset - *size;
454 split->iova = dma->iova + offset + *size;
455 split->vaddr = dma->vaddr + offset + *size;
456 split->prot = dma->prot;
457 vfio_insert_dma(iommu, split);
458 }
388 459
389 split->npage = npage_hi;
390 split->iova = start + size;
391 split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
392 split->prot = dma->prot;
393 vfio_insert_dma(iommu, split);
394 return 0; 460 return 0;
395} 461}
396 462
@@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
399{ 465{
400 uint64_t mask; 466 uint64_t mask;
401 struct vfio_dma *dma; 467 struct vfio_dma *dma;
468 size_t unmapped = 0, size;
402 int ret = 0; 469 int ret = 0;
403 470
404 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; 471 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
408 if (unmap->size & mask) 475 if (unmap->size & mask)
409 return -EINVAL; 476 return -EINVAL;
410 477
411 /* XXX We still break these down into PAGE_SIZE */
412 WARN_ON(mask & PAGE_MASK); 478 WARN_ON(mask & PAGE_MASK);
413 479
414 mutex_lock(&iommu->lock); 480 mutex_lock(&iommu->lock);
415 481
416 while (!ret && (dma = vfio_find_dma(iommu, 482 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
417 unmap->iova, unmap->size))) 483 size = unmap->size;
418 ret = vfio_remove_dma_overlap(iommu, unmap->iova, 484 ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
419 unmap->size, dma); 485 if (ret)
486 break;
487 unmapped += size;
488 }
420 489
421 mutex_unlock(&iommu->lock); 490 mutex_unlock(&iommu->lock);
491
492 /*
493 * We may unmap more than requested, update the unmap struct so
494 * userspace can know.
495 */
496 unmap->size = unmapped;
497
498 return ret;
499}
500
501/*
502 * Turns out AMD IOMMU has a page table bug where it won't map large pages
503 * to a region that previously mapped smaller pages. This should be fixed
504 * soon, so this is just a temporary workaround to break mappings down into
505 * PAGE_SIZE. Better to map smaller pages than nothing.
506 */
507static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
508 unsigned long pfn, long npage, int prot)
509{
510 long i;
511 int ret;
512
513 for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
514 ret = iommu_map(iommu->domain, iova,
515 (phys_addr_t)pfn << PAGE_SHIFT,
516 PAGE_SIZE, prot);
517 if (ret)
518 break;
519 }
520
521 for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
522 iommu_unmap(iommu->domain, iova, PAGE_SIZE);
523
422 return ret; 524 return ret;
423} 525}
424 526
425static int vfio_dma_do_map(struct vfio_iommu *iommu, 527static int vfio_dma_do_map(struct vfio_iommu *iommu,
426 struct vfio_iommu_type1_dma_map *map) 528 struct vfio_iommu_type1_dma_map *map)
427{ 529{
428 struct vfio_dma *dma; 530 dma_addr_t end, iova;
429 dma_addr_t iova = map->iova; 531 unsigned long vaddr = map->vaddr;
430 unsigned long locked, lock_limit, vaddr = map->vaddr;
431 size_t size = map->size; 532 size_t size = map->size;
533 long npage;
432 int ret = 0, prot = 0; 534 int ret = 0, prot = 0;
433 uint64_t mask; 535 uint64_t mask;
434 long npage; 536
537 end = map->iova + map->size;
435 538
436 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; 539 mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
437 540
@@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
444 if (!prot) 547 if (!prot)
445 return -EINVAL; /* No READ/WRITE? */ 548 return -EINVAL; /* No READ/WRITE? */
446 549
550 if (iommu->cache)
551 prot |= IOMMU_CACHE;
552
447 if (vaddr & mask) 553 if (vaddr & mask)
448 return -EINVAL; 554 return -EINVAL;
449 if (iova & mask) 555 if (map->iova & mask)
450 return -EINVAL; 556 return -EINVAL;
451 if (size & mask) 557 if (!map->size || map->size & mask)
452 return -EINVAL; 558 return -EINVAL;
453 559
454 /* XXX We still break these down into PAGE_SIZE */
455 WARN_ON(mask & PAGE_MASK); 560 WARN_ON(mask & PAGE_MASK);
456 561
457 /* Don't allow IOVA wrap */ 562 /* Don't allow IOVA wrap */
458 if (iova + size && iova + size < iova) 563 if (end && end < map->iova)
459 return -EINVAL; 564 return -EINVAL;
460 565
461 /* Don't allow virtual address wrap */ 566 /* Don't allow virtual address wrap */
462 if (vaddr + size && vaddr + size < vaddr) 567 if (vaddr + map->size && vaddr + map->size < vaddr)
463 return -EINVAL;
464
465 npage = size >> PAGE_SHIFT;
466 if (!npage)
467 return -EINVAL; 568 return -EINVAL;
468 569
469 dma = kzalloc(sizeof *dma, GFP_KERNEL);
470 if (!dma)
471 return -ENOMEM;
472
473 mutex_lock(&iommu->lock); 570 mutex_lock(&iommu->lock);
474 571
475 if (vfio_find_dma(iommu, iova, size)) { 572 if (vfio_find_dma(iommu, map->iova, map->size)) {
476 ret = -EBUSY; 573 mutex_unlock(&iommu->lock);
477 goto out_lock; 574 return -EEXIST;
478 } 575 }
479 576
480 /* account for locked pages */ 577 for (iova = map->iova; iova < end; iova += size, vaddr += size) {
481 locked = current->mm->locked_vm + npage; 578 struct vfio_dma *dma = NULL;
482 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 579 unsigned long pfn;
483 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { 580 long i;
484 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", 581
485 __func__, rlimit(RLIMIT_MEMLOCK)); 582 /* Pin a contiguous chunk of memory */
486 ret = -ENOMEM; 583 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
487 goto out_lock; 584 prot, &pfn);
488 } 585 if (npage <= 0) {
586 WARN_ON(!npage);
587 ret = (int)npage;
588 break;
589 }
489 590
490 ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); 591 /* Verify pages are not already mapped */
491 if (ret) 592 for (i = 0; i < npage; i++) {
492 goto out_lock; 593 if (iommu_iova_to_phys(iommu->domain,
493 594 iova + (i << PAGE_SHIFT))) {
494 dma->npage = npage; 595 vfio_unpin_pages(pfn, npage, prot, true);
495 dma->iova = iova; 596 ret = -EBUSY;
496 dma->vaddr = vaddr; 597 break;
497 dma->prot = prot; 598 }
498 599 }
499 /* Check if we abut a region below - nothing below 0 */ 600
500 if (iova) { 601 ret = iommu_map(iommu->domain, iova,
501 struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); 602 (phys_addr_t)pfn << PAGE_SHIFT,
502 if (tmp && tmp->prot == prot && 603 npage << PAGE_SHIFT, prot);
503 tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { 604 if (ret) {
504 vfio_remove_dma(iommu, tmp); 605 if (ret != -EBUSY ||
505 dma->npage += tmp->npage; 606 map_try_harder(iommu, iova, pfn, npage, prot)) {
506 dma->iova = iova = tmp->iova; 607 vfio_unpin_pages(pfn, npage, prot, true);
507 dma->vaddr = vaddr = tmp->vaddr; 608 break;
508 kfree(tmp); 609 }
509 npage = dma->npage; 610 }
510 size = NPAGE_TO_SIZE(npage); 611
612 size = npage << PAGE_SHIFT;
613
614 /*
615 * Check if we abut a region below - nothing below 0.
616 * This is the most likely case when mapping chunks of
617 * physically contiguous regions within a virtual address
618 * range. Update the abutting entry in place since iova
619 * doesn't change.
620 */
621 if (likely(iova)) {
622 struct vfio_dma *tmp;
623 tmp = vfio_find_dma(iommu, iova - 1, 1);
624 if (tmp && tmp->prot == prot &&
625 tmp->vaddr + tmp->size == vaddr) {
626 tmp->size += size;
627
628 iova = tmp->iova;
629 size = tmp->size;
630 vaddr = tmp->vaddr;
631 dma = tmp;
632 }
633 }
634
635 /* Check if we abut a region above - nothing above ~0 + 1 */
636 if (likely(iova + size)) {
637 struct vfio_dma *tmp;
638
639 tmp = vfio_find_dma(iommu, iova + size, 1);
640 if (tmp && tmp->prot == prot &&
641 tmp->vaddr == vaddr + size) {
642 vfio_remove_dma(iommu, tmp);
643 if (dma)
644 dma->size += tmp->size;
645 else
646 size += tmp->size;
647 kfree(tmp);
648 }
511 } 649 }
512 }
513 650
514 /* Check if we abut a region above - nothing above ~0 + 1 */ 651 if (!dma) {
515 if (iova + size) { 652 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
516 struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); 653 if (!dma) {
517 if (tmp && tmp->prot == prot && 654 iommu_unmap(iommu->domain, iova, size);
518 tmp->vaddr == vaddr + size) { 655 vfio_unpin_pages(pfn, npage, prot, true);
519 vfio_remove_dma(iommu, tmp); 656 ret = -ENOMEM;
520 dma->npage += tmp->npage; 657 break;
521 kfree(tmp); 658 }
522 npage = dma->npage; 659
523 size = NPAGE_TO_SIZE(npage); 660 dma->size = size;
661 dma->iova = iova;
662 dma->vaddr = vaddr;
663 dma->prot = prot;
664 vfio_insert_dma(iommu, dma);
524 } 665 }
525 } 666 }
526 667
527 vfio_insert_dma(iommu, dma); 668 if (ret) {
669 struct vfio_dma *tmp;
670 iova = map->iova;
671 size = map->size;
672 while ((tmp = vfio_find_dma(iommu, iova, size))) {
673 if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
674 pr_warn("%s: Error rolling back failed map\n",
675 __func__);
676 break;
677 }
678 }
679 }
528 680
529out_lock:
530 mutex_unlock(&iommu->lock); 681 mutex_unlock(&iommu->lock);
531 if (ret)
532 kfree(dma);
533 return ret; 682 return ret;
534} 683}
535 684
@@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data)
651 800
652 while ((node = rb_first(&iommu->dma_list))) { 801 while ((node = rb_first(&iommu->dma_list))) {
653 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 802 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
654 vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); 803 size_t size = dma->size;
655 vfio_remove_dma(iommu, dma); 804 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
656 kfree(dma);
657 } 805 }
658 806
659 iommu_domain_free(iommu->domain); 807 iommu_domain_free(iommu->domain);
@@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
708 856
709 } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { 857 } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
710 struct vfio_iommu_type1_dma_unmap unmap; 858 struct vfio_iommu_type1_dma_unmap unmap;
859 long ret;
711 860
712 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); 861 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
713 862
@@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
717 if (unmap.argsz < minsz || unmap.flags) 866 if (unmap.argsz < minsz || unmap.flags)
718 return -EINVAL; 867 return -EINVAL;
719 868
720 return vfio_dma_do_unmap(iommu, &unmap); 869 ret = vfio_dma_do_unmap(iommu, &unmap);
870 if (ret)
871 return ret;
872
873 return copy_to_user((void __user *)arg, &unmap, minsz);
721 } 874 }
722 875
723 return -ENOTTY; 876 return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 284ff2436829..513600612995 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map {
361#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) 361#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
362 362
363/** 363/**
364 * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap) 364 * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
365 * struct vfio_dma_unmap)
365 * 366 *
366 * Unmap IO virtual addresses using the provided struct vfio_dma_unmap. 367 * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
367 * Caller sets argsz. 368 * Caller sets argsz. The actual unmapped size is returned in the size
369 * field. No guarantee is made to the user that arbitrary unmaps of iova
370 * or size different from those used in the original mapping call will
371 * succeed.
368 */ 372 */
369struct vfio_iommu_type1_dma_unmap { 373struct vfio_iommu_type1_dma_unmap {
370 __u32 argsz; 374 __u32 argsz;