author     Linus Torvalds <torvalds@linux-foundation.org>   2013-07-10 17:50:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-07-10 17:50:08 -0400
commit     15a49b9a90c86c6cb7f270a699d2ae7468862c28 (patch)
tree       218fc6bf3ae5beb989cddaede2e4ee0c2e679845 /drivers/vfio
parent     8d10aae2741ec9ffd53c8d214f7ada6d543b3a46 (diff)
parent     8d38ef1948bd415a5cb653a5c0ec16f3402aaca1 (diff)
Merge tag 'vfio-v3.11' of git://github.com/awilliam/linux-vfio
Pull vfio updates from Alex Williamson:
"Largely hugepage support for vfio/type1 iommu and surrounding cleanups
and fixes"
* tag 'vfio-v3.11' of git://github.com/awilliam/linux-vfio:
vfio/type1: Fix leak on error path
vfio: Limit group opens
vfio/type1: Fix missed frees and zero sized removes
vfio: fix documentation
vfio: Provide module option to disable vfio_iommu_type1 hugepage support
vfio: hugepage support for vfio_iommu_type1
vfio: Convert type1 iommu to use rbtree
Diffstat (limited to 'drivers/vfio')

 -rw-r--r--  drivers/vfio/vfio.c              |  14
 -rw-r--r--  drivers/vfio/vfio_iommu_type1.c  | 626

 2 files changed, 415 insertions, 225 deletions
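For context before the diff itself: the series is exercised through the existing VFIO type1 ioctls, and the one user-visible tweak is that VFIO_IOMMU_UNMAP_DMA may now tear down more than the requested range and report the size actually unmapped back through the unmap structure. The snippet below is a hypothetical userspace sketch, not part of this commit; it assumes a container file descriptor that has already been opened and bound to the type1 IOMMU elsewhere.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Map a buffer at IOVA 0 and unmap it again; "container" is assumed set up. */
static int map_and_unmap(int container, size_t size)
{
        struct vfio_iommu_type1_dma_map dma_map;
        struct vfio_iommu_type1_dma_unmap dma_unmap;
        void *buf;

        /* A hugepage-backed buffer gives the IOMMU a chance to use large pages */
        buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (buf == MAP_FAILED)
                return -1;

        memset(&dma_map, 0, sizeof(dma_map));
        dma_map.argsz = sizeof(dma_map);
        dma_map.vaddr = (unsigned long)buf;
        dma_map.iova = 0;
        dma_map.size = size;
        dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
        if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map))
                goto out;

        memset(&dma_unmap, 0, sizeof(dma_unmap));
        dma_unmap.argsz = sizeof(dma_unmap);
        dma_unmap.iova = 0;
        dma_unmap.size = size;
        /* After this series, size is rewritten with what was actually unmapped */
        if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap) == 0)
                printf("unmapped %llu bytes\n", (unsigned long long)dma_unmap.size);
out:
        munmap(buf, size);
        return 0;
}

Any page-aligned buffer works; MAP_HUGETLB is only used here so the hugepage path in vfio_iommu_type1 has contiguous pfns to coalesce.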
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 259ad282ae5d..c488da5db7c7 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -76,6 +76,7 @@ struct vfio_group {
         struct notifier_block nb;
         struct list_head vfio_next;
         struct list_head container_next;
+        atomic_t opened;
 };
 
 struct vfio_device {
@@ -206,6 +207,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
         INIT_LIST_HEAD(&group->device_list);
         mutex_init(&group->device_lock);
         atomic_set(&group->container_users, 0);
+        atomic_set(&group->opened, 0);
         group->iommu_group = iommu_group;
 
         group->nb.notifier_call = vfio_iommu_group_notifier;
@@ -1236,12 +1238,22 @@ static long vfio_group_fops_compat_ioctl(struct file *filep,
 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
 {
         struct vfio_group *group;
+        int opened;
 
         group = vfio_group_get_from_minor(iminor(inode));
         if (!group)
                 return -ENODEV;
 
+        /* Do we need multiple instances of the group open? Seems not. */
+        opened = atomic_cmpxchg(&group->opened, 0, 1);
+        if (opened) {
+                vfio_group_put(group);
+                return -EBUSY;
+        }
+
+        /* Is something still in use from a previous open? */
         if (group->container) {
+                atomic_dec(&group->opened);
                 vfio_group_put(group);
                 return -EBUSY;
         }
@@ -1259,6 +1271,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep)
 
         vfio_group_try_dissolve_container(group);
 
+        atomic_dec(&group->opened);
+
         vfio_group_put(group);
 
         return 0;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f3fbc48a6c7..a9807dea3887 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/pci.h> /* pci_bus_type */
+#include <linux/rbtree.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
@@ -47,19 +48,25 @@ module_param_named(allow_unsafe_interrupts,
 MODULE_PARM_DESC(allow_unsafe_interrupts,
                  "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
 
+static bool disable_hugepages;
+module_param_named(disable_hugepages,
+                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable_hugepages,
+                 "Disable VFIO IOMMU support for IOMMU hugepages.");
+
 struct vfio_iommu {
         struct iommu_domain *domain;
         struct mutex lock;
-        struct list_head dma_list;
+        struct rb_root dma_list;
         struct list_head group_list;
         bool cache;
 };
 
 struct vfio_dma {
-        struct list_head next;
+        struct rb_node node;
         dma_addr_t iova; /* Device address */
         unsigned long vaddr; /* Process virtual addr */
-        long npage; /* Number of pages */
+        size_t size; /* Map size (bytes) */
         int prot; /* IOMMU_READ/WRITE */
 };
 
@@ -73,7 +80,48 @@ struct vfio_group {
  * into DMA'ble space using the IOMMU
  */
 
-#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+                                      dma_addr_t start, size_t size)
+{
+        struct rb_node *node = iommu->dma_list.rb_node;
+
+        while (node) {
+                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+                if (start + size <= dma->iova)
+                        node = node->rb_left;
+                else if (start >= dma->iova + dma->size)
+                        node = node->rb_right;
+                else
+                        return dma;
+        }
+
+        return NULL;
+}
+
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+        struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+        struct vfio_dma *dma;
+
+        while (*link) {
+                parent = *link;
+                dma = rb_entry(parent, struct vfio_dma, node);
+
+                if (new->iova + new->size <= dma->iova)
+                        link = &(*link)->rb_left;
+                else
+                        link = &(*link)->rb_right;
+        }
+
+        rb_link_node(&new->node, parent, link);
+        rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+        rb_erase(&old->node, &iommu->dma_list);
+}
 
 struct vwork {
         struct mm_struct *mm;
@@ -100,8 +148,8 @@ static void vfio_lock_acct(long npage)
         struct vwork *vwork;
         struct mm_struct *mm;
 
-        if (!current->mm)
-                return; /* process exited */
+        if (!current->mm || !npage)
+                return; /* process exited or nothing to do */
 
         if (down_write_trylock(&current->mm->mmap_sem)) {
                 current->mm->locked_vm += npage;
@@ -173,33 +221,6 @@ static int put_pfn(unsigned long pfn, int prot)
         return 0;
 }
 
-/* Unmap DMA region */
-static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-                                long npage, int prot)
-{
-        long i, unlocked = 0;
-
-        for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
-                unsigned long pfn;
-
-                pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
-                if (pfn) {
-                        iommu_unmap(iommu->domain, iova, PAGE_SIZE);
-                        unlocked += put_pfn(pfn, prot);
-                }
-        }
-        return unlocked;
-}
-
-static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-                           long npage, int prot)
-{
-        long unlocked;
-
-        unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
-        vfio_lock_acct(-unlocked);
-}
-
 static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 {
         struct page *page[1];
@@ -226,198 +247,306 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
         return ret;
 }
 
-/* Map DMA region */
-static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
-                          unsigned long vaddr, long npage, int prot)
+/*
+ * Attempt to pin pages. We really don't want to track all the pfns and
+ * the iommu can only map chunks of consecutive pfns anyway, so get the
+ * first page and all consecutive pages with the same locking.
+ */
+static long vfio_pin_pages(unsigned long vaddr, long npage,
+                           int prot, unsigned long *pfn_base)
 {
-        dma_addr_t start = iova;
-        long i, locked = 0;
-        int ret;
+        unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+        bool lock_cap = capable(CAP_IPC_LOCK);
+        long ret, i;
 
-        /* Verify that pages are not already mapped */
-        for (i = 0; i < npage; i++, iova += PAGE_SIZE)
-                if (iommu_iova_to_phys(iommu->domain, iova))
-                        return -EBUSY;
+        if (!current->mm)
+                return -ENODEV;
 
-        iova = start;
+        ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+        if (ret)
+                return ret;
 
-        if (iommu->cache)
-                prot |= IOMMU_CACHE;
+        if (is_invalid_reserved_pfn(*pfn_base))
+                return 1;
 
-        /*
-         * XXX We break mappings into pages and use get_user_pages_fast to
-         * pin the pages in memory. It's been suggested that mlock might
-         * provide a more efficient mechanism, but nothing prevents the
-         * user from munlocking the pages, which could then allow the user
-         * access to random host memory. We also have no guarantee from the
-         * IOMMU API that the iommu driver can unmap sub-pages of previous
-         * mappings. This means we might lose an entire range if a single
-         * page within it is unmapped. Single page mappings are inefficient,
-         * but provide the most flexibility for now.
-         */
-        for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
+        if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+                put_pfn(*pfn_base, prot);
+                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+                        limit << PAGE_SHIFT);
+                return -ENOMEM;
+        }
+
+        if (unlikely(disable_hugepages)) {
+                vfio_lock_acct(1);
+                return 1;
+        }
+
+        /* Lock all the consecutive pages from pfn_base */
+        for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
                 unsigned long pfn = 0;
 
                 ret = vaddr_get_pfn(vaddr, prot, &pfn);
-                if (ret) {
-                        __vfio_dma_do_unmap(iommu, start, i, prot);
-                        return ret;
-                }
+                if (ret)
+                        break;
 
-                /*
-                 * Only add actual locked pages to accounting
-                 * XXX We're effectively marking a page locked for every
-                 * IOVA page even though it's possible the user could be
-                 * backing multiple IOVAs with the same vaddr. This over-
-                 * penalizes the user process, but we currently have no
-                 * easy way to do this properly.
-                 */
-                if (!is_invalid_reserved_pfn(pfn))
-                        locked++;
+                if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+                        put_pfn(pfn, prot);
+                        break;
+                }
 
-                ret = iommu_map(iommu->domain, iova,
-                                (phys_addr_t)pfn << PAGE_SHIFT,
-                                PAGE_SIZE, prot);
-                if (ret) {
-                        /* Back out mappings on error */
+                if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
                         put_pfn(pfn, prot);
-                        __vfio_dma_do_unmap(iommu, start, i, prot);
-                        return ret;
+                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+                                __func__, limit << PAGE_SHIFT);
+                        break;
                 }
         }
-        vfio_lock_acct(locked);
-        return 0;
+
+        vfio_lock_acct(i);
+
+        return i;
 }
 
-static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
-                                  dma_addr_t start2, size_t size2)
+static long vfio_unpin_pages(unsigned long pfn, long npage,
+                             int prot, bool do_accounting)
 {
-        if (start1 < start2)
-                return (start2 - start1 < size1);
-        else if (start2 < start1)
-                return (start1 - start2 < size2);
-        return (size1 > 0 && size2 > 0);
+        unsigned long unlocked = 0;
+        long i;
+
+        for (i = 0; i < npage; i++)
+                unlocked += put_pfn(pfn++, prot);
+
+        if (do_accounting)
+                vfio_lock_acct(-unlocked);
+
+        return unlocked;
 }
 
-static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
-                                      dma_addr_t start, size_t size)
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+                            dma_addr_t iova, size_t *size)
 {
-        struct vfio_dma *dma;
+        dma_addr_t start = iova, end = iova + *size;
+        long unlocked = 0;
 
-        list_for_each_entry(dma, &iommu->dma_list, next) {
-                if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
-                                   start, size))
-                        return dma;
+        while (iova < end) {
+                size_t unmapped;
+                phys_addr_t phys;
+
+                /*
+                 * We use the IOMMU to track the physical address. This
+                 * saves us from having a lot more entries in our mapping
+                 * tree. The downside is that we don't track the size
+                 * used to do the mapping. We request unmap of a single
+                 * page, but expect IOMMUs that support large pages to
+                 * unmap a larger chunk.
+                 */
+                phys = iommu_iova_to_phys(iommu->domain, iova);
+                if (WARN_ON(!phys)) {
+                        iova += PAGE_SIZE;
+                        continue;
+                }
+
+                unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+                if (!unmapped)
+                        break;
+
+                unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
+                                             unmapped >> PAGE_SHIFT,
+                                             dma->prot, false);
+                iova += unmapped;
         }
-        return NULL;
+
+        vfio_lock_acct(-unlocked);
+
+        *size = iova - start;
+
+        return 0;
 }
 
-static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
-                                    size_t size, struct vfio_dma *dma)
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+                                   size_t *size, struct vfio_dma *dma)
 {
+        size_t offset, overlap, tmp;
         struct vfio_dma *split;
-        long npage_lo, npage_hi;
+        int ret;
 
-        /* Existing dma region is completely covered, unmap all */
-        if (start <= dma->iova &&
-            start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
-                vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
-                list_del(&dma->next);
-                npage_lo = dma->npage;
+        if (!*size)
+                return 0;
+
+        /*
+         * Existing dma region is completely covered, unmap all. This is
+         * the likely case since userspace tends to map and unmap buffers
+         * in one shot rather than multiple mappings within a buffer.
+         */
+        if (likely(start <= dma->iova &&
+                   start + *size >= dma->iova + dma->size)) {
+                *size = dma->size;
+                ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+                if (ret)
+                        return ret;
+
+                /*
+                 * Did we remove more than we have? Should never happen
+                 * since a vfio_dma is contiguous in iova and vaddr.
+                 */
+                WARN_ON(*size != dma->size);
+
+                vfio_remove_dma(iommu, dma);
                 kfree(dma);
-                return npage_lo;
+                return 0;
         }
 
         /* Overlap low address of existing range */
         if (start <= dma->iova) {
-                size_t overlap;
+                overlap = start + *size - dma->iova;
+                ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+                if (ret)
+                        return ret;
 
-                overlap = start + size - dma->iova;
-                npage_lo = overlap >> PAGE_SHIFT;
+                vfio_remove_dma(iommu, dma);
 
-                vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
-                dma->iova += overlap;
-                dma->vaddr += overlap;
-                dma->npage -= npage_lo;
-                return npage_lo;
+                /*
+                 * Check, we may have removed to whole vfio_dma. If not
+                 * fixup and re-insert.
+                 */
+                if (overlap < dma->size) {
+                        dma->iova += overlap;
+                        dma->vaddr += overlap;
+                        dma->size -= overlap;
+                        vfio_insert_dma(iommu, dma);
+                } else
+                        kfree(dma);
+
+                *size = overlap;
+                return 0;
         }
 
         /* Overlap high address of existing range */
-        if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
-                size_t overlap;
+        if (start + *size >= dma->iova + dma->size) {
+                offset = start - dma->iova;
+                overlap = dma->size - offset;
 
-                overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
-                npage_hi = overlap >> PAGE_SHIFT;
+                ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+                if (ret)
+                        return ret;
 
-                vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
-                dma->npage -= npage_hi;
-                return npage_hi;
+                dma->size -= overlap;
+                *size = overlap;
+                return 0;
         }
 
         /* Split existing */
-        npage_lo = (start - dma->iova) >> PAGE_SHIFT;
-        npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
 
-        split = kzalloc(sizeof *split, GFP_KERNEL);
+        /*
+         * Allocate our tracking structure early even though it may not
+         * be used. An Allocation failure later loses track of pages and
+         * is more difficult to unwind.
+         */
+        split = kzalloc(sizeof(*split), GFP_KERNEL);
         if (!split)
                 return -ENOMEM;
 
-        vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
+        offset = start - dma->iova;
+
+        ret = vfio_unmap_unpin(iommu, dma, start, size);
+        if (ret || !*size) {
+                kfree(split);
+                return ret;
+        }
+
+        tmp = dma->size;
 
-        dma->npage = npage_lo;
+        /* Resize the lower vfio_dma in place, before the below insert */
+        dma->size = offset;
 
-        split->npage = npage_hi;
-        split->iova = start + size;
-        split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
-        split->prot = dma->prot;
-        list_add(&split->next, &iommu->dma_list);
-        return size >> PAGE_SHIFT;
+        /* Insert new for remainder, assuming it didn't all get unmapped */
+        if (likely(offset + *size < tmp)) {
+                split->size = tmp - offset - *size;
+                split->iova = dma->iova + offset + *size;
+                split->vaddr = dma->vaddr + offset + *size;
+                split->prot = dma->prot;
+                vfio_insert_dma(iommu, split);
+        } else
+                kfree(split);
+
+        return 0;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
                              struct vfio_iommu_type1_dma_unmap *unmap)
 {
-        long ret = 0, npage = unmap->size >> PAGE_SHIFT;
-        struct vfio_dma *dma, *tmp;
         uint64_t mask;
+        struct vfio_dma *dma;
+        size_t unmapped = 0, size;
+        int ret = 0;
 
         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 
         if (unmap->iova & mask)
                 return -EINVAL;
-        if (unmap->size & mask)
+        if (!unmap->size || unmap->size & mask)
                 return -EINVAL;
 
-        /* XXX We still break these down into PAGE_SIZE */
         WARN_ON(mask & PAGE_MASK);
 
         mutex_lock(&iommu->lock);
 
-        list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) {
-                if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
-                                   unmap->iova, unmap->size)) {
-                        ret = vfio_remove_dma_overlap(iommu, unmap->iova,
-                                                      unmap->size, dma);
-                        if (ret > 0)
-                                npage -= ret;
-                        if (ret < 0 || npage == 0)
-                                break;
-                }
+        while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
+                size = unmap->size;
+                ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
+                if (ret || !size)
+                        break;
+                unmapped += size;
         }
+
         mutex_unlock(&iommu->lock);
-        return ret > 0 ? 0 : (int)ret;
+
+        /*
+         * We may unmap more than requested, update the unmap struct so
+         * userspace can know.
+         */
+        unmap->size = unmapped;
+
+        return ret;
+}
+
+/*
+ * Turns out AMD IOMMU has a page table bug where it won't map large pages
+ * to a region that previously mapped smaller pages. This should be fixed
+ * soon, so this is just a temporary workaround to break mappings down into
+ * PAGE_SIZE. Better to map smaller pages than nothing.
+ */
+static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
+                          unsigned long pfn, long npage, int prot)
+{
+        long i;
+        int ret;
+
+        for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
+                ret = iommu_map(iommu->domain, iova,
+                                (phys_addr_t)pfn << PAGE_SHIFT,
+                                PAGE_SIZE, prot);
+                if (ret)
+                        break;
+        }
+
+        for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
+                iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+
+        return ret;
 }
 
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
                            struct vfio_iommu_type1_dma_map *map)
 {
-        struct vfio_dma *dma, *pdma = NULL;
-        dma_addr_t iova = map->iova;
-        unsigned long locked, lock_limit, vaddr = map->vaddr;
+        dma_addr_t end, iova;
+        unsigned long vaddr = map->vaddr;
         size_t size = map->size;
+        long npage;
         int ret = 0, prot = 0;
         uint64_t mask;
-        long npage;
+
+        end = map->iova + map->size;
 
         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 
@@ -430,104 +559,144 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
         if (!prot)
                 return -EINVAL; /* No READ/WRITE? */
 
+        if (iommu->cache)
+                prot |= IOMMU_CACHE;
+
         if (vaddr & mask)
                 return -EINVAL;
-        if (iova & mask)
+        if (map->iova & mask)
                 return -EINVAL;
-        if (size & mask)
+        if (!map->size || map->size & mask)
                 return -EINVAL;
 
-        /* XXX We still break these down into PAGE_SIZE */
         WARN_ON(mask & PAGE_MASK);
 
         /* Don't allow IOVA wrap */
-        if (iova + size && iova + size < iova)
+        if (end && end < map->iova)
                 return -EINVAL;
 
         /* Don't allow virtual address wrap */
-        if (vaddr + size && vaddr + size < vaddr)
-                return -EINVAL;
-
-        npage = size >> PAGE_SHIFT;
-        if (!npage)
+        if (vaddr + map->size && vaddr + map->size < vaddr)
                 return -EINVAL;
 
         mutex_lock(&iommu->lock);
 
-        if (vfio_find_dma(iommu, iova, size)) {
-                ret = -EBUSY;
-                goto out_lock;
+        if (vfio_find_dma(iommu, map->iova, map->size)) {
+                mutex_unlock(&iommu->lock);
+                return -EEXIST;
         }
 
-        /* account for locked pages */
-        locked = current->mm->locked_vm + npage;
-        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-        if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
-                        __func__, rlimit(RLIMIT_MEMLOCK));
-                ret = -ENOMEM;
-                goto out_lock;
-        }
+        for (iova = map->iova; iova < end; iova += size, vaddr += size) {
+                struct vfio_dma *dma = NULL;
+                unsigned long pfn;
+                long i;
+
+                /* Pin a contiguous chunk of memory */
+                npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
+                                       prot, &pfn);
+                if (npage <= 0) {
+                        WARN_ON(!npage);
+                        ret = (int)npage;
+                        break;
+                }
 
-        ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
-        if (ret)
-                goto out_lock;
+                /* Verify pages are not already mapped */
+                for (i = 0; i < npage; i++) {
+                        if (iommu_iova_to_phys(iommu->domain,
+                                               iova + (i << PAGE_SHIFT))) {
+                                vfio_unpin_pages(pfn, npage, prot, true);
+                                ret = -EBUSY;
+                                break;
+                        }
+                }
 
-        /* Check if we abut a region below - nothing below 0 */
-        if (iova) {
-                dma = vfio_find_dma(iommu, iova - 1, 1);
-                if (dma && dma->prot == prot &&
-                    dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) {
+                ret = iommu_map(iommu->domain, iova,
+                                (phys_addr_t)pfn << PAGE_SHIFT,
+                                npage << PAGE_SHIFT, prot);
+                if (ret) {
+                        if (ret != -EBUSY ||
+                            map_try_harder(iommu, iova, pfn, npage, prot)) {
+                                vfio_unpin_pages(pfn, npage, prot, true);
+                                break;
+                        }
+                }
 
-                        dma->npage += npage;
-                        iova = dma->iova;
-                        vaddr = dma->vaddr;
-                        npage = dma->npage;
-                        size = NPAGE_TO_SIZE(npage);
+                size = npage << PAGE_SHIFT;
 
-                        pdma = dma;
+                /*
+                 * Check if we abut a region below - nothing below 0.
+                 * This is the most likely case when mapping chunks of
+                 * physically contiguous regions within a virtual address
+                 * range. Update the abutting entry in place since iova
+                 * doesn't change.
+                 */
+                if (likely(iova)) {
+                        struct vfio_dma *tmp;
+                        tmp = vfio_find_dma(iommu, iova - 1, 1);
+                        if (tmp && tmp->prot == prot &&
+                            tmp->vaddr + tmp->size == vaddr) {
+                                tmp->size += size;
+                                iova = tmp->iova;
+                                size = tmp->size;
+                                vaddr = tmp->vaddr;
+                                dma = tmp;
+                        }
+                }
+
+                /*
+                 * Check if we abut a region above - nothing above ~0 + 1.
+                 * If we abut above and below, remove and free. If only
+                 * abut above, remove, modify, reinsert.
+                 */
+                if (likely(iova + size)) {
+                        struct vfio_dma *tmp;
+                        tmp = vfio_find_dma(iommu, iova + size, 1);
+                        if (tmp && tmp->prot == prot &&
+                            tmp->vaddr == vaddr + size) {
+                                vfio_remove_dma(iommu, tmp);
+                                if (dma) {
+                                        dma->size += tmp->size;
+                                        kfree(tmp);
+                                } else {
+                                        size += tmp->size;
+                                        tmp->size = size;
+                                        tmp->iova = iova;
+                                        tmp->vaddr = vaddr;
+                                        vfio_insert_dma(iommu, tmp);
+                                        dma = tmp;
+                                }
+                        }
                 }
-        }
 
-        /* Check if we abut a region above - nothing above ~0 + 1 */
-        if (iova + size) {
-                dma = vfio_find_dma(iommu, iova + size, 1);
-                if (dma && dma->prot == prot &&
-                    dma->vaddr == vaddr + size) {
+                if (!dma) {
+                        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+                        if (!dma) {
+                                iommu_unmap(iommu->domain, iova, size);
+                                vfio_unpin_pages(pfn, npage, prot, true);
+                                ret = -ENOMEM;
+                                break;
+                        }
 
-                        dma->npage += npage;
+                        dma->size = size;
                         dma->iova = iova;
                         dma->vaddr = vaddr;
-
-                        /*
-                         * If merged above and below, remove previously
-                         * merged entry. New entry covers it.
-                         */
-                        if (pdma) {
-                                list_del(&pdma->next);
-                                kfree(pdma);
-                        }
-                        pdma = dma;
+                        dma->prot = prot;
+                        vfio_insert_dma(iommu, dma);
                 }
         }
 
-        /* Isolated, new region */
-        if (!pdma) {
-                dma = kzalloc(sizeof *dma, GFP_KERNEL);
-                if (!dma) {
-                        ret = -ENOMEM;
-                        vfio_dma_unmap(iommu, iova, npage, prot);
-                        goto out_lock;
+        if (ret) {
+                struct vfio_dma *tmp;
+                iova = map->iova;
+                size = map->size;
+                while ((tmp = vfio_find_dma(iommu, iova, size))) {
+                        int r = vfio_remove_dma_overlap(iommu, iova,
+                                                        &size, tmp);
+                        if (WARN_ON(r || !size))
+                                break;
                 }
-
-                dma->npage = npage;
-                dma->iova = iova;
-                dma->vaddr = vaddr;
-                dma->prot = prot;
-                list_add(&dma->next, &iommu->dma_list);
         }
 
-out_lock:
         mutex_unlock(&iommu->lock);
         return ret;
 }
@@ -606,7 +775,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
                 return ERR_PTR(-ENOMEM);
 
         INIT_LIST_HEAD(&iommu->group_list);
-        INIT_LIST_HEAD(&iommu->dma_list);
+        iommu->dma_list = RB_ROOT;
         mutex_init(&iommu->lock);
 
         /*
@@ -640,7 +809,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
 {
         struct vfio_iommu *iommu = iommu_data;
         struct vfio_group *group, *group_tmp;
-        struct vfio_dma *dma, *dma_tmp;
+        struct rb_node *node;
 
         list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
                 iommu_detach_group(iommu->domain, group->iommu_group);
@@ -648,10 +817,12 @@ static void vfio_iommu_type1_release(void *iommu_data)
                 kfree(group);
         }
 
-        list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) {
-                vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
-                list_del(&dma->next);
-                kfree(dma);
+        while ((node = rb_first(&iommu->dma_list))) {
+                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+                size_t size = dma->size;
+                vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
+                if (WARN_ON(!size))
+                        break;
         }
 
         iommu_domain_free(iommu->domain);
@@ -706,6 +877,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
         } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
                 struct vfio_iommu_type1_dma_unmap unmap;
+                long ret;
 
                 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
@@ -715,7 +887,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
                 if (unmap.argsz < minsz || unmap.flags)
                         return -EINVAL;
 
-                return vfio_dma_do_unmap(iommu, &unmap);
+                ret = vfio_dma_do_unmap(iommu, &unmap);
+                if (ret)
+                        return ret;
+
+                return copy_to_user((void __user *)arg, &unmap, minsz);
         }
 
         return -ENOTTY;
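The disable_hugepages parameter added above is registered with S_IRUGO | S_IWUSR, so besides being set at module load time it should also appear as a writable file under /sys/module/vfio_iommu_type1/parameters/ once the module is loaded (standard module-parameter behaviour, not something introduced by this diff). A hypothetical sketch of flipping it from userspace; mappings pinned after the write take the single-page path:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Assumed path, following the usual /sys/module/<name>/parameters layout */
        const char *path =
                "/sys/module/vfio_iommu_type1/parameters/disable_hugepages";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* bool module parameters accept Y/N or 1/0 */
        if (write(fd, "Y", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}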