 Documentation/vfio.txt              |  31
 arch/powerpc/include/asm/iommu.h    |   6
 drivers/vfio/vfio_iommu_spapr_tce.c | 501
 include/uapi/linux/vfio.h           |  27
 4 files changed, 482 insertions(+), 83 deletions(-)
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 4c746a7e717a..dcc37e109c68 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
 
 This implementation has some specifics:
 
-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported, as an IOMMU table is allocated at boot time,
+one table per IOMMU group, which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have an improved hardware design that
+removes this limitation and allows multiple IOMMU groups per VFIO container.
 
 2) The hardware supports so called DMA windows - the PCI address range
 within which DMA transfer is allowed, any attempt to access address space
@@ -439,6 +441,29 @@ The code flow from the example above should be slightly changed:
 
 ....
 
+5) There is a v2 of the SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
+VFIO_IOMMU_DISABLE and implements two new ioctls:
+VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
+(which are unsupported in the v1 IOMMU).
+
+PPC64 paravirtualized guests generate a lot of map/unmap requests,
+and the handling of those includes pinning/unpinning pages and updating
+the mm::locked_vm counter to make sure we do not exceed the rlimit.
+The v2 IOMMU splits accounting and pinning into separate operations:
+
+- The VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
+ioctls receive a user space address and the size of the block to be pinned.
+Bisecting is not supported and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY is expected
+to be called with the exact address and size used for registering
+the memory block. Userspace is not expected to call these often.
+The ranges are stored in a linked list in a VFIO container.
+
+- The VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
+IOMMU table and do no pinning; instead they check that the userspace
+address is from a pre-registered range.
+
+This separation helps in optimizing DMA for guests.
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
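
A minimal sketch of the v2 flow described in the hunk above, following the
style of the earlier example in this file (error handling elided; SIZE and
the group number 26 are placeholders; struct vfio_iommu_type1_dma_map is the
existing UAPI structure this driver reuses for MAP_DMA):

        container = open("/dev/vfio/vfio", O_RDWR);
        group = open("/dev/vfio/26", O_RDWR);
        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);

        /* Pin the pages and do the locked_vm accounting exactly once */
        reg.argsz = sizeof(reg);
        reg.flags = 0;
        reg.vaddr = (__u64)(unsigned long)mem;
        reg.size = SIZE;
        ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);

        /* Mapping is now a cheap TCE table update with no pinning */
        dma_map.argsz = sizeof(dma_map);
        dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
        dma_map.vaddr = (__u64)(unsigned long)mem;
        dma_map.iova = 0;
        dma_map.size = SIZE;
        ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);

where reg is a struct vfio_iommu_spapr_register_memory and mem is a
page-aligned buffer inside a registered range.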
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 9d3749287689..f9957eb4c659 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -112,9 +112,15 @@ struct iommu_table {
         unsigned long *it_map;       /* A simple allocation bitmap for now */
         unsigned long  it_page_shift;/* table iommu page size */
         struct list_head it_group_list;/* List of iommu_table_group_link */
+        unsigned long *it_userspace; /* userspace view of the table */
         struct iommu_table_ops *it_ops;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
+        ((tbl)->it_userspace ? \
+        &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
+        NULL)
+
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
 int get_iommu_order(unsigned long size, struct iommu_table *tbl)
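
The new it_userspace array keeps, for every TCE entry, the userspace address
that was translated into it, so the v2 unmap path can find and unpin the
backing page without a reverse lookup. A hypothetical caller, mirroring what
tce_iommu_build_v2() in the driver below does:

        /* Illustration only: NULL means there is no userspace view
         * (a v1 container), in which case nothing is recorded. */
        unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

        if (pua)
                *pua = tce;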
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 203caacf2242..91a32239bd0a 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -19,8 +19,10 @@
 #include <linux/uaccess.h>
 #include <linux/err.h>
 #include <linux/vfio.h>
+#include <linux/vmalloc.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/mmu_context.h>
 
 #define DRIVER_VERSION  "0.1"
 #define DRIVER_AUTHOR   "aik@ozlabs.ru"
@@ -81,6 +83,11 @@ static void decrement_locked_vm(long npages)
  * into DMA'ble space using the IOMMU
  */
 
+struct tce_iommu_group {
+        struct list_head next;
+        struct iommu_group *grp;
+};
+
 /*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
@@ -88,11 +95,84 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
         struct mutex lock;
-        struct iommu_group *grp;
         bool enabled;
+        bool v2;
         unsigned long locked_pages;
+        struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+        struct list_head group_list;
 };
 
+static long tce_iommu_unregister_pages(struct tce_container *container,
+                __u64 vaddr, __u64 size)
+{
+        struct mm_iommu_table_group_mem_t *mem;
+
+        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
+                return -EINVAL;
+
+        mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
+        if (!mem)
+                return -ENOENT;
+
+        return mm_iommu_put(mem);
+}
+
+static long tce_iommu_register_pages(struct tce_container *container,
+                __u64 vaddr, __u64 size)
+{
+        long ret = 0;
+        struct mm_iommu_table_group_mem_t *mem = NULL;
+        unsigned long entries = size >> PAGE_SHIFT;
+
+        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
+                        ((vaddr + size) < vaddr))
+                return -EINVAL;
+
+        ret = mm_iommu_get(vaddr, entries, &mem);
+        if (ret)
+                return ret;
+
+        container->enabled = true;
+
+        return 0;
+}
+
+static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
+{
+        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
+                        tbl->it_size, PAGE_SIZE);
+        unsigned long *uas;
+        long ret;
+
+        BUG_ON(tbl->it_userspace);
+
+        ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
+        if (ret)
+                return ret;
+
+        uas = vzalloc(cb);
+        if (!uas) {
+                decrement_locked_vm(cb >> PAGE_SHIFT);
+                return -ENOMEM;
+        }
+        tbl->it_userspace = uas;
+
+        return 0;
+}
+
+static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
+{
+        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
+                        tbl->it_size, PAGE_SIZE);
+
+        if (!tbl->it_userspace)
+                return;
+
+        vfree(tbl->it_userspace);
+        tbl->it_userspace = NULL;
+        decrement_locked_vm(cb >> PAGE_SHIFT);
+}
+
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 {
         /*
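
As the checks in tce_iommu_register_pages() above show, vaddr and size must
both be multiples of the system page size and must not wrap. A hypothetical
userspace helper, assuming <unistd.h>, that rounds a buffer out to page
boundaries before registration (keep the rounded values, as the later
unregister call must repeat them exactly):

        static void page_align(__u64 *vaddr, __u64 *size)
        {
                __u64 mask = sysconf(_SC_PAGESIZE) - 1;
                __u64 end = (*vaddr + *size + mask) & ~mask;

                *vaddr &= ~mask;        /* round the start down... */
                *size = end - *vaddr;   /* ...and the size up */
        }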
@@ -103,18 +183,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
         return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static inline bool tce_groups_attached(struct tce_container *container)
+{
+        return !list_empty(&container->group_list);
+}
+
 static long tce_iommu_find_table(struct tce_container *container,
                 phys_addr_t ioba, struct iommu_table **ptbl)
 {
         long i;
-        struct iommu_table_group *table_group;
-
-        table_group = iommu_group_get_iommudata(container->grp);
-        if (!table_group)
-                return -1;
 
         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-                struct iommu_table *tbl = table_group->tables[i];
+                struct iommu_table *tbl = container->tables[i];
 
                 if (tbl) {
                         unsigned long entry = ioba >> tbl->it_page_shift;
@@ -136,9 +216,7 @@ static int tce_iommu_enable(struct tce_container *container)
         int ret = 0;
         unsigned long locked;
         struct iommu_table_group *table_group;
-
-        if (!container->grp)
-                return -ENXIO;
+        struct tce_iommu_group *tcegrp;
 
         if (!current->mm)
                 return -ESRCH; /* process exited */
@@ -175,7 +253,12 @@ static int tce_iommu_enable(struct tce_container *container)
          * as there is no way to know how much we should increment
          * the locked_vm counter.
          */
-        table_group = iommu_group_get_iommudata(container->grp);
+        if (!tce_groups_attached(container))
+                return -ENODEV;
+
+        tcegrp = list_first_entry(&container->group_list,
+                        struct tce_iommu_group, next);
+        table_group = iommu_group_get_iommudata(tcegrp->grp);
         if (!table_group)
                 return -ENODEV;
 
@@ -211,7 +294,7 @@ static void *tce_iommu_open(unsigned long arg)
 {
         struct tce_container *container;
 
-        if (arg != VFIO_SPAPR_TCE_IOMMU) {
+        if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
                 pr_err("tce_vfio: Wrong IOMMU type\n");
                 return ERR_PTR(-EINVAL);
         }
@@ -221,18 +304,45 @@ static void *tce_iommu_open(unsigned long arg)
                 return ERR_PTR(-ENOMEM);
 
         mutex_init(&container->lock);
+        INIT_LIST_HEAD_RCU(&container->group_list);
+
+        container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
         return container;
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+                struct iommu_table *tbl,
+                unsigned long entry, unsigned long pages);
+static void tce_iommu_free_table(struct iommu_table *tbl);
+
 static void tce_iommu_release(void *iommu_data)
 {
         struct tce_container *container = iommu_data;
+        struct iommu_table_group *table_group;
+        struct tce_iommu_group *tcegrp;
+        long i;
 
-        WARN_ON(container->grp);
+        while (tce_groups_attached(container)) {
+                tcegrp = list_first_entry(&container->group_list,
+                                struct tce_iommu_group, next);
+                table_group = iommu_group_get_iommudata(tcegrp->grp);
+                tce_iommu_detach_group(iommu_data, tcegrp->grp);
+        }
 
-        if (container->grp)
-                tce_iommu_detach_group(iommu_data, container->grp);
+        /*
+         * If VFIO created a table, it was not disposed
+         * by tce_iommu_detach_group() so do it now.
+         */
+        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+                struct iommu_table *tbl = container->tables[i];
+
+                if (!tbl)
+                        continue;
+
+                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+                tce_iommu_free_table(tbl);
+        }
 
         tce_iommu_disable(container);
         mutex_destroy(&container->lock);
@@ -249,6 +359,47 @@ static void tce_iommu_unuse_page(struct tce_container *container,
         put_page(page);
 }
 
+static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
+                unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
+{
+        long ret = 0;
+        struct mm_iommu_table_group_mem_t *mem;
+
+        mem = mm_iommu_lookup(tce, size);
+        if (!mem)
+                return -EINVAL;
+
+        ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
+        if (ret)
+                return -EINVAL;
+
+        *pmem = mem;
+
+        return 0;
+}
+
+static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
+                unsigned long entry)
+{
+        struct mm_iommu_table_group_mem_t *mem = NULL;
+        int ret;
+        unsigned long hpa = 0;
+        unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+        if (!pua || !current || !current->mm)
+                return;
+
+        ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
+                        &hpa, &mem);
+        if (ret)
+                pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
+                                __func__, *pua, entry, ret);
+        if (mem)
+                mm_iommu_mapped_dec(mem);
+
+        *pua = 0;
+}
+
 static int tce_iommu_clear(struct tce_container *container,
                 struct iommu_table *tbl,
                 unsigned long entry, unsigned long pages)
@@ -267,6 +418,11 @@ static int tce_iommu_clear(struct tce_container *container,
                 if (direction == DMA_NONE)
                         continue;
 
+                if (container->v2) {
+                        tce_iommu_unuse_page_v2(tbl, entry);
+                        continue;
+                }
+
                 tce_iommu_unuse_page(container, oldhpa);
         }
 
@@ -333,6 +489,64 @@ static long tce_iommu_build(struct tce_container *container,
         return ret;
 }
 
+static long tce_iommu_build_v2(struct tce_container *container,
+                struct iommu_table *tbl,
+                unsigned long entry, unsigned long tce, unsigned long pages,
+                enum dma_data_direction direction)
+{
+        long i, ret = 0;
+        struct page *page;
+        unsigned long hpa;
+        enum dma_data_direction dirtmp;
+
+        for (i = 0; i < pages; ++i) {
+                struct mm_iommu_table_group_mem_t *mem = NULL;
+                unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
+                                entry + i);
+
+                ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
+                                &hpa, &mem);
+                if (ret)
+                        break;
+
+                page = pfn_to_page(hpa >> PAGE_SHIFT);
+                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+                        ret = -EPERM;
+                        break;
+                }
+
+                /* Preserve offset within IOMMU page */
+                hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+                dirtmp = direction;
+
+                /* The registered region is being unregistered */
+                if (mm_iommu_mapped_inc(mem))
+                        break;
+
+                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+                if (ret) {
+                        /* dirtmp cannot be DMA_NONE here */
+                        tce_iommu_unuse_page_v2(tbl, entry + i);
+                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+                                        __func__, entry << tbl->it_page_shift,
+                                        tce, ret);
+                        break;
+                }
+
+                if (dirtmp != DMA_NONE)
+                        tce_iommu_unuse_page_v2(tbl, entry + i);
+
+                *pua = tce;
+
+                tce += IOMMU_PAGE_SIZE(tbl);
+        }
+
+        if (ret)
+                tce_iommu_clear(container, tbl, entry, i);
+
+        return ret;
+}
+
 static long tce_iommu_create_table(struct tce_container *container,
                 struct iommu_table_group *table_group,
                 int num,
@@ -358,6 +572,12 @@ static long tce_iommu_create_table(struct tce_container *container,
         WARN_ON(!ret && !(*ptbl)->it_ops->free);
         WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
 
+        if (!ret && container->v2) {
+                ret = tce_iommu_userspace_view_alloc(*ptbl);
+                if (ret)
+                        (*ptbl)->it_ops->free(*ptbl);
+        }
+
         if (ret)
                 decrement_locked_vm(table_size >> PAGE_SHIFT);
 
@@ -368,6 +588,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
 {
         unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
+        tce_iommu_userspace_view_free(tbl);
         tbl->it_ops->free(tbl);
         decrement_locked_vm(pages);
 }
@@ -383,6 +604,7 @@ static long tce_iommu_ioctl(void *iommu_data,
         case VFIO_CHECK_EXTENSION:
                 switch (arg) {
                 case VFIO_SPAPR_TCE_IOMMU:
+                case VFIO_SPAPR_TCE_v2_IOMMU:
                         ret = 1;
                         break;
                 default:
@@ -394,12 +616,15 @@ static long tce_iommu_ioctl(void *iommu_data,
 
         case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
                 struct vfio_iommu_spapr_tce_info info;
+                struct tce_iommu_group *tcegrp;
                 struct iommu_table_group *table_group;
 
-                if (WARN_ON(!container->grp))
+                if (!tce_groups_attached(container))
                         return -ENXIO;
 
-                table_group = iommu_group_get_iommudata(container->grp);
+                tcegrp = list_first_entry(&container->group_list,
+                                struct tce_iommu_group, next);
+                table_group = iommu_group_get_iommudata(tcegrp->grp);
 
                 if (!table_group)
                         return -ENXIO;
@@ -468,11 +693,18 @@ static long tce_iommu_ioctl(void *iommu_data,
                 if (ret)
                         return ret;
 
-                ret = tce_iommu_build(container, tbl,
-                                param.iova >> tbl->it_page_shift,
-                                param.vaddr,
-                                param.size >> tbl->it_page_shift,
-                                direction);
+                if (container->v2)
+                        ret = tce_iommu_build_v2(container, tbl,
+                                        param.iova >> tbl->it_page_shift,
+                                        param.vaddr,
+                                        param.size >> tbl->it_page_shift,
+                                        direction);
+                else
+                        ret = tce_iommu_build(container, tbl,
+                                        param.iova >> tbl->it_page_shift,
+                                        param.vaddr,
+                                        param.size >> tbl->it_page_shift,
+                                        direction);
 
                 iommu_flush_tce(tbl);
 
@@ -518,7 +750,62 @@ static long tce_iommu_ioctl(void *iommu_data,
 
                 return ret;
         }
+        case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
+                struct vfio_iommu_spapr_register_memory param;
+
+                if (!container->v2)
+                        break;
+
+                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+                                size);
+
+                if (copy_from_user(&param, (void __user *)arg, minsz))
+                        return -EFAULT;
+
+                if (param.argsz < minsz)
+                        return -EINVAL;
+
+                /* No flag is supported now */
+                if (param.flags)
+                        return -EINVAL;
+
+                mutex_lock(&container->lock);
+                ret = tce_iommu_register_pages(container, param.vaddr,
+                                param.size);
+                mutex_unlock(&container->lock);
+
+                return ret;
+        }
+        case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
+                struct vfio_iommu_spapr_register_memory param;
+
+                if (!container->v2)
+                        break;
+
+                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+                                size);
+
+                if (copy_from_user(&param, (void __user *)arg, minsz))
+                        return -EFAULT;
+
+                if (param.argsz < minsz)
+                        return -EINVAL;
+
+                /* No flag is supported now */
+                if (param.flags)
+                        return -EINVAL;
+
+                mutex_lock(&container->lock);
+                ret = tce_iommu_unregister_pages(container, param.vaddr,
+                                param.size);
+                mutex_unlock(&container->lock);
+
+                return ret;
+        }
         case VFIO_IOMMU_ENABLE:
+                if (container->v2)
+                        break;
+
                 mutex_lock(&container->lock);
                 ret = tce_iommu_enable(container);
                 mutex_unlock(&container->lock);
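
The two handlers above pass the parameters straight through, so unregistering
must repeat the exact vaddr/size pair used at registration time: a bisected
range makes mm_iommu_find() return NULL and the ioctl fails with ENOENT.
A sketch, reusing the reg structure from the registration example earlier:

        /* Must be the very same values as at REGISTER time */
        ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);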
@@ -526,16 +813,27 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 
         case VFIO_IOMMU_DISABLE:
+                if (container->v2)
+                        break;
+
                 mutex_lock(&container->lock);
                 tce_iommu_disable(container);
                 mutex_unlock(&container->lock);
                 return 0;
-        case VFIO_EEH_PE_OP:
-                if (!container->grp)
-                        return -ENODEV;
 
-                return vfio_spapr_iommu_eeh_ioctl(container->grp,
-                                                  cmd, arg);
+        case VFIO_EEH_PE_OP: {
+                struct tce_iommu_group *tcegrp;
+
+                ret = 0;
+                list_for_each_entry(tcegrp, &container->group_list, next) {
+                        ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
+                                        cmd, arg);
+                        if (ret)
+                                return ret;
+                }
+                return ret;
+        }
+
         }
 
         return -ENOTTY;
@@ -547,14 +845,17 @@ static void tce_iommu_release_ownership(struct tce_container *container,
         int i;
 
         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-                struct iommu_table *tbl = table_group->tables[i];
+                struct iommu_table *tbl = container->tables[i];
 
                 if (!tbl)
                         continue;
 
                 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+                tce_iommu_userspace_view_free(tbl);
                 if (tbl->it_map)
                         iommu_release_ownership(tbl);
+
+                container->tables[i] = NULL;
         }
 }
 
@@ -569,7 +870,10 @@ static int tce_iommu_take_ownership(struct tce_container *container,
                 if (!tbl || !tbl->it_map)
                         continue;
 
-                rc = iommu_take_ownership(tbl);
+                rc = tce_iommu_userspace_view_alloc(tbl);
+                if (!rc)
+                        rc = iommu_take_ownership(tbl);
+
                 if (rc) {
                         for (j = 0; j < i; ++j)
                                 iommu_release_ownership(
@@ -579,6 +883,9 @@ static int tce_iommu_take_ownership(struct tce_container *container,
                 }
         }
 
+        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+                container->tables[i] = table_group->tables[i];
+
         return 0;
 }
 
@@ -592,18 +899,8 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
                 return;
         }
 
-        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-                /* Store table pointer as unset_window resets it */
-                struct iommu_table *tbl = table_group->tables[i];
-
-                if (!tbl)
-                        continue;
-
+        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                 table_group->ops->unset_window(table_group, i);
-                tce_iommu_clear(container, tbl,
-                                tbl->it_offset, tbl->it_size);
-                tce_iommu_free_table(tbl);
-        }
 
         table_group->ops->release_ownership(table_group);
 }
@@ -611,7 +908,7 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
 static long tce_iommu_take_ownership_ddw(struct tce_container *container,
                 struct iommu_table_group *table_group)
 {
-        long ret;
+        long i, ret = 0;
         struct iommu_table *tbl = NULL;
 
         if (!table_group->ops->create_table || !table_group->ops->set_window ||
@@ -622,23 +919,45 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container,
 
         table_group->ops->take_ownership(table_group);
 
-        ret = tce_iommu_create_table(container,
-                        table_group,
-                        0, /* window number */
-                        IOMMU_PAGE_SHIFT_4K,
-                        table_group->tce32_size,
-                        1, /* default levels */
-                        &tbl);
-        if (!ret) {
-                ret = table_group->ops->set_window(table_group, 0, tbl);
+        /*
+         * If it is the first group attached, check if there is
+         * a default DMA window and create one if none as
+         * the userspace expects it to exist.
+         */
+        if (!tce_groups_attached(container) && !container->tables[0]) {
+                ret = tce_iommu_create_table(container,
+                                table_group,
+                                0, /* window number */
+                                IOMMU_PAGE_SHIFT_4K,
+                                table_group->tce32_size,
+                                1, /* default levels */
+                                &tbl);
                 if (ret)
-                        tce_iommu_free_table(tbl);
+                        goto release_exit;
                 else
-                        table_group->tables[0] = tbl;
+                        container->tables[0] = tbl;
         }
 
-        if (ret)
-                table_group->ops->release_ownership(table_group);
+        /* Set all windows to the new group */
+        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+                tbl = container->tables[i];
+
+                if (!tbl)
+                        continue;
+
+                /* Set the default window to a new group */
+                ret = table_group->ops->set_window(table_group, i, tbl);
+                if (ret)
+                        goto release_exit;
+        }
+
+        return 0;
+
+release_exit:
+        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+                table_group->ops->unset_window(table_group, i);
+
+        table_group->ops->release_ownership(table_group);
 
         return ret;
 }
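
The first attach above guarantees that the default 32-bit DMA window exists,
which is what userspace discovers through VFIO_IOMMU_SPAPR_TCE_GET_INFO.
A sketch of the query:

        struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };

        ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        /* info.dma32_window_start and info.dma32_window_size now
         * describe the window created by the first group attach */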
@@ -649,29 +968,44 @@ static int tce_iommu_attach_group(void *iommu_data,
         int ret;
         struct tce_container *container = iommu_data;
         struct iommu_table_group *table_group;
+        struct tce_iommu_group *tcegrp = NULL;
 
         mutex_lock(&container->lock);
 
         /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
                         iommu_group_id(iommu_group), iommu_group); */
-        if (container->grp) {
-                pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-                                iommu_group_id(container->grp),
-                                iommu_group_id(iommu_group));
+        table_group = iommu_group_get_iommudata(iommu_group);
+
+        if (tce_groups_attached(container) && (!table_group->ops ||
+                        !table_group->ops->take_ownership ||
+                        !table_group->ops->release_ownership)) {
                 ret = -EBUSY;
                 goto unlock_exit;
         }
 
-        if (container->enabled) {
-                pr_err("tce_vfio: attaching group #%u to enabled container\n",
-                                iommu_group_id(iommu_group));
-                ret = -EBUSY;
-                goto unlock_exit;
+        /* Check if new group has the same iommu_ops (i.e. compatible) */
+        list_for_each_entry(tcegrp, &container->group_list, next) {
+                struct iommu_table_group *table_group_tmp;
+
+                if (tcegrp->grp == iommu_group) {
+                        pr_warn("tce_vfio: Group %d is already attached\n",
+                                        iommu_group_id(iommu_group));
+                        ret = -EBUSY;
+                        goto unlock_exit;
+                }
+                table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
+                if (table_group_tmp->ops != table_group->ops) {
+                        pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
+                                        iommu_group_id(iommu_group),
+                                        iommu_group_id(tcegrp->grp));
+                        ret = -EPERM;
+                        goto unlock_exit;
+                }
         }
 
-        table_group = iommu_group_get_iommudata(iommu_group);
-        if (!table_group) {
-                ret = -ENXIO;
+        tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
+        if (!tcegrp) {
+                ret = -ENOMEM;
                 goto unlock_exit;
         }
 
@@ -681,10 +1015,15 @@ static int tce_iommu_attach_group(void *iommu_data,
         else
                 ret = tce_iommu_take_ownership_ddw(container, table_group);
 
-        if (!ret)
-                container->grp = iommu_group;
+        if (!ret) {
+                tcegrp->grp = iommu_group;
+                list_add(&tcegrp->next, &container->group_list);
+        }
 
 unlock_exit:
+        if (ret && tcegrp)
+                kfree(tcegrp);
+
         mutex_unlock(&container->lock);
 
         return ret;
@@ -695,24 +1034,26 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
         struct tce_container *container = iommu_data;
         struct iommu_table_group *table_group;
+        bool found = false;
+        struct tce_iommu_group *tcegrp;
 
         mutex_lock(&container->lock);
-        if (iommu_group != container->grp) {
-                pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
-                                iommu_group_id(iommu_group),
-                                iommu_group_id(container->grp));
-                goto unlock_exit;
+
+        list_for_each_entry(tcegrp, &container->group_list, next) {
+                if (tcegrp->grp == iommu_group) {
+                        found = true;
+                        break;
+                }
         }
 
-        if (container->enabled) {
-                pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-                                iommu_group_id(container->grp));
-                tce_iommu_disable(container);
+        if (!found) {
+                pr_warn("tce_vfio: detaching unattached group #%u\n",
+                                iommu_group_id(iommu_group));
+                goto unlock_exit;
         }
 
-        /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-                iommu_group_id(iommu_group), iommu_group); */
-        container->grp = NULL;
+        list_del(&tcegrp->next);
+        kfree(tcegrp);
 
         table_group = iommu_group_get_iommudata(iommu_group);
         BUG_ON(!table_group);
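
With the reworked attach/detach above, a v2 container can hold several groups
provided their table_group ops match; an incompatible group is rejected with
EPERM and a duplicate attach with EBUSY. A sketch, where group2 is a second
hypothetical /dev/vfio/NN descriptor:

        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);

        /* The second group joins the same container and shares
         * its DMA windows */
        ioctl(group2, VFIO_GROUP_SET_CONTAINER, &container);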
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e4fa1995f613..fa84391a0d00 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -36,6 +36,8 @@
 /* Two-stage IOMMU */
 #define VFIO_TYPE1_NESTING_IOMMU        6       /* Implies v2 */
 
+#define VFIO_SPAPR_TCE_v2_IOMMU         7
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -507,6 +509,31 @@ struct vfio_eeh_pe_op {
 
 #define VFIO_EEH_PE_OP  _IO(VFIO_TYPE, VFIO_BASE + 21)
 
+/**
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get faster.
+ */
+struct vfio_iommu_spapr_register_memory {
+        __u32   argsz;
+        __u32   flags;
+        __u64   vaddr;                          /* Process virtual address */
+        __u64   size;                           /* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY        _IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
+ *
+ * Unregisters user space memory registered with
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
+ * Uses vfio_iommu_spapr_register_memory for parameters.
+ */
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY      _IO(VFIO_TYPE, VFIO_BASE + 18)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
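
Since VFIO_CHECK_EXTENSION now reports both types, userspace can probe for v2
and fall back to v1 on older kernels or pre-IODA2 hosts. A minimal sketch,
assuming the VFIO core forwards the extension check for a container that has
no IOMMU set yet:

        int model = VFIO_SPAPR_TCE_v2_IOMMU;

        if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU) <= 0)
                model = VFIO_SPAPR_TCE_IOMMU;

        ioctl(container, VFIO_SET_IOMMU, model);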