author	Alexey Kardashevskiy <aik@ozlabs.ru>	2013-05-20 23:33:10 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2013-06-20 02:55:14 -0400
commit	5ffd229c02731a91d08ca21e76b503c5bbb5c095 (patch)
tree	ed5f684cb6a5b683846d6b3d97fa2c0999835c24
parent	4e13c1ac6baa1d6c2b650d66ca89e1e12727ec19 (diff)
powerpc/vfio: Implement IOMMU driver for VFIO
VFIO implements platform independent stuff such as a PCI driver, BAR
access (via read/write on a file descriptor or direct mapping when
possible) and IRQ signaling. The platform dependent part includes IOMMU
initialization and handling.

This implements an IOMMU driver for VFIO which does mapping/unmapping of
pages for guest IO and provides information about the DMA window
(required by a POWER guest).

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--	Documentation/vfio.txt	63
-rw-r--r--	drivers/vfio/Kconfig	6
-rw-r--r--	drivers/vfio/Makefile	1
-rw-r--r--	drivers/vfio/vfio.c	1
-rw-r--r--	drivers/vfio/vfio_iommu_spapr_tce.c	377
-rw-r--r--	include/uapi/linux/vfio.h	34
6 files changed, 482 insertions, 0 deletions
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 8eda3635a17d..c55533c0adb3 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+-------------------------------------------------------------------------------
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported, as an IOMMU group
+represents the minimal entity for which isolation can be guaranteed, and
+groups are allocated statically, one per Partitionable Endpoint (PE)
+(a PE is often a PCI domain, but not always).
+
+2) The hardware supports so-called DMA windows - the PCI address ranges
+within which DMA transfers are allowed; any attempt to access address space
+outside the window leads to isolation of the whole PE.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA, which normally maps 1..32 pages per call, and
+currently there is no way to reduce the number of calls. To make things
+faster, the map/unmap handling has been implemented in real mode, which
+provides excellent performance but has limitations such as the inability
+to do locked pages accounting in real time.
+
+So 3 additional ioctls have been added:
+
+	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+		of the DMA window on the PCI bus.
+
+	VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+		is done at this point. This lets the user first find out what
+		the DMA window is and adjust the rlimit before doing any real job.
+
+	VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+	.....
+	/* Add the group to the container */
+	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+	/* Enable the IOMMU model we want */
+	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
+
+	/* Get additional sPAPR IOMMU info */
+	struct vfio_iommu_spapr_tce_info spapr_iommu_info = { .argsz = sizeof(spapr_iommu_info) };
+	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+	if (ioctl(container, VFIO_IOMMU_ENABLE))
+		exit(1); /* Cannot enable container, maybe a low rlimit */
+
+	/* Allocate some space and set up a DMA mapping */
+	dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	dma_map.size = 1024 * 1024;
+	dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+	/* Check here if .iova/.size are within the DMA window from spapr_iommu_info */
+
+	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+	.....
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec0abd1..b464687f6e14 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a0e38b..72bfabc8629e 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6d78736563de..259ad282ae5d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1415,6 +1415,7 @@ static int __init vfio_init(void)
 	 * drivers.
 	 */
 	request_module_nowait("vfio_iommu_type1");
+	request_module_nowait("vfio_iommu_spapr_tce");
 
 	return 0;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 000000000000..bdae7a04af75
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,377 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2013 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+	bool enabled;
+};
+
+static int tce_iommu_enable(struct tce_container *container)
+{
+	int ret = 0;
+	unsigned long locked, lock_limit, npages;
+	struct iommu_table *tbl = container->tbl;
+
+	if (!container->tbl)
+		return -ENXIO;
+
+	if (!current->mm)
+		return -ESRCH; /* process exited */
+
+	if (container->enabled)
+		return -EBUSY;
+
+	/*
+	 * When userspace pages are mapped into the IOMMU, they are effectively
+	 * locked memory, so, theoretically, we need to update the accounting
+	 * of locked pages on each map and unmap.  For powerpc, the map unmap
+	 * paths can be very hot, though, and the accounting would kill
+	 * performance, especially since it would be difficult to impossible
+	 * to handle the accounting in real mode only.
+	 *
+	 * To address that, rather than precisely accounting every page, we
+	 * instead account for a worst case on locked memory when the iommu is
+	 * enabled and disabled.  The worst case upper bound on locked memory
+	 * is the size of the whole iommu window, which is usually relatively
+	 * small (compared to total memory sizes) on POWER hardware.
+	 *
+	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+	 * that would effectively kill the guest at random points, much better
+	 * enforcing the limit based on the max that the guest can map.
+	 */
+	down_write(&current->mm->mmap_sem);
+	npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		ret = -ENOMEM;
+	} else {
+
+		current->mm->locked_vm += npages;
+		container->enabled = true;
+	}
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void tce_iommu_disable(struct tce_container *container)
+{
+	if (!container->enabled)
+		return;
+
+	container->enabled = false;
+
+	if (!container->tbl || !current->mm)
+		return;
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->locked_vm -= (container->tbl->it_size <<
+			IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	up_write(&current->mm->mmap_sem);
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	tce_iommu_disable(container);
+
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		struct vfio_iommu_type1_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		unsigned long tce, i;
+
+		if (!tbl)
+			return -ENXIO;
+
+		BUG_ON(!tbl->it_group);
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE))
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* iova is checked by the IOMMU API */
+		tce = param.vaddr;
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			tce |= TCE_PCI_READ;
+		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			tce |= TCE_PCI_WRITE;
+
+		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
+			ret = iommu_put_tce_user_mode(tbl,
+					(param.iova >> IOMMU_PAGE_SHIFT) + i,
+					tce);
+			if (ret)
+				break;
+			tce += IOMMU_PAGE_SIZE;
+		}
+		if (ret)
+			iommu_clear_tces_and_put_pages(tbl,
+					param.iova >> IOMMU_PAGE_SHIFT, i);
+
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		struct vfio_iommu_type1_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		if (param.size & ~IOMMU_PAGE_MASK)
+			return -EINVAL;
+
+		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret)
+			return ret;
+
+		ret = iommu_clear_tces_and_put_pages(tbl,
+				param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_ENABLE:
+		mutex_lock(&container->lock);
+		ret = tce_iommu_enable(container);
+		mutex_unlock(&container->lock);
+		return ret;
+
+
+	case VFIO_IOMMU_DISABLE:
+		mutex_lock(&container->lock);
+		tce_iommu_disable(container);
+		mutex_unlock(&container->lock);
+		return 0;
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	int ret;
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+
+	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group); */
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else if (container->enabled) {
+		pr_err("tce_vfio: attaching group #%u to enabled container\n",
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else {
+		ret = iommu_take_ownership(tbl);
+		if (!ret)
+			container->tbl = tbl;
+	}
+
+	mutex_unlock(&container->lock);
+
+	return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+		if (container->enabled) {
+			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+					iommu_group_id(tbl->it_group));
+			tce_iommu_disable(container);
+		}
+
+		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group); */
+		container->tbl = NULL;
+		iommu_release_ownership(tbl);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
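
The worst-case accounting in tce_iommu_enable() charges the whole IOMMU window to locked_vm in system-page units, npages = (it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT, instead of accounting each map/unmap. A standalone sketch of that arithmetic with assumed values (a 1 GB window of 4 KiB TCEs on a 64 KiB-page ppc64 kernel; the constants below are illustrative, not read from hardware):

	#include <stdio.h>

	/* Illustration only: IOMMU_PAGE_SHIFT is 12 (4 KiB TCE pages) in this
	 * patch; PAGE_SHIFT is 16 on a 64 KiB-page ppc64 kernel. */
	#define IOMMU_PAGE_SHIFT	12
	#define PAGE_SHIFT		16

	int main(void)
	{
		unsigned long it_size = 1UL << 18;	/* 256K TCE entries = 1 GB window */
		unsigned long npages = (it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;

		/* 1 GB window / 64 KiB system pages = 16384 pages of locked_vm */
		printf("window %lu MB -> %lu pages charged to locked_vm\n",
		       (it_size << IOMMU_PAGE_SHIFT) >> 20, npages);
		return 0;
	}

This is why enabling a container can fail with -ENOMEM even before any page is mapped: the entire window must fit under RLIMIT_MEMLOCK at VFIO_IOMMU_ENABLE time.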
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 284ff2436829..87ee4f4cff25 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -22,6 +22,7 @@
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA; these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA addresses passed via the map/unmap ioctls are absolute PCI bus
+ * addresses too, so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, it is always 4k.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* ***************************************************************** */
+
 #endif /* _UAPIVFIO_H */
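
The comment above notes that the window works as a filter on absolute PCI bus addresses rather than an offset, and the driver rejects requests that are not aligned to IOMMU_PAGE_MASK, so userspace should validate .iova/.size against dma32_window_start/dma32_window_size before calling VFIO_IOMMU_MAP_DMA. A hypothetical validation helper (a sketch against this header, not part of the patch):

	#include <linux/vfio.h>

	/* Sketch only: reject mappings that fall outside the reported DMA
	 * window or are not aligned to the 4 KiB IOMMU page size assumed
	 * by this interface. Returns nonzero if the range is acceptable. */
	static int spapr_iova_in_window(const struct vfio_iommu_spapr_tce_info *info,
					unsigned long long iova,
					unsigned long long size)
	{
		unsigned long long start = info->dma32_window_start;
		unsigned long long end = start + info->dma32_window_size;

		if ((iova | size) & 0xfff)
			return 0;	/* not 4 KiB aligned */
		return iova >= start && iova < end && size <= end - iova;
	}

Checking in userspace avoids a round trip into the kernel only to get -EINVAL back from the put_param_check path.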