path: root/drivers/xen
author     Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>    2010-05-11 10:05:49 -0400
committer  Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>    2010-07-27 11:51:00 -0400
commit     b097186fd29d5bc5a26d1ae87995821ffc27b66e (patch)
tree       64aeb8bf4f6e0eea409d4ad5e12d9788a136732c /drivers/xen
parent     d2cb214551de8180542a04ec8c86c0c9412c5124 (diff)
swiotlb-xen: SWIOTLB library for Xen PV guest with PCI passthrough.
This patchset:

PV guests under Xen are running in a non-contiguous memory architecture. When PCI pass-through is utilized, this necessitates an IOMMU for translating bus (DMA) to virtual and vice-versa and also providing a mechanism to have contiguous pages for device drivers operations (say DMA operations).

Specifically, under Xen the Linux idea of pages is an illusion. It assumes that pages start at zero and go up to the available memory. To help with that, the Linux Xen MMU provides a lookup mechanism to translate the page frame numbers (PFN) to machine frame numbers (MFN) and vice-versa. The MFNs are the "real" frame numbers. Furthermore, memory is not contiguous: the Xen hypervisor stitches memory for guests from different pools, which means there is no guarantee that PFN==MFN and PFN+1==MFN+1. Lastly, with Xen 4.0, pages (in debug mode) are allocated in descending order (high to low), meaning the guest might never get any MFNs under the 4GB mark.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Albert Herranz <albert_herranz@yahoo.es>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
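To make the PFN/MFN distinction above concrete, the sketch below is illustrative only and is not part of the patch; the helper name is invented, while pfn_to_mfn() is the real lookup available via xen/page.h. It shows the adjacency that cannot be assumed under Xen, and which the new check_pages_physically_contiguous() verifies across whole buffers:

	/* Illustrative only: true when two neighbouring guest PFNs are also
	 * backed by neighbouring machine frames; under Xen this may be false
	 * for any pair of pages. */
	static bool pfns_machine_adjacent(unsigned long pfn)
	{
		return pfn_to_mfn(pfn + 1) == pfn_to_mfn(pfn) + 1;
	}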
Diffstat (limited to 'drivers/xen')
-rw-r--r--  drivers/xen/Kconfig        |   4
-rw-r--r--  drivers/xen/Makefile       |   3
-rw-r--r--  drivers/xen/swiotlb-xen.c  | 515
3 files changed, 521 insertions, 1 deletion
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index fad3df2c1276..97199c2a64a0 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -62,4 +62,8 @@ config XEN_SYS_HYPERVISOR
 	 virtual environment, /sys/hypervisor will still be present,
 	 but will have no xen contents.
 
+config SWIOTLB_XEN
+	def_bool y
+	depends on SWIOTLB
+
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 7c284342f30f..85f84cff8104 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,4 +9,5 @@ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
 obj-$(CONFIG_XEN_BALLOON) += balloon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
 obj-$(CONFIG_XENFS) += xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file
+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
new file mode 100644
index 000000000000..54469c3eeacd
--- /dev/null
+++ b/drivers/xen/swiotlb-xen.c
@@ -0,0 +1,515 @@
/*
 * Copyright 2010
 * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 *
 * This code provides an IOMMU for Xen PV guests with PCI passthrough.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License v2.0 as published by
 * the Free Software Foundation
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * PV guests under Xen are running in a non-contiguous memory architecture.
 *
 * When PCI pass-through is utilized, this necessitates an IOMMU for
 * translating bus (DMA) to virtual and vice-versa and also providing a
 * mechanism to have contiguous pages for device drivers operations (say DMA
 * operations).
 *
 * Specifically, under Xen the Linux idea of pages is an illusion. It
 * assumes that pages start at zero and go up to the available memory. To
 * help with that, the Linux Xen MMU provides a lookup mechanism to
 * translate the page frame numbers (PFN) to machine frame numbers (MFN)
 * and vice-versa. The MFNs are the "real" frame numbers. Furthermore,
 * memory is not contiguous: the Xen hypervisor stitches memory for guests
 * from different pools, which means there is no guarantee that PFN==MFN
 * and PFN+1==MFN+1. Lastly, with Xen 4.0, pages (in debug mode) are
 * allocated in descending order (high to low), meaning the guest might
 * never get any MFNs under the 4GB mark.
 *
 */

#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
#include <xen/swiotlb-xen.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
/*
 * Used to do a quick range check in swiotlb_tbl_unmap_single and
 * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
 * API.
 */

static char *xen_io_tlb_start, *xen_io_tlb_end;
static unsigned long xen_io_tlb_nslabs;
/*
 * Quick lookup value of the bus address of the IOTLB.
 */

u64 start_dma_addr;

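/*
 * Address translation helpers: convert between the guest's pseudo-physical
 * addresses and the machine (bus) addresses the device actually sees, via
 * the Xen P2M/M2P lookups.
 */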
static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
{
	return phys_to_machine(XPADDR(paddr)).maddr;
}

static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
{
	return machine_to_phys(XMADDR(baddr)).paddr;
}

static dma_addr_t xen_virt_to_bus(void *address)
{
	return xen_phys_to_bus(virt_to_phys(address));
}

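/*
 * Returns 1 if the machine frames backing the pages spanned by
 * [offset, offset + length), starting at pfn, are contiguous, 0 otherwise.
 */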
static int check_pages_physically_contiguous(unsigned long pfn,
					     unsigned int offset,
					     size_t length)
{
	unsigned long next_mfn;
	int i;
	int nr_pages;

	next_mfn = pfn_to_mfn(pfn);
	nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;

	for (i = 1; i < nr_pages; i++) {
		if (pfn_to_mfn(++pfn) != ++next_mfn)
			return 0;
	}
	return 1;
}

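/*
 * Returns 1 if a buffer at physical address p of the given size crosses a
 * page boundary and its pages are not machine-contiguous, i.e. it cannot be
 * handed to a device as a single DMA region.
 */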
static int range_straddles_page_boundary(phys_addr_t p, size_t size)
{
	unsigned long pfn = PFN_DOWN(p);
	unsigned int offset = p & ~PAGE_MASK;

	if (offset + size <= PAGE_SIZE)
		return 0;
	if (check_pages_physically_contiguous(pfn, offset, size))
		return 0;
	return 1;
}

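/*
 * Returns 1 if the (machine) DMA address falls within our bounce buffer,
 * i.e. it was handed out by swiotlb_tbl_map_single.
 */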
static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
{
	unsigned long mfn = PFN_DOWN(dma_addr);
	unsigned long pfn = mfn_to_local_pfn(mfn);
	phys_addr_t paddr;

	/* If the address is outside our domain, it CAN
	 * have the same virtual address as another address
	 * in our domain. Therefore we _only_ check addresses within our domain.
	 */
	if (pfn_valid(pfn)) {
		paddr = PFN_PHYS(pfn);
		return paddr >= virt_to_phys(xen_io_tlb_start) &&
		       paddr < virt_to_phys(xen_io_tlb_end);
	}
	return 0;
}

static int max_dma_bits = 32;

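/*
 * Exchange the bootmem-allocated buffer, IO_TLB_SEGSIZE slabs at a time,
 * for machine pages that are contiguous and addressable within dma_bits,
 * widening the mask up to max_dma_bits if the hypervisor cannot satisfy
 * the narrower request.
 */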
static int
xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
{
	int i, rc;
	int dma_bits;

	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;

	i = 0;
	do {
		int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);

		do {
			rc = xen_create_contiguous_region(
				(unsigned long)buf + (i << IO_TLB_SHIFT),
				get_order(slabs << IO_TLB_SHIFT),
				dma_bits);
		} while (rc && dma_bits++ < max_dma_bits);
		if (rc)
			return rc;

		i += slabs;
	} while (i < nslabs);
	return 0;
}

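/*
 * Allocate a 64MB bounce buffer from bootmem, swap its backing pages for
 * machine-contiguous pages under 4GB, and hand the buffer to the generic
 * SWIOTLB code.
 */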
void __init xen_swiotlb_init(int verbose)
{
	unsigned long bytes;
	int rc;

	xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
	xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);

	bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;

	/*
	 * Get IO TLB memory from any location.
	 */
	xen_io_tlb_start = alloc_bootmem(bytes);
	if (!xen_io_tlb_start)
		panic("Cannot allocate SWIOTLB buffer");

	xen_io_tlb_end = xen_io_tlb_start + bytes;
	/*
	 * And replace that memory with pages under 4GB.
	 */
	rc = xen_swiotlb_fixup(xen_io_tlb_start,
			       bytes,
			       xen_io_tlb_nslabs);
	if (rc)
		goto error;

	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
	swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);

	return;
error:
	panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "
	      "We either don't have the permission or you do not have enough "
	      "free memory under 4GB!\n", rc);
}

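/*
 * Allocate a coherent DMA buffer: grab free pages, ask Xen to make them
 * machine-contiguous and addressable under the device's coherent DMA mask,
 * and report the machine address through *dma_handle.
 */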
void *
xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags)
{
	void *ret;
	int order = get_order(size);
	u64 dma_mask = DMA_BIT_MASK(32);
	unsigned long vstart;

	/*
	 * Ignore region specifiers - the kernel's idea of
	 * pseudo-phys memory layout has nothing to do with the
	 * machine physical layout.  We can't allocate highmem
	 * because we can't return a pointer to it.
	 */
	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);

	if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
		return ret;

	vstart = __get_free_pages(flags, order);
	ret = (void *)vstart;

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = dma_alloc_coherent_mask(hwdev, flags);

	if (ret) {
		if (xen_create_contiguous_region(vstart, order,
						 fls64(dma_mask)) != 0) {
			free_pages(vstart, order);
			return NULL;
		}
		memset(ret, 0, size);
		*dma_handle = virt_to_machine(ret).maddr;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);

void
xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
			  dma_addr_t dev_addr)
{
	int order = get_order(size);

	if (dma_release_from_coherent(hwdev, order, vaddr))
		return;

	xen_destroy_contiguous_region((unsigned long)vaddr, order);
	free_pages((unsigned long)vaddr, order);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);


/*
 * Map a single buffer of the indicated size for DMA in streaming mode.  The
 * physical address to use is returned.
 *
 * Once the device is given the dma address, the device owns this memory until
 * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
 */
dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
				unsigned long offset, size_t size,
				enum dma_data_direction dir,
				struct dma_attrs *attrs)
{
	phys_addr_t phys = page_to_phys(page) + offset;
	dma_addr_t dev_addr = xen_phys_to_bus(phys);
	void *map;

	BUG_ON(dir == DMA_NONE);
	/*
	 * If the address happens to be in the device's DMA window,
	 * we can safely return the device addr and not worry about bounce
	 * buffering it.
	 */
	if (dma_capable(dev, dev_addr, size) &&
	    !range_straddles_page_boundary(phys, size) && !swiotlb_force)
		return dev_addr;

	/*
	 * Oh well, have to allocate and map a bounce buffer.
	 */
	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
	if (!map)
		return DMA_ERROR_CODE;

	dev_addr = xen_virt_to_bus(map);

	/*
	 * Ensure that the address returned is DMA'ble
	 */
	if (!dma_capable(dev, dev_addr, size))
		panic("map_single: bounce buffer is not DMA'ble");

	return dev_addr;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);

/*
 * Unmap a single streaming mode DMA translation.  The dma_addr and size must
 * match what was provided for in a previous xen_swiotlb_map_page call.  All
 * other usages are undefined.
 *
 * After this call, reads by the cpu to the buffer are guaranteed to see
 * whatever the device wrote there.
 */
static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dev_addr);

	BUG_ON(dir == DMA_NONE);

	/* NOTE: We use dev_addr here, not paddr! */
	if (is_xen_swiotlb_buffer(dev_addr)) {
		swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
		return;
	}

	if (dir != DMA_FROM_DEVICE)
		return;

	/*
	 * phys_to_virt doesn't work with highmem pages, but we could
	 * call dma_mark_clean() with a highmem page here.  However, we
	 * are fine since dma_mark_clean() is a no-op on POWERPC.  We can
	 * make dma_mark_clean() take a physical address if necessary.
	 */
	dma_mark_clean(phys_to_virt(paddr), size);
}

void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
			    size_t size, enum dma_data_direction dir,
			    struct dma_attrs *attrs)
{
	xen_unmap_single(hwdev, dev_addr, size, dir);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);

/*
 * Make physical memory consistent for a single streaming mode DMA translation
 * after a transfer.
 *
 * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
 * using the cpu, yet do not wish to tear down the dma mapping, you must
 * call this function before doing so.  At the next point you give the dma
 * address back to the card, you must first perform a
 * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer.
 */
static void
xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
			size_t size, enum dma_data_direction dir,
			enum dma_sync_target target)
{
	phys_addr_t paddr = xen_bus_to_phys(dev_addr);

	BUG_ON(dir == DMA_NONE);

	/* NOTE: We use dev_addr here, not paddr! */
	if (is_xen_swiotlb_buffer(dev_addr)) {
		swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
					target);
		return;
	}

	if (dir != DMA_FROM_DEVICE)
		return;

	dma_mark_clean(phys_to_virt(paddr), size);
}

void
xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
				size_t size, enum dma_data_direction dir)
{
	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);

void
xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
				   size_t size, enum dma_data_direction dir)
{
	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device);

/*
 * Map a set of buffers described by scatterlist in streaming mode for DMA.
 * This is the scatter-gather version of the above xen_swiotlb_map_page
 * interface.  Here the scatter gather list elements are each tagged with the
 * appropriate dma address and length.  They are obtained via
 * sg_dma_{address,length}(SG).
 *
 * NOTE: An implementation may be able to use a smaller number of
 *       DMA address/length pairs than there are SG table elements.
 *       (for example via virtual mapping capabilities)
 *       The routine returns the number of addr/length pairs actually
 *       used, at most nents.
 *
 * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
 * same here.
 */
int
xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
			 int nelems, enum dma_data_direction dir,
			 struct dma_attrs *attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i) {
		phys_addr_t paddr = sg_phys(sg);
		dma_addr_t dev_addr = xen_phys_to_bus(paddr);

		if (swiotlb_force ||
		    !dma_capable(hwdev, dev_addr, sg->length) ||
		    range_straddles_page_boundary(paddr, sg->length)) {
			void *map = swiotlb_tbl_map_single(hwdev,
							   start_dma_addr,
							   sg_phys(sg),
							   sg->length, dir);
			if (!map) {
				/* Don't panic here, we expect map_sg users
				   to do proper error handling. */
				xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
							   attrs);
				sgl[0].dma_length = 0;
				return DMA_ERROR_CODE;
			}
			sg->dma_address = xen_virt_to_bus(map);
		} else
			sg->dma_address = dev_addr;
		sg->dma_length = sg->length;
	}
	return nelems;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs);

int
xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
		   enum dma_data_direction dir)
{
	return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg);

/*
 * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
 * concerning calls here are the same as for swiotlb_unmap_page() above.
 */
void
xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i)
		xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);

}
EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs);

void
xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
		     enum dma_data_direction dir)
{
	return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg);

/*
 * Make physical memory consistent for a set of streaming mode DMA translations
 * after a transfer.
 *
 * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
 * and usage.
 */
static void
xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
		    int nelems, enum dma_data_direction dir,
		    enum dma_sync_target target)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i)
		xen_swiotlb_sync_single(hwdev, sg->dma_address,
					sg->dma_length, dir, target);
}

void
xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
			    int nelems, enum dma_data_direction dir)
{
	xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu);

void
xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
			       int nelems, enum dma_data_direction dir)
{
	xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device);

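/*
 * xen_swiotlb_map_page/map_sg_attrs report a failed mapping by returning
 * DMA_ERROR_CODE (0 on x86), hence the simple !dma_addr test below.
 */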
int
xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
{
	return !dma_addr;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error);

/*
 * Return whether the given device DMA address mask can be supported
 * properly.  For example, if your device can only drive the low 24-bits
 * during bus mastering, then you would pass 0x00ffffff as the mask to
 * this function.
 */
int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported);
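For orientation only, here is a minimal sketch (not part of the patch; the device, page and error handling are hypothetical) of how a streaming-DMA round trip exercises the helpers exported above. In practice a driver calls the generic dma_map_page()/dma_unmap_page() wrappers and the architecture routes them to swiotlb-xen; the direct calls below are purely illustrative:

	static int example_dma_roundtrip(struct device *dev, struct page *pg)
	{
		dma_addr_t bus;

		/* Map one page for device-to-memory transfers; this may
		 * bounce through the Xen IOTLB if the machine address is
		 * not reachable by the device. */
		bus = xen_swiotlb_map_page(dev, pg, 0, PAGE_SIZE,
					   DMA_FROM_DEVICE, NULL);
		if (xen_swiotlb_dma_mapping_error(dev, bus))
			return -ENOMEM;

		/* ... program the hardware with 'bus' and wait for it ... */

		/* Pull the data back to the CPU view, then tear down. */
		xen_swiotlb_sync_single_for_cpu(dev, bus, PAGE_SIZE,
						DMA_FROM_DEVICE);
		xen_swiotlb_unmap_page(dev, bus, PAGE_SIZE, DMA_FROM_DEVICE,
				       NULL);
		return 0;
	}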