diff options
author | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2014-02-10 19:32:38 -0500 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2014-02-11 00:07:37 -0500 |
commit | cd15b048445d0a54f7147c35a86c5a16ef231554 (patch) | |
tree | 80c7e63624143adc5d453106ec13f445438c6135 /arch | |
parent | ea961a828fe7250e954f086d74d9323c3d44c3e4 (diff) |
powerpc/powernv: Add iommu DMA bypass support for IODA2
This patch adds support for creating a direct iommu "bypass"
window on IODA2 bridges (such as Power8), allowing 64-bit DMA
capable devices to bypass iommu page translation completely, thus
significantly improving DMA performance.
Additionally, this adds a hook to the struct iommu_table so that
the IOMMU API / VFIO can disable the bypass when external ownership
is requested, since in that case, the device will be used by an
environment such as userspace or a KVM guest which must not be
allowed to bypass translations.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/include/asm/dma-mapping.h | 1 | ||||
-rw-r--r-- | arch/powerpc/include/asm/iommu.h | 1 | ||||
-rw-r--r-- | arch/powerpc/kernel/dma.c | 10 | ||||
-rw-r--r-- | arch/powerpc/kernel/iommu.c | 12 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 84 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 10 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 6 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/powernv.h | 8 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/setup.c | 9 |
9 files changed, 137 insertions, 4 deletions
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index e27e9ad6818e..150866b2a3fe 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h | |||
@@ -134,6 +134,7 @@ static inline int dma_supported(struct device *dev, u64 mask) | |||
134 | } | 134 | } |
135 | 135 | ||
136 | extern int dma_set_mask(struct device *dev, u64 dma_mask); | 136 | extern int dma_set_mask(struct device *dev, u64 dma_mask); |
137 | extern int __dma_set_mask(struct device *dev, u64 dma_mask); | ||
137 | 138 | ||
138 | #define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) | 139 | #define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) |
139 | 140 | ||
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index f7a8036579b5..42632c7a2a4e 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h | |||
@@ -77,6 +77,7 @@ struct iommu_table { | |||
77 | #ifdef CONFIG_IOMMU_API | 77 | #ifdef CONFIG_IOMMU_API |
78 | struct iommu_group *it_group; | 78 | struct iommu_group *it_group; |
79 | #endif | 79 | #endif |
80 | void (*set_bypass)(struct iommu_table *tbl, bool enable); | ||
80 | }; | 81 | }; |
81 | 82 | ||
82 | /* Pure 2^n version of get_order */ | 83 | /* Pure 2^n version of get_order */ |
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 8032b97ccdcb..ee78f6e49d64 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c | |||
@@ -191,12 +191,10 @@ EXPORT_SYMBOL(dma_direct_ops); | |||
191 | 191 | ||
192 | #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) | 192 | #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) |
193 | 193 | ||
194 | int dma_set_mask(struct device *dev, u64 dma_mask) | 194 | int __dma_set_mask(struct device *dev, u64 dma_mask) |
195 | { | 195 | { |
196 | struct dma_map_ops *dma_ops = get_dma_ops(dev); | 196 | struct dma_map_ops *dma_ops = get_dma_ops(dev); |
197 | 197 | ||
198 | if (ppc_md.dma_set_mask) | ||
199 | return ppc_md.dma_set_mask(dev, dma_mask); | ||
200 | if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL)) | 198 | if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL)) |
201 | return dma_ops->set_dma_mask(dev, dma_mask); | 199 | return dma_ops->set_dma_mask(dev, dma_mask); |
202 | if (!dev->dma_mask || !dma_supported(dev, dma_mask)) | 200 | if (!dev->dma_mask || !dma_supported(dev, dma_mask)) |
@@ -204,6 +202,12 @@ int dma_set_mask(struct device *dev, u64 dma_mask) | |||
204 | *dev->dma_mask = dma_mask; | 202 | *dev->dma_mask = dma_mask; |
205 | return 0; | 203 | return 0; |
206 | } | 204 | } |
205 | int dma_set_mask(struct device *dev, u64 dma_mask) | ||
206 | { | ||
207 | if (ppc_md.dma_set_mask) | ||
208 | return ppc_md.dma_set_mask(dev, dma_mask); | ||
209 | return __dma_set_mask(dev, dma_mask); | ||
210 | } | ||
207 | EXPORT_SYMBOL(dma_set_mask); | 211 | EXPORT_SYMBOL(dma_set_mask); |
208 | 212 | ||
209 | u64 dma_get_required_mask(struct device *dev) | 213 | u64 dma_get_required_mask(struct device *dev) |
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index d773dd440a45..88e3ec6e1d96 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c | |||
@@ -1088,6 +1088,14 @@ int iommu_take_ownership(struct iommu_table *tbl) | |||
1088 | memset(tbl->it_map, 0xff, sz); | 1088 | memset(tbl->it_map, 0xff, sz); |
1089 | iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); | 1089 | iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); |
1090 | 1090 | ||
1091 | /* | ||
1092 | * Disable iommu bypass, otherwise the user can DMA to all of | ||
1093 | * our physical memory via the bypass window instead of just | ||
1094 | * the pages that has been explicitly mapped into the iommu | ||
1095 | */ | ||
1096 | if (tbl->set_bypass) | ||
1097 | tbl->set_bypass(tbl, false); | ||
1098 | |||
1091 | return 0; | 1099 | return 0; |
1092 | } | 1100 | } |
1093 | EXPORT_SYMBOL_GPL(iommu_take_ownership); | 1101 | EXPORT_SYMBOL_GPL(iommu_take_ownership); |
@@ -1102,6 +1110,10 @@ void iommu_release_ownership(struct iommu_table *tbl) | |||
1102 | /* Restore bit#0 set by iommu_init_table() */ | 1110 | /* Restore bit#0 set by iommu_init_table() */ |
1103 | if (tbl->it_offset == 0) | 1111 | if (tbl->it_offset == 0) |
1104 | set_bit(0, tbl->it_map); | 1112 | set_bit(0, tbl->it_map); |
1113 | |||
1114 | /* The kernel owns the device now, we can restore the iommu bypass */ | ||
1115 | if (tbl->set_bypass) | ||
1116 | tbl->set_bypass(tbl, true); | ||
1105 | } | 1117 | } |
1106 | EXPORT_SYMBOL_GPL(iommu_release_ownership); | 1118 | EXPORT_SYMBOL_GPL(iommu_release_ownership); |
1107 | 1119 | ||
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7d6dcc6d5fa9..3b2b4fb3585b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/irq.h> | 21 | #include <linux/irq.h> |
22 | #include <linux/io.h> | 22 | #include <linux/io.h> |
23 | #include <linux/msi.h> | 23 | #include <linux/msi.h> |
24 | #include <linux/memblock.h> | ||
24 | 25 | ||
25 | #include <asm/sections.h> | 26 | #include <asm/sections.h> |
26 | #include <asm/io.h> | 27 | #include <asm/io.h> |
@@ -460,9 +461,39 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev | |||
460 | return; | 461 | return; |
461 | 462 | ||
462 | pe = &phb->ioda.pe_array[pdn->pe_number]; | 463 | pe = &phb->ioda.pe_array[pdn->pe_number]; |
464 | WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); | ||
463 | set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table); | 465 | set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table); |
464 | } | 466 | } |
465 | 467 | ||
468 | static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, | ||
469 | struct pci_dev *pdev, u64 dma_mask) | ||
470 | { | ||
471 | struct pci_dn *pdn = pci_get_pdn(pdev); | ||
472 | struct pnv_ioda_pe *pe; | ||
473 | uint64_t top; | ||
474 | bool bypass = false; | ||
475 | |||
476 | if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) | ||
477 | return -ENODEV;; | ||
478 | |||
479 | pe = &phb->ioda.pe_array[pdn->pe_number]; | ||
480 | if (pe->tce_bypass_enabled) { | ||
481 | top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; | ||
482 | bypass = (dma_mask >= top); | ||
483 | } | ||
484 | |||
485 | if (bypass) { | ||
486 | dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); | ||
487 | set_dma_ops(&pdev->dev, &dma_direct_ops); | ||
488 | set_dma_offset(&pdev->dev, pe->tce_bypass_base); | ||
489 | } else { | ||
490 | dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); | ||
491 | set_dma_ops(&pdev->dev, &dma_iommu_ops); | ||
492 | set_iommu_table_base(&pdev->dev, &pe->tce32_table); | ||
493 | } | ||
494 | return 0; | ||
495 | } | ||
496 | |||
466 | static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) | 497 | static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) |
467 | { | 498 | { |
468 | struct pci_dev *dev; | 499 | struct pci_dev *dev; |
@@ -657,6 +688,56 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, | |||
657 | __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); | 688 | __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); |
658 | } | 689 | } |
659 | 690 | ||
691 | static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) | ||
692 | { | ||
693 | struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, | ||
694 | tce32_table); | ||
695 | uint16_t window_id = (pe->pe_number << 1 ) + 1; | ||
696 | int64_t rc; | ||
697 | |||
698 | pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); | ||
699 | if (enable) { | ||
700 | phys_addr_t top = memblock_end_of_DRAM(); | ||
701 | |||
702 | top = roundup_pow_of_two(top); | ||
703 | rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, | ||
704 | pe->pe_number, | ||
705 | window_id, | ||
706 | pe->tce_bypass_base, | ||
707 | top); | ||
708 | } else { | ||
709 | rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, | ||
710 | pe->pe_number, | ||
711 | window_id, | ||
712 | pe->tce_bypass_base, | ||
713 | 0); | ||
714 | |||
715 | /* | ||
716 | * We might want to reset the DMA ops of all devices on | ||
717 | * this PE. However in theory, that shouldn't be necessary | ||
718 | * as this is used for VFIO/KVM pass-through and the device | ||
719 | * hasn't yet been returned to its kernel driver | ||
720 | */ | ||
721 | } | ||
722 | if (rc) | ||
723 | pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); | ||
724 | else | ||
725 | pe->tce_bypass_enabled = enable; | ||
726 | } | ||
727 | |||
728 | static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, | ||
729 | struct pnv_ioda_pe *pe) | ||
730 | { | ||
731 | /* TVE #1 is selected by PCI address bit 59 */ | ||
732 | pe->tce_bypass_base = 1ull << 59; | ||
733 | |||
734 | /* Install set_bypass callback for VFIO */ | ||
735 | pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; | ||
736 | |||
737 | /* Enable bypass by default */ | ||
738 | pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); | ||
739 | } | ||
740 | |||
660 | static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, | 741 | static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, |
661 | struct pnv_ioda_pe *pe) | 742 | struct pnv_ioda_pe *pe) |
662 | { | 743 | { |
@@ -727,6 +808,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, | |||
727 | else | 808 | else |
728 | pnv_ioda_setup_bus_dma(pe, pe->pbus); | 809 | pnv_ioda_setup_bus_dma(pe, pe->pbus); |
729 | 810 | ||
811 | /* Also create a bypass window */ | ||
812 | pnv_pci_ioda2_setup_bypass_pe(phb, pe); | ||
730 | return; | 813 | return; |
731 | fail: | 814 | fail: |
732 | if (pe->tce32_seg >= 0) | 815 | if (pe->tce32_seg >= 0) |
@@ -1286,6 +1369,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, | |||
1286 | 1369 | ||
1287 | /* Setup TCEs */ | 1370 | /* Setup TCEs */ |
1288 | phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; | 1371 | phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; |
1372 | phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; | ||
1289 | 1373 | ||
1290 | /* Setup shutdown function for kexec */ | 1374 | /* Setup shutdown function for kexec */ |
1291 | phb->shutdown = pnv_pci_ioda_shutdown; | 1375 | phb->shutdown = pnv_pci_ioda_shutdown; |
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index b555ebc57ef5..95633d79ef5d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c | |||
@@ -634,6 +634,16 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) | |||
634 | pnv_pci_dma_fallback_setup(hose, pdev); | 634 | pnv_pci_dma_fallback_setup(hose, pdev); |
635 | } | 635 | } |
636 | 636 | ||
637 | int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) | ||
638 | { | ||
639 | struct pci_controller *hose = pci_bus_to_host(pdev->bus); | ||
640 | struct pnv_phb *phb = hose->private_data; | ||
641 | |||
642 | if (phb && phb->dma_set_mask) | ||
643 | return phb->dma_set_mask(phb, pdev, dma_mask); | ||
644 | return __dma_set_mask(&pdev->dev, dma_mask); | ||
645 | } | ||
646 | |||
637 | void pnv_pci_shutdown(void) | 647 | void pnv_pci_shutdown(void) |
638 | { | 648 | { |
639 | struct pci_controller *hose; | 649 | struct pci_controller *hose; |
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 13f1942a9a5f..cde169442775 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h | |||
@@ -54,7 +54,9 @@ struct pnv_ioda_pe { | |||
54 | struct iommu_table tce32_table; | 54 | struct iommu_table tce32_table; |
55 | phys_addr_t tce_inval_reg_phys; | 55 | phys_addr_t tce_inval_reg_phys; |
56 | 56 | ||
57 | /* XXX TODO: Add support for additional 64-bit iommus */ | 57 | /* 64-bit TCE bypass region */ |
58 | bool tce_bypass_enabled; | ||
59 | uint64_t tce_bypass_base; | ||
58 | 60 | ||
59 | /* MSIs. MVE index is identical for for 32 and 64 bit MSI | 61 | /* MSIs. MVE index is identical for for 32 and 64 bit MSI |
60 | * and -1 if not supported. (It's actually identical to the | 62 | * and -1 if not supported. (It's actually identical to the |
@@ -113,6 +115,8 @@ struct pnv_phb { | |||
113 | unsigned int hwirq, unsigned int virq, | 115 | unsigned int hwirq, unsigned int virq, |
114 | unsigned int is_64, struct msi_msg *msg); | 116 | unsigned int is_64, struct msi_msg *msg); |
115 | void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); | 117 | void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); |
118 | int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev, | ||
119 | u64 dma_mask); | ||
116 | void (*fixup_phb)(struct pci_controller *hose); | 120 | void (*fixup_phb)(struct pci_controller *hose); |
117 | u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); | 121 | u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); |
118 | void (*shutdown)(struct pnv_phb *phb); | 122 | void (*shutdown)(struct pnv_phb *phb); |
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index de6819be1f95..0051e108ef0f 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h | |||
@@ -7,12 +7,20 @@ extern void pnv_smp_init(void); | |||
7 | static inline void pnv_smp_init(void) { } | 7 | static inline void pnv_smp_init(void) { } |
8 | #endif | 8 | #endif |
9 | 9 | ||
10 | struct pci_dev; | ||
11 | |||
10 | #ifdef CONFIG_PCI | 12 | #ifdef CONFIG_PCI |
11 | extern void pnv_pci_init(void); | 13 | extern void pnv_pci_init(void); |
12 | extern void pnv_pci_shutdown(void); | 14 | extern void pnv_pci_shutdown(void); |
15 | extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask); | ||
13 | #else | 16 | #else |
14 | static inline void pnv_pci_init(void) { } | 17 | static inline void pnv_pci_init(void) { } |
15 | static inline void pnv_pci_shutdown(void) { } | 18 | static inline void pnv_pci_shutdown(void) { } |
19 | |||
20 | static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) | ||
21 | { | ||
22 | return -ENODEV; | ||
23 | } | ||
16 | #endif | 24 | #endif |
17 | 25 | ||
18 | extern void pnv_lpc_init(void); | 26 | extern void pnv_lpc_init(void); |
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 21166f65c97c..110f4fbd319f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
28 | #include <linux/bug.h> | 28 | #include <linux/bug.h> |
29 | #include <linux/cpuidle.h> | 29 | #include <linux/cpuidle.h> |
30 | #include <linux/pci.h> | ||
30 | 31 | ||
31 | #include <asm/machdep.h> | 32 | #include <asm/machdep.h> |
32 | #include <asm/firmware.h> | 33 | #include <asm/firmware.h> |
@@ -141,6 +142,13 @@ static void pnv_progress(char *s, unsigned short hex) | |||
141 | { | 142 | { |
142 | } | 143 | } |
143 | 144 | ||
145 | static int pnv_dma_set_mask(struct device *dev, u64 dma_mask) | ||
146 | { | ||
147 | if (dev_is_pci(dev)) | ||
148 | return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask); | ||
149 | return __dma_set_mask(dev, dma_mask); | ||
150 | } | ||
151 | |||
144 | static void pnv_shutdown(void) | 152 | static void pnv_shutdown(void) |
145 | { | 153 | { |
146 | /* Let the PCI code clear up IODA tables */ | 154 | /* Let the PCI code clear up IODA tables */ |
@@ -238,6 +246,7 @@ define_machine(powernv) { | |||
238 | .machine_shutdown = pnv_shutdown, | 246 | .machine_shutdown = pnv_shutdown, |
239 | .power_save = powernv_idle, | 247 | .power_save = powernv_idle, |
240 | .calibrate_decr = generic_calibrate_decr, | 248 | .calibrate_decr = generic_calibrate_decr, |
249 | .dma_set_mask = pnv_dma_set_mask, | ||
241 | #ifdef CONFIG_KEXEC | 250 | #ifdef CONFIG_KEXEC |
242 | .kexec_cpu_down = pnv_kexec_cpu_down, | 251 | .kexec_cpu_down = pnv_kexec_cpu_down, |
243 | #endif | 252 | #endif |