aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKeshavamurthy, Anil S <anil.s.keshavamurthy@intel.com>2007-10-21 19:41:49 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-22 11:13:18 -0400
commitba39592764ed20cee09aae5352e603a27bf56b0d (patch)
treeefe7ec88bbd4d6b08b639830352c68411a7ef7fb
parentf8de50eb6b085572ea773f26e066835ea3d3028b (diff)
Intel IOMMU: Intel IOMMU driver
Actual intel IOMMU driver. Hardware spec can be found at: http://www.intel.com/technology/virtualization This driver sets X86_64 'dma_ops', so hook into standard DMA APIs. In this way, PCI driver will get virtual DMA address. This change is transparent to PCI drivers. [akpm@linux-foundation.org: remove unneeded cast] [akpm@linux-foundation.org: build fix] [bunk@stusta.de: fix duplicate CONFIG_DMAR Makefile line] Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> Cc: Andi Kleen <ak@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Muli Ben-Yehuda <muli@il.ibm.com> Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com> Cc: Arjan van de Ven <arjan@infradead.org> Cc: Ashok Raj <ashok.raj@intel.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Christoph Lameter <clameter@sgi.com> Cc: Greg KH <greg@kroah.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/Intel-IOMMU.txt93
-rw-r--r--Documentation/kernel-parameters.txt10
-rw-r--r--arch/x86/kernel/pci-dma_64.c5
-rw-r--r--drivers/pci/Makefile2
-rw-r--r--drivers/pci/intel-iommu.c1957
-rw-r--r--drivers/pci/intel-iommu.h318
-rw-r--r--include/linux/dmar.h22
7 files changed, 2406 insertions, 1 deletions
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 000000000000..cbb4dbaef761
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,93 @@
1Linux IOMMU Support
2===================
3
4The architecture spec can be obtained from the below location.
5
6http://www.intel.com/technology/virtualization/
7
8This guide gives a quick cheat sheet for some basic understanding.
9
10Some Keywords
11
12DMAR - DMA remapping
13DRHD - DMA Engine Reporting Structure
14RMRR - Reserved memory Region Reporting Structure
15ZLR - Zero length reads from PCI devices
16IOVA - IO Virtual address.
17
18Basic stuff
19-----------
20
21ACPI enumerates and lists the different DMA engines in the platform, and
22device scope relationships between PCI devices and which DMA engine controls
23them.
24
25What is RMRR?
26-------------
27
28There are some devices the BIOS controls, e.g. USB devices used to perform
29PS2 emulation. The regions of memory used for these devices are marked
30reserved in the e820 map. When we turn on DMA translation, DMA to those
31regions will fail. Hence BIOS uses RMRR to specify these regions along with
32devices that need to access these regions. OS is expected to setup
33unity mappings for these regions for these devices to access these regions.
34
35How is IOVA generated?
36---------------------
37
38Well-behaved drivers call pci_map_*() before sending a command to a device
39that needs to perform DMA. Once DMA is completed and the mapping is no longer
40required, the driver calls pci_unmap_*() to unmap the region.
41
42The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
43device has its own domain (hence protection). Devices under p2p bridges
44share the virtual address with all devices under the p2p bridge due to
45transaction id aliasing for p2p bridges.
46
47IOVA generation is pretty generic. We used the same technique as vmalloc()
48but these are not global address spaces, but separate for each domain.
49Different DMA engines may support different number of domains.
50
51We also allocate guard pages with each mapping, so we can attempt to catch
52any overflow that might happen.
53
54
55Graphics Problems?
56------------------
57If you encounter issues with graphics devices, you can try adding the
58option intel_iommu=igfx_off to turn off DMA remapping for the graphics engine.
59
60Some exceptions to IOVA
61-----------------------
62Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
63The same is true for peer to peer transactions. Hence we reserve the
64address from PCI MMIO ranges so they are not allocated for IOVA addresses.
65
66Boot Message Sample
67-------------------
68
69Something like this gets printed indicating presence of DMAR tables
70in ACPI.
71
72ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
73
74When DMAR is being processed and initialized by ACPI, prints DMAR locations
75and any RMRR's processed.
76
77ACPI DMAR:Host address width 36
78ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
79ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
80ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
81ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
82ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
83
84When DMAR is enabled for use, you will notice..
85
86PCI-DMA: Using DMAR IOMMU
87
88TBD
89----
90
91- For compatibility testing, could use unity map domain for all devices, just
92 provide a 1-1 for all useful memory under a single domain for all devices.
93- API for paravirt ops for abstracting functionality for VMM folks.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6accd360da73..8157417724a8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -772,6 +772,16 @@ and is between 256 and 4096 characters. It is defined in the file
772 772
773 inttest= [IA64] 773 inttest= [IA64]
774 774
775 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
776 off
777 Disable intel iommu driver.
778 igfx_off [Default Off]
779 By default, gfx is mapped as normal device. If a gfx
780 device has a dedicated DMAR unit, the DMAR unit is
781 bypassed by not enabling DMAR with this option. In
782 this case, gfx device will use physical address for
783 DMA.
784
775 io7= [HW] IO7 for Marvel based alpha systems 785 io7= [HW] IO7 for Marvel based alpha systems
776 See comment before marvel_specify_io7 in 786 See comment before marvel_specify_io7 in
777 arch/alpha/kernel/core_marvel.c. 787 arch/alpha/kernel/core_marvel.c.
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index afaf9f12c032..393e2725a6e3 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/dmar.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/iommu.h> 12#include <asm/iommu.h>
12#include <asm/calgary.h> 13#include <asm/calgary.h>
@@ -305,6 +306,8 @@ void __init pci_iommu_alloc(void)
305 detect_calgary(); 306 detect_calgary();
306#endif 307#endif
307 308
309 detect_intel_iommu();
310
308#ifdef CONFIG_SWIOTLB 311#ifdef CONFIG_SWIOTLB
309 pci_swiotlb_init(); 312 pci_swiotlb_init();
310#endif 313#endif
@@ -316,6 +319,8 @@ static int __init pci_iommu_init(void)
316 calgary_iommu_init(); 319 calgary_iommu_init();
317#endif 320#endif
318 321
322 intel_iommu_init();
323
319#ifdef CONFIG_IOMMU 324#ifdef CONFIG_IOMMU
320 gart_iommu_init(); 325 gart_iommu_init();
321#endif 326#endif
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 836ab2f250d1..555055650733 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -21,7 +21,7 @@ obj-$(CONFIG_PCI_MSI) += msi.o
21obj-$(CONFIG_HT_IRQ) += htirq.o 21obj-$(CONFIG_HT_IRQ) += htirq.o
22 22
23# Build Intel IOMMU support 23# Build Intel IOMMU support
24obj-$(CONFIG_DMAR) += dmar.o 24obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
25 25
26# 26#
27# Some architectures use the generic PCI setup functions 27# Some architectures use the generic PCI setup functions
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
new file mode 100644
index 000000000000..93ed771b3254
--- /dev/null
+++ b/drivers/pci/intel-iommu.c
@@ -0,0 +1,1957 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20 */
21
22#include <linux/init.h>
23#include <linux/bitmap.h>
24#include <linux/slab.h>
25#include <linux/irq.h>
26#include <linux/interrupt.h>
27#include <linux/sysdev.h>
28#include <linux/spinlock.h>
29#include <linux/pci.h>
30#include <linux/dmar.h>
31#include <linux/dma-mapping.h>
32#include <linux/mempool.h>
33#include "iova.h"
34#include "intel-iommu.h"
35#include <asm/proto.h> /* force_iommu in this header in x86-64*/
36#include <asm/cacheflush.h>
37#include <asm/iommu.h>
38#include "pci.h"
39
/*
 * PCI class tests.  pdev is any expression yielding a pointer with a
 * ->class field; it is parenthesized so that arguments such as "p + 1"
 * or "cond ? a : b" expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)

/* Address window reserved up front so it is never handed out as an IOVA */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */

/* Largest address representable with an address width of gaw bits */
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << (gaw)) - 1)
52
53static void domain_remove_dev_info(struct dmar_domain *domain);
54
55static int dmar_disabled;
56static int __initdata dmar_map_gfx = 1;
57
58#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
59static DEFINE_SPINLOCK(device_domain_lock);
60static LIST_HEAD(device_domain_list);
61
62static int __init intel_iommu_setup(char *str)
63{
64 if (!str)
65 return -EINVAL;
66 while (*str) {
67 if (!strncmp(str, "off", 3)) {
68 dmar_disabled = 1;
69 printk(KERN_INFO"Intel-IOMMU: disabled\n");
70 } else if (!strncmp(str, "igfx_off", 8)) {
71 dmar_map_gfx = 0;
72 printk(KERN_INFO
73 "Intel-IOMMU: disable GFX device mapping\n");
74 }
75
76 str += strcspn(str, ",");
77 while (*str == ',')
78 str++;
79 }
80 return 0;
81}
82__setup("intel_iommu=", intel_iommu_setup);
83
84static struct kmem_cache *iommu_domain_cache;
85static struct kmem_cache *iommu_devinfo_cache;
86static struct kmem_cache *iommu_iova_cache;
87
88static inline void *alloc_pgtable_page(void)
89{
90 return (void *)get_zeroed_page(GFP_ATOMIC);
91}
92
93static inline void free_pgtable_page(void *vaddr)
94{
95 free_page((unsigned long)vaddr);
96}
97
98static inline void *alloc_domain_mem(void)
99{
100 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
101}
102
103static inline void free_domain_mem(void *vaddr)
104{
105 kmem_cache_free(iommu_domain_cache, vaddr);
106}
107
108static inline void * alloc_devinfo_mem(void)
109{
110 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
111}
112
113static inline void free_devinfo_mem(void *vaddr)
114{
115 kmem_cache_free(iommu_devinfo_cache, vaddr);
116}
117
118struct iova *alloc_iova_mem(void)
119{
120 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
121}
122
123void free_iova_mem(struct iova *iova)
124{
125 kmem_cache_free(iommu_iova_cache, iova);
126}
127
128static inline void __iommu_flush_cache(
129 struct intel_iommu *iommu, void *addr, int size)
130{
131 if (!ecap_coherent(iommu->ecap))
132 clflush_cache_range(addr, size);
133}
134
135/* Gets context entry for a given bus and devfn */
136static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
137 u8 bus, u8 devfn)
138{
139 struct root_entry *root;
140 struct context_entry *context;
141 unsigned long phy_addr;
142 unsigned long flags;
143
144 spin_lock_irqsave(&iommu->lock, flags);
145 root = &iommu->root_entry[bus];
146 context = get_context_addr_from_root(root);
147 if (!context) {
148 context = (struct context_entry *)alloc_pgtable_page();
149 if (!context) {
150 spin_unlock_irqrestore(&iommu->lock, flags);
151 return NULL;
152 }
153 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
154 phy_addr = virt_to_phys((void *)context);
155 set_root_value(root, phy_addr);
156 set_root_present(root);
157 __iommu_flush_cache(iommu, root, sizeof(*root));
158 }
159 spin_unlock_irqrestore(&iommu->lock, flags);
160 return &context[devfn];
161}
162
163static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
164{
165 struct root_entry *root;
166 struct context_entry *context;
167 int ret;
168 unsigned long flags;
169
170 spin_lock_irqsave(&iommu->lock, flags);
171 root = &iommu->root_entry[bus];
172 context = get_context_addr_from_root(root);
173 if (!context) {
174 ret = 0;
175 goto out;
176 }
177 ret = context_present(context[devfn]);
178out:
179 spin_unlock_irqrestore(&iommu->lock, flags);
180 return ret;
181}
182
183static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
184{
185 struct root_entry *root;
186 struct context_entry *context;
187 unsigned long flags;
188
189 spin_lock_irqsave(&iommu->lock, flags);
190 root = &iommu->root_entry[bus];
191 context = get_context_addr_from_root(root);
192 if (context) {
193 context_clear_entry(context[devfn]);
194 __iommu_flush_cache(iommu, &context[devfn], \
195 sizeof(*context));
196 }
197 spin_unlock_irqrestore(&iommu->lock, flags);
198}
199
200static void free_context_table(struct intel_iommu *iommu)
201{
202 struct root_entry *root;
203 int i;
204 unsigned long flags;
205 struct context_entry *context;
206
207 spin_lock_irqsave(&iommu->lock, flags);
208 if (!iommu->root_entry) {
209 goto out;
210 }
211 for (i = 0; i < ROOT_ENTRY_NR; i++) {
212 root = &iommu->root_entry[i];
213 context = get_context_addr_from_root(root);
214 if (context)
215 free_pgtable_page(context);
216 }
217 free_pgtable_page(iommu->root_entry);
218 iommu->root_entry = NULL;
219out:
220 spin_unlock_irqrestore(&iommu->lock, flags);
221}
222
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/* Number of page-table levels for an agaw value: agaw + 2. */
static inline int agaw_to_level(int agaw)
{
	int levels = agaw + 2;

	return levels;
}

/* Address width in bits for an agaw value: 30 + agaw * LEVEL_STRIDE. */
static inline int agaw_to_width(int agaw)
{
	int extra_bits = agaw * LEVEL_STRIDE;

	return 30 + extra_bits;
}

/* Inverse of agaw_to_width(); the division truncates. */
static inline int width_to_agaw(int width)
{
	int extra_bits = width - 30;

	return extra_bits / LEVEL_STRIDE;
}

/* Bit position at which the index for the given level starts. */
static inline unsigned int level_to_offset_bits(int level)
{
	int above_leaf = (level - 1) * LEVEL_STRIDE;

	return 12 + above_leaf;
}
247
248static inline int address_level_offset(u64 addr, int level)
249{
250 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
251}
252
253static inline u64 level_mask(int level)
254{
255 return ((u64)-1 << level_to_offset_bits(level));
256}
257
258static inline u64 level_size(int level)
259{
260 return ((u64)1 << level_to_offset_bits(level));
261}
262
263static inline u64 align_to_level(u64 addr, int level)
264{
265 return ((addr + level_size(level) - 1) & level_mask(level));
266}
267
268static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
269{
270 int addr_width = agaw_to_width(domain->agaw);
271 struct dma_pte *parent, *pte = NULL;
272 int level = agaw_to_level(domain->agaw);
273 int offset;
274 unsigned long flags;
275
276 BUG_ON(!domain->pgd);
277
278 addr &= (((u64)1) << addr_width) - 1;
279 parent = domain->pgd;
280
281 spin_lock_irqsave(&domain->mapping_lock, flags);
282 while (level > 0) {
283 void *tmp_page;
284
285 offset = address_level_offset(addr, level);
286 pte = &parent[offset];
287 if (level == 1)
288 break;
289
290 if (!dma_pte_present(*pte)) {
291 tmp_page = alloc_pgtable_page();
292
293 if (!tmp_page) {
294 spin_unlock_irqrestore(&domain->mapping_lock,
295 flags);
296 return NULL;
297 }
298 __iommu_flush_cache(domain->iommu, tmp_page,
299 PAGE_SIZE_4K);
300 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
301 /*
302 * high level table always sets r/w, last level page
303 * table control read/write
304 */
305 dma_set_pte_readable(*pte);
306 dma_set_pte_writable(*pte);
307 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
308 }
309 parent = phys_to_virt(dma_pte_addr(*pte));
310 level--;
311 }
312
313 spin_unlock_irqrestore(&domain->mapping_lock, flags);
314 return pte;
315}
316
317/* return address's pte at specific level */
318static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
319 int level)
320{
321 struct dma_pte *parent, *pte = NULL;
322 int total = agaw_to_level(domain->agaw);
323 int offset;
324
325 parent = domain->pgd;
326 while (level <= total) {
327 offset = address_level_offset(addr, total);
328 pte = &parent[offset];
329 if (level == total)
330 return pte;
331
332 if (!dma_pte_present(*pte))
333 break;
334 parent = phys_to_virt(dma_pte_addr(*pte));
335 total--;
336 }
337 return NULL;
338}
339
340/* clear one page's page table */
341static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
342{
343 struct dma_pte *pte = NULL;
344
345 /* get last level pte */
346 pte = dma_addr_level_pte(domain, addr, 1);
347
348 if (pte) {
349 dma_clear_pte(*pte);
350 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
351 }
352}
353
354/* clear last level pte, a tlb flush should be followed */
355static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
356{
357 int addr_width = agaw_to_width(domain->agaw);
358
359 start &= (((u64)1) << addr_width) - 1;
360 end &= (((u64)1) << addr_width) - 1;
361 /* in case it's partial page */
362 start = PAGE_ALIGN_4K(start);
363 end &= PAGE_MASK_4K;
364
365 /* we don't need lock here, nobody else touches the iova range */
366 while (start < end) {
367 dma_pte_clear_one(domain, start);
368 start += PAGE_SIZE_4K;
369 }
370}
371
372/* free page table pages. last level pte should already be cleared */
373static void dma_pte_free_pagetable(struct dmar_domain *domain,
374 u64 start, u64 end)
375{
376 int addr_width = agaw_to_width(domain->agaw);
377 struct dma_pte *pte;
378 int total = agaw_to_level(domain->agaw);
379 int level;
380 u64 tmp;
381
382 start &= (((u64)1) << addr_width) - 1;
383 end &= (((u64)1) << addr_width) - 1;
384
385 /* we don't need lock here, nobody else touches the iova range */
386 level = 2;
387 while (level <= total) {
388 tmp = align_to_level(start, level);
389 if (tmp >= end || (tmp + level_size(level) > end))
390 return;
391
392 while (tmp < end) {
393 pte = dma_addr_level_pte(domain, tmp, level);
394 if (pte) {
395 free_pgtable_page(
396 phys_to_virt(dma_pte_addr(*pte)));
397 dma_clear_pte(*pte);
398 __iommu_flush_cache(domain->iommu,
399 pte, sizeof(*pte));
400 }
401 tmp += level_size(level);
402 }
403 level++;
404 }
405 /* free pgd */
406 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
407 free_pgtable_page(domain->pgd);
408 domain->pgd = NULL;
409 }
410}
411
412/* iommu handling */
413static int iommu_alloc_root_entry(struct intel_iommu *iommu)
414{
415 struct root_entry *root;
416 unsigned long flags;
417
418 root = (struct root_entry *)alloc_pgtable_page();
419 if (!root)
420 return -ENOMEM;
421
422 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
423
424 spin_lock_irqsave(&iommu->lock, flags);
425 iommu->root_entry = root;
426 spin_unlock_irqrestore(&iommu->lock, flags);
427
428 return 0;
429}
430
/*
 * Poll an IOMMU register until a condition holds.
 *
 * @iommu:  struct intel_iommu pointer whose ->reg mapping is read
 * @offset: register offset added to iommu->reg
 * @op:     accessor used for each read (callers pass readl or dmar_readq)
 * @cond:   expression, normally written in terms of @sts, that ends the wait
 * @sts:    lvalue that receives every value read
 *
 * Panics if @cond is still false after DMAR_OPERATION_TIMEOUT jiffies.
 *
 * Wrapped in do { } while (0) so that "IOMMU_WAIT_OP(...);" is a single
 * statement and stays correct inside an unbraced if/else.
 */
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
do {									\
	unsigned long start_time = jiffies;				\
	while (1) {							\
		sts = op((iommu)->reg + (offset));			\
		if (cond)						\
			break;						\
		if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
			panic("DMAR hardware is malfunctioning\n");	\
		cpu_relax();						\
	}								\
} while (0)
443
444static void iommu_set_root_entry(struct intel_iommu *iommu)
445{
446 void *addr;
447 u32 cmd, sts;
448 unsigned long flag;
449
450 addr = iommu->root_entry;
451
452 spin_lock_irqsave(&iommu->register_lock, flag);
453 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
454
455 cmd = iommu->gcmd | DMA_GCMD_SRTP;
456 writel(cmd, iommu->reg + DMAR_GCMD_REG);
457
458 /* Make sure hardware complete it */
459 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
460 readl, (sts & DMA_GSTS_RTPS), sts);
461
462 spin_unlock_irqrestore(&iommu->register_lock, flag);
463}
464
465static void iommu_flush_write_buffer(struct intel_iommu *iommu)
466{
467 u32 val;
468 unsigned long flag;
469
470 if (!cap_rwbf(iommu->cap))
471 return;
472 val = iommu->gcmd | DMA_GCMD_WBF;
473
474 spin_lock_irqsave(&iommu->register_lock, flag);
475 writel(val, iommu->reg + DMAR_GCMD_REG);
476
477 /* Make sure hardware complete it */
478 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
479 readl, (!(val & DMA_GSTS_WBFS)), val);
480
481 spin_unlock_irqrestore(&iommu->register_lock, flag);
482}
483
484/* return value determine if we need a write buffer flush */
485static int __iommu_flush_context(struct intel_iommu *iommu,
486 u16 did, u16 source_id, u8 function_mask, u64 type,
487 int non_present_entry_flush)
488{
489 u64 val = 0;
490 unsigned long flag;
491
492 /*
493 * In the non-present entry flush case, if hardware doesn't cache
494 * non-present entry we do nothing and if hardware cache non-present
495 * entry, we flush entries of domain 0 (the domain id is used to cache
496 * any non-present entries)
497 */
498 if (non_present_entry_flush) {
499 if (!cap_caching_mode(iommu->cap))
500 return 1;
501 else
502 did = 0;
503 }
504
505 switch (type) {
506 case DMA_CCMD_GLOBAL_INVL:
507 val = DMA_CCMD_GLOBAL_INVL;
508 break;
509 case DMA_CCMD_DOMAIN_INVL:
510 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
511 break;
512 case DMA_CCMD_DEVICE_INVL:
513 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
514 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
515 break;
516 default:
517 BUG();
518 }
519 val |= DMA_CCMD_ICC;
520
521 spin_lock_irqsave(&iommu->register_lock, flag);
522 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
523
524 /* Make sure hardware complete it */
525 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
526 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
527
528 spin_unlock_irqrestore(&iommu->register_lock, flag);
529
530 /* flush context entry will implictly flush write buffer */
531 return 0;
532}
533
534static int inline iommu_flush_context_global(struct intel_iommu *iommu,
535 int non_present_entry_flush)
536{
537 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
538 non_present_entry_flush);
539}
540
541static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
542 int non_present_entry_flush)
543{
544 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
545 non_present_entry_flush);
546}
547
548static int inline iommu_flush_context_device(struct intel_iommu *iommu,
549 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
550{
551 return __iommu_flush_context(iommu, did, source_id, function_mask,
552 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
553}
554
555/* return value determine if we need a write buffer flush */
556static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
557 u64 addr, unsigned int size_order, u64 type,
558 int non_present_entry_flush)
559{
560 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
561 u64 val = 0, val_iva = 0;
562 unsigned long flag;
563
564 /*
565 * In the non-present entry flush case, if hardware doesn't cache
566 * non-present entry we do nothing and if hardware cache non-present
567 * entry, we flush entries of domain 0 (the domain id is used to cache
568 * any non-present entries)
569 */
570 if (non_present_entry_flush) {
571 if (!cap_caching_mode(iommu->cap))
572 return 1;
573 else
574 did = 0;
575 }
576
577 switch (type) {
578 case DMA_TLB_GLOBAL_FLUSH:
579 /* global flush doesn't need set IVA_REG */
580 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
581 break;
582 case DMA_TLB_DSI_FLUSH:
583 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
584 break;
585 case DMA_TLB_PSI_FLUSH:
586 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
587 /* Note: always flush non-leaf currently */
588 val_iva = size_order | addr;
589 break;
590 default:
591 BUG();
592 }
593 /* Note: set drain read/write */
594#if 0
595 /*
596 * This is probably to be super secure.. Looks like we can
597 * ignore it without any impact.
598 */
599 if (cap_read_drain(iommu->cap))
600 val |= DMA_TLB_READ_DRAIN;
601#endif
602 if (cap_write_drain(iommu->cap))
603 val |= DMA_TLB_WRITE_DRAIN;
604
605 spin_lock_irqsave(&iommu->register_lock, flag);
606 /* Note: Only uses first TLB reg currently */
607 if (val_iva)
608 dmar_writeq(iommu->reg + tlb_offset, val_iva);
609 dmar_writeq(iommu->reg + tlb_offset + 8, val);
610
611 /* Make sure hardware complete it */
612 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
613 dmar_readq, (!(val & DMA_TLB_IVT)), val);
614
615 spin_unlock_irqrestore(&iommu->register_lock, flag);
616
617 /* check IOTLB invalidation granularity */
618 if (DMA_TLB_IAIG(val) == 0)
619 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
620 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
621 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
622 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
623 /* flush context entry will implictly flush write buffer */
624 return 0;
625}
626
627static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
628 int non_present_entry_flush)
629{
630 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
631 non_present_entry_flush);
632}
633
634static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
635 int non_present_entry_flush)
636{
637 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
638 non_present_entry_flush);
639}
640
/*
 * Count how many low-order bits must be shifted out of both base and
 * (base + size - 1) before the two values become equal.  The result is 0
 * when size is 1.
 */
static int iommu_get_alignment(u64 base, unsigned int size)
{
	u64 last = base + size - 1;
	int bits;

	for (bits = 0; base != last; bits++) {
		base >>= 1;
		last >>= 1;
	}
	return bits;
}
654
655static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
656 u64 addr, unsigned int pages, int non_present_entry_flush)
657{
658 unsigned int align;
659
660 BUG_ON(addr & (~PAGE_MASK_4K));
661 BUG_ON(pages == 0);
662
663 /* Fallback to domain selective flush if no PSI support */
664 if (!cap_pgsel_inv(iommu->cap))
665 return iommu_flush_iotlb_dsi(iommu, did,
666 non_present_entry_flush);
667
668 /*
669 * PSI requires page size to be 2 ^ x, and the base address is naturally
670 * aligned to the size
671 */
672 align = iommu_get_alignment(addr >> PAGE_SHIFT_4K, pages);
673 /* Fallback to domain selective flush if size is too big */
674 if (align > cap_max_amask_val(iommu->cap))
675 return iommu_flush_iotlb_dsi(iommu, did,
676 non_present_entry_flush);
677
678 addr >>= PAGE_SHIFT_4K + align;
679 addr <<= PAGE_SHIFT_4K + align;
680
681 return __iommu_flush_iotlb(iommu, did, addr, align,
682 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
683}
684
685static int iommu_enable_translation(struct intel_iommu *iommu)
686{
687 u32 sts;
688 unsigned long flags;
689
690 spin_lock_irqsave(&iommu->register_lock, flags);
691 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
692
693 /* Make sure hardware complete it */
694 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
695 readl, (sts & DMA_GSTS_TES), sts);
696
697 iommu->gcmd |= DMA_GCMD_TE;
698 spin_unlock_irqrestore(&iommu->register_lock, flags);
699 return 0;
700}
701
702static int iommu_disable_translation(struct intel_iommu *iommu)
703{
704 u32 sts;
705 unsigned long flag;
706
707 spin_lock_irqsave(&iommu->register_lock, flag);
708 iommu->gcmd &= ~DMA_GCMD_TE;
709 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
710
711 /* Make sure hardware complete it */
712 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
713 readl, (!(sts & DMA_GSTS_TES)), sts);
714
715 spin_unlock_irqrestore(&iommu->register_lock, flag);
716 return 0;
717}
718
719static int iommu_init_domains(struct intel_iommu *iommu)
720{
721 unsigned long ndomains;
722 unsigned long nlongs;
723
724 ndomains = cap_ndoms(iommu->cap);
725 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
726 nlongs = BITS_TO_LONGS(ndomains);
727
728 /* TBD: there might be 64K domains,
729 * consider other allocation for future chip
730 */
731 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
732 if (!iommu->domain_ids) {
733 printk(KERN_ERR "Allocating domain id array failed\n");
734 return -ENOMEM;
735 }
736 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
737 GFP_KERNEL);
738 if (!iommu->domains) {
739 printk(KERN_ERR "Allocating domain array failed\n");
740 kfree(iommu->domain_ids);
741 return -ENOMEM;
742 }
743
744 /*
745 * if Caching mode is set, then invalid translations are tagged
746 * with domainid 0. Hence we need to pre-allocate it.
747 */
748 if (cap_caching_mode(iommu->cap))
749 set_bit(0, iommu->domain_ids);
750 return 0;
751}
752
753static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
754{
755 struct intel_iommu *iommu;
756 int ret;
757 int map_size;
758 u32 ver;
759
760 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
761 if (!iommu)
762 return NULL;
763 iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
764 if (!iommu->reg) {
765 printk(KERN_ERR "IOMMU: can't map the region\n");
766 goto error;
767 }
768 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
769 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
770
771 /* the registers might be more than one page */
772 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
773 cap_max_fault_reg_offset(iommu->cap));
774 map_size = PAGE_ALIGN_4K(map_size);
775 if (map_size > PAGE_SIZE_4K) {
776 iounmap(iommu->reg);
777 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
778 if (!iommu->reg) {
779 printk(KERN_ERR "IOMMU: can't map the region\n");
780 goto error;
781 }
782 }
783
784 ver = readl(iommu->reg + DMAR_VER_REG);
785 pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
786 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
787 iommu->cap, iommu->ecap);
788 ret = iommu_init_domains(iommu);
789 if (ret)
790 goto error_unmap;
791 spin_lock_init(&iommu->lock);
792 spin_lock_init(&iommu->register_lock);
793
794 drhd->iommu = iommu;
795 return iommu;
796error_unmap:
797 iounmap(iommu->reg);
798 iommu->reg = 0;
799error:
800 kfree(iommu);
801 return NULL;
802}
803
804static void domain_exit(struct dmar_domain *domain);
805static void free_iommu(struct intel_iommu *iommu)
806{
807 struct dmar_domain *domain;
808 int i;
809
810 if (!iommu)
811 return;
812
813 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
814 for (; i < cap_ndoms(iommu->cap); ) {
815 domain = iommu->domains[i];
816 clear_bit(i, iommu->domain_ids);
817 domain_exit(domain);
818 i = find_next_bit(iommu->domain_ids,
819 cap_ndoms(iommu->cap), i+1);
820 }
821
822 if (iommu->gcmd & DMA_GCMD_TE)
823 iommu_disable_translation(iommu);
824
825 if (iommu->irq) {
826 set_irq_data(iommu->irq, NULL);
827 /* This will mask the irq */
828 free_irq(iommu->irq, iommu);
829 destroy_irq(iommu->irq);
830 }
831
832 kfree(iommu->domains);
833 kfree(iommu->domain_ids);
834
835 /* free context mapping */
836 free_context_table(iommu);
837
838 if (iommu->reg)
839 iounmap(iommu->reg);
840 kfree(iommu);
841}
842
843static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
844{
845 unsigned long num;
846 unsigned long ndomains;
847 struct dmar_domain *domain;
848 unsigned long flags;
849
850 domain = alloc_domain_mem();
851 if (!domain)
852 return NULL;
853
854 ndomains = cap_ndoms(iommu->cap);
855
856 spin_lock_irqsave(&iommu->lock, flags);
857 num = find_first_zero_bit(iommu->domain_ids, ndomains);
858 if (num >= ndomains) {
859 spin_unlock_irqrestore(&iommu->lock, flags);
860 free_domain_mem(domain);
861 printk(KERN_ERR "IOMMU: no free domain ids\n");
862 return NULL;
863 }
864
865 set_bit(num, iommu->domain_ids);
866 domain->id = num;
867 domain->iommu = iommu;
868 iommu->domains[num] = domain;
869 spin_unlock_irqrestore(&iommu->lock, flags);
870
871 return domain;
872}
873
874static void iommu_free_domain(struct dmar_domain *domain)
875{
876 unsigned long flags;
877
878 spin_lock_irqsave(&domain->iommu->lock, flags);
879 clear_bit(domain->id, domain->iommu->domain_ids);
880 spin_unlock_irqrestore(&domain->iommu->lock, flags);
881}
882
883static struct iova_domain reserved_iova_list;
884
885static void dmar_init_reserved_ranges(void)
886{
887 struct pci_dev *pdev = NULL;
888 struct iova *iova;
889 int i;
890 u64 addr, size;
891
892 init_iova_domain(&reserved_iova_list);
893
894 /* IOAPIC ranges shouldn't be accessed by DMA */
895 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
896 IOVA_PFN(IOAPIC_RANGE_END));
897 if (!iova)
898 printk(KERN_ERR "Reserve IOAPIC range failed\n");
899
900 /* Reserve all PCI MMIO to avoid peer-to-peer access */
901 for_each_pci_dev(pdev) {
902 struct resource *r;
903
904 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
905 r = &pdev->resource[i];
906 if (!r->flags || !(r->flags & IORESOURCE_MEM))
907 continue;
908 addr = r->start;
909 addr &= PAGE_MASK_4K;
910 size = r->end - addr;
911 size = PAGE_ALIGN_4K(size);
912 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
913 IOVA_PFN(size + addr) - 1);
914 if (!iova)
915 printk(KERN_ERR "Reserve iova failed\n");
916 }
917 }
918
919}
920
921static void domain_reserve_special_ranges(struct dmar_domain *domain)
922{
923 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
924}
925
/*
 * guestwidth_to_adjustwidth - round a guest address width up to the next
 * value of the form 12 + 9*k, with the result capped at 64.
 *
 * A width that already has that form is returned unchanged.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int width = rem ? gaw + 9 - rem : gaw;

	return width > 64 ? 64 : width;
}
939
940static int domain_init(struct dmar_domain *domain, int guest_width)
941{
942 struct intel_iommu *iommu;
943 int adjust_width, agaw;
944 unsigned long sagaw;
945
946 init_iova_domain(&domain->iovad);
947 spin_lock_init(&domain->mapping_lock);
948
949 domain_reserve_special_ranges(domain);
950
951 /* calculate AGAW */
952 iommu = domain->iommu;
953 if (guest_width > cap_mgaw(iommu->cap))
954 guest_width = cap_mgaw(iommu->cap);
955 domain->gaw = guest_width;
956 adjust_width = guestwidth_to_adjustwidth(guest_width);
957 agaw = width_to_agaw(adjust_width);
958 sagaw = cap_sagaw(iommu->cap);
959 if (!test_bit(agaw, &sagaw)) {
960 /* hardware doesn't support it, choose a bigger one */
961 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
962 agaw = find_next_bit(&sagaw, 5, agaw);
963 if (agaw >= 5)
964 return -ENODEV;
965 }
966 domain->agaw = agaw;
967 INIT_LIST_HEAD(&domain->devices);
968
969 /* always allocate the top pgd */
970 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
971 if (!domain->pgd)
972 return -ENOMEM;
973 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
974 return 0;
975}
976
977static void domain_exit(struct dmar_domain *domain)
978{
979 u64 end;
980
981 /* Domain 0 is reserved, so dont process it */
982 if (!domain)
983 return;
984
985 domain_remove_dev_info(domain);
986 /* destroy iovas */
987 put_iova_domain(&domain->iovad);
988 end = DOMAIN_MAX_ADDR(domain->gaw);
989 end = end & (~PAGE_MASK_4K);
990
991 /* clear ptes */
992 dma_pte_clear_range(domain, 0, end);
993
994 /* free page tables */
995 dma_pte_free_pagetable(domain, 0, end);
996
997 iommu_free_domain(domain);
998 free_domain_mem(domain);
999}
1000
1001static int domain_context_mapping_one(struct dmar_domain *domain,
1002 u8 bus, u8 devfn)
1003{
1004 struct context_entry *context;
1005 struct intel_iommu *iommu = domain->iommu;
1006 unsigned long flags;
1007
1008 pr_debug("Set context mapping for %02x:%02x.%d\n",
1009 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1010 BUG_ON(!domain->pgd);
1011 context = device_to_context_entry(iommu, bus, devfn);
1012 if (!context)
1013 return -ENOMEM;
1014 spin_lock_irqsave(&iommu->lock, flags);
1015 if (context_present(*context)) {
1016 spin_unlock_irqrestore(&iommu->lock, flags);
1017 return 0;
1018 }
1019
1020 context_set_domain_id(*context, domain->id);
1021 context_set_address_width(*context, domain->agaw);
1022 context_set_address_root(*context, virt_to_phys(domain->pgd));
1023 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1024 context_set_fault_enable(*context);
1025 context_set_present(*context);
1026 __iommu_flush_cache(iommu, context, sizeof(*context));
1027
1028 /* it's a non-present to present mapping */
1029 if (iommu_flush_context_device(iommu, domain->id,
1030 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1031 iommu_flush_write_buffer(iommu);
1032 else
1033 iommu_flush_iotlb_dsi(iommu, 0, 0);
1034 spin_unlock_irqrestore(&iommu->lock, flags);
1035 return 0;
1036}
1037
1038static int
1039domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1040{
1041 int ret;
1042 struct pci_dev *tmp, *parent;
1043
1044 ret = domain_context_mapping_one(domain, pdev->bus->number,
1045 pdev->devfn);
1046 if (ret)
1047 return ret;
1048
1049 /* dependent device mapping */
1050 tmp = pci_find_upstream_pcie_bridge(pdev);
1051 if (!tmp)
1052 return 0;
1053 /* Secondary interface's bus number and devfn 0 */
1054 parent = pdev->bus->self;
1055 while (parent != tmp) {
1056 ret = domain_context_mapping_one(domain, parent->bus->number,
1057 parent->devfn);
1058 if (ret)
1059 return ret;
1060 parent = parent->bus->self;
1061 }
1062 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1063 return domain_context_mapping_one(domain,
1064 tmp->subordinate->number, 0);
1065 else /* this is a legacy PCI bridge */
1066 return domain_context_mapping_one(domain,
1067 tmp->bus->number, tmp->devfn);
1068}
1069
1070static int domain_context_mapped(struct dmar_domain *domain,
1071 struct pci_dev *pdev)
1072{
1073 int ret;
1074 struct pci_dev *tmp, *parent;
1075
1076 ret = device_context_mapped(domain->iommu,
1077 pdev->bus->number, pdev->devfn);
1078 if (!ret)
1079 return ret;
1080 /* dependent device mapping */
1081 tmp = pci_find_upstream_pcie_bridge(pdev);
1082 if (!tmp)
1083 return ret;
1084 /* Secondary interface's bus number and devfn 0 */
1085 parent = pdev->bus->self;
1086 while (parent != tmp) {
1087 ret = device_context_mapped(domain->iommu, parent->bus->number,
1088 parent->devfn);
1089 if (!ret)
1090 return ret;
1091 parent = parent->bus->self;
1092 }
1093 if (tmp->is_pcie)
1094 return device_context_mapped(domain->iommu,
1095 tmp->subordinate->number, 0);
1096 else
1097 return device_context_mapped(domain->iommu,
1098 tmp->bus->number, tmp->devfn);
1099}
1100
1101static int
1102domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1103 u64 hpa, size_t size, int prot)
1104{
1105 u64 start_pfn, end_pfn;
1106 struct dma_pte *pte;
1107 int index;
1108
1109 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1110 return -EINVAL;
1111 iova &= PAGE_MASK_4K;
1112 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1113 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1114 index = 0;
1115 while (start_pfn < end_pfn) {
1116 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1117 if (!pte)
1118 return -ENOMEM;
1119 /* We don't need lock here, nobody else
1120 * touches the iova range
1121 */
1122 BUG_ON(dma_pte_addr(*pte));
1123 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1124 dma_set_pte_prot(*pte, prot);
1125 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1126 start_pfn++;
1127 index++;
1128 }
1129 return 0;
1130}
1131
1132static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1133{
1134 clear_context_table(domain->iommu, bus, devfn);
1135 iommu_flush_context_global(domain->iommu, 0);
1136 iommu_flush_iotlb_global(domain->iommu, 0);
1137}
1138
1139static void domain_remove_dev_info(struct dmar_domain *domain)
1140{
1141 struct device_domain_info *info;
1142 unsigned long flags;
1143
1144 spin_lock_irqsave(&device_domain_lock, flags);
1145 while (!list_empty(&domain->devices)) {
1146 info = list_entry(domain->devices.next,
1147 struct device_domain_info, link);
1148 list_del(&info->link);
1149 list_del(&info->global);
1150 if (info->dev)
1151 info->dev->sysdata = NULL;
1152 spin_unlock_irqrestore(&device_domain_lock, flags);
1153
1154 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1155 free_devinfo_mem(info);
1156
1157 spin_lock_irqsave(&device_domain_lock, flags);
1158 }
1159 spin_unlock_irqrestore(&device_domain_lock, flags);
1160}
1161
1162/*
1163 * find_domain
1164 * Note: we use struct pci_dev->sysdata stores the info
1165 */
1166struct dmar_domain *
1167find_domain(struct pci_dev *pdev)
1168{
1169 struct device_domain_info *info;
1170
1171 /* No lock here, assumes no domain exit in normal case */
1172 info = pdev->sysdata;
1173 if (info)
1174 return info->domain;
1175 return NULL;
1176}
1177
1178static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1179 struct pci_dev *dev)
1180{
1181 int index;
1182
1183 while (dev) {
1184 for (index = 0; index < cnt; index ++)
1185 if (dev == devices[index])
1186 return 1;
1187
1188 /* Check our parent */
1189 dev = dev->bus->self;
1190 }
1191
1192 return 0;
1193}
1194
1195static struct dmar_drhd_unit *
1196dmar_find_matched_drhd_unit(struct pci_dev *dev)
1197{
1198 struct dmar_drhd_unit *drhd = NULL;
1199
1200 list_for_each_entry(drhd, &dmar_drhd_units, list) {
1201 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1202 drhd->devices_cnt, dev))
1203 return drhd;
1204 }
1205
1206 return NULL;
1207}
1208
1209/* domain is initialized */
1210static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1211{
1212 struct dmar_domain *domain, *found = NULL;
1213 struct intel_iommu *iommu;
1214 struct dmar_drhd_unit *drhd;
1215 struct device_domain_info *info, *tmp;
1216 struct pci_dev *dev_tmp;
1217 unsigned long flags;
1218 int bus = 0, devfn = 0;
1219
1220 domain = find_domain(pdev);
1221 if (domain)
1222 return domain;
1223
1224 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1225 if (dev_tmp) {
1226 if (dev_tmp->is_pcie) {
1227 bus = dev_tmp->subordinate->number;
1228 devfn = 0;
1229 } else {
1230 bus = dev_tmp->bus->number;
1231 devfn = dev_tmp->devfn;
1232 }
1233 spin_lock_irqsave(&device_domain_lock, flags);
1234 list_for_each_entry(info, &device_domain_list, global) {
1235 if (info->bus == bus && info->devfn == devfn) {
1236 found = info->domain;
1237 break;
1238 }
1239 }
1240 spin_unlock_irqrestore(&device_domain_lock, flags);
1241 /* pcie-pci bridge already has a domain, uses it */
1242 if (found) {
1243 domain = found;
1244 goto found_domain;
1245 }
1246 }
1247
1248 /* Allocate new domain for the device */
1249 drhd = dmar_find_matched_drhd_unit(pdev);
1250 if (!drhd) {
1251 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1252 pci_name(pdev));
1253 return NULL;
1254 }
1255 iommu = drhd->iommu;
1256
1257 domain = iommu_alloc_domain(iommu);
1258 if (!domain)
1259 goto error;
1260
1261 if (domain_init(domain, gaw)) {
1262 domain_exit(domain);
1263 goto error;
1264 }
1265
1266 /* register pcie-to-pci device */
1267 if (dev_tmp) {
1268 info = alloc_devinfo_mem();
1269 if (!info) {
1270 domain_exit(domain);
1271 goto error;
1272 }
1273 info->bus = bus;
1274 info->devfn = devfn;
1275 info->dev = NULL;
1276 info->domain = domain;
1277 /* This domain is shared by devices under p2p bridge */
1278 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1279
1280 /* pcie-to-pci bridge already has a domain, uses it */
1281 found = NULL;
1282 spin_lock_irqsave(&device_domain_lock, flags);
1283 list_for_each_entry(tmp, &device_domain_list, global) {
1284 if (tmp->bus == bus && tmp->devfn == devfn) {
1285 found = tmp->domain;
1286 break;
1287 }
1288 }
1289 if (found) {
1290 free_devinfo_mem(info);
1291 domain_exit(domain);
1292 domain = found;
1293 } else {
1294 list_add(&info->link, &domain->devices);
1295 list_add(&info->global, &device_domain_list);
1296 }
1297 spin_unlock_irqrestore(&device_domain_lock, flags);
1298 }
1299
1300found_domain:
1301 info = alloc_devinfo_mem();
1302 if (!info)
1303 goto error;
1304 info->bus = pdev->bus->number;
1305 info->devfn = pdev->devfn;
1306 info->dev = pdev;
1307 info->domain = domain;
1308 spin_lock_irqsave(&device_domain_lock, flags);
1309 /* somebody is fast */
1310 found = find_domain(pdev);
1311 if (found != NULL) {
1312 spin_unlock_irqrestore(&device_domain_lock, flags);
1313 if (found != domain) {
1314 domain_exit(domain);
1315 domain = found;
1316 }
1317 free_devinfo_mem(info);
1318 return domain;
1319 }
1320 list_add(&info->link, &domain->devices);
1321 list_add(&info->global, &device_domain_list);
1322 pdev->sysdata = info;
1323 spin_unlock_irqrestore(&device_domain_lock, flags);
1324 return domain;
1325error:
1326 /* recheck it here, maybe others set it */
1327 return find_domain(pdev);
1328}
1329
1330static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1331{
1332 struct dmar_domain *domain;
1333 unsigned long size;
1334 u64 base;
1335 int ret;
1336
1337 printk(KERN_INFO
1338 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1339 pci_name(pdev), start, end);
1340 /* page table init */
1341 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1342 if (!domain)
1343 return -ENOMEM;
1344
1345 /* The address might not be aligned */
1346 base = start & PAGE_MASK_4K;
1347 size = end - base;
1348 size = PAGE_ALIGN_4K(size);
1349 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1350 IOVA_PFN(base + size) - 1)) {
1351 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1352 ret = -ENOMEM;
1353 goto error;
1354 }
1355
1356 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1357 size, base, pci_name(pdev));
1358 /*
1359 * RMRR range might have overlap with physical memory range,
1360 * clear it first
1361 */
1362 dma_pte_clear_range(domain, base, base + size);
1363
1364 ret = domain_page_mapping(domain, base, base, size,
1365 DMA_PTE_READ|DMA_PTE_WRITE);
1366 if (ret)
1367 goto error;
1368
1369 /* context entry init */
1370 ret = domain_context_mapping(domain, pdev);
1371 if (!ret)
1372 return 0;
1373error:
1374 domain_exit(domain);
1375 return ret;
1376
1377}
1378
1379static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1380 struct pci_dev *pdev)
1381{
1382 if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1383 return 0;
1384 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1385 rmrr->end_address + 1);
1386}
1387
1388int __init init_dmars(void)
1389{
1390 struct dmar_drhd_unit *drhd;
1391 struct dmar_rmrr_unit *rmrr;
1392 struct pci_dev *pdev;
1393 struct intel_iommu *iommu;
1394 int ret, unit = 0;
1395
1396 /*
1397 * for each drhd
1398 * allocate root
1399 * initialize and program root entry to not present
1400 * endfor
1401 */
1402 for_each_drhd_unit(drhd) {
1403 if (drhd->ignored)
1404 continue;
1405 iommu = alloc_iommu(drhd);
1406 if (!iommu) {
1407 ret = -ENOMEM;
1408 goto error;
1409 }
1410
1411 /*
1412 * TBD:
1413 * we could share the same root & context tables
1414 * amoung all IOMMU's. Need to Split it later.
1415 */
1416 ret = iommu_alloc_root_entry(iommu);
1417 if (ret) {
1418 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1419 goto error;
1420 }
1421 }
1422
1423 /*
1424 * For each rmrr
1425 * for each dev attached to rmrr
1426 * do
1427 * locate drhd for dev, alloc domain for dev
1428 * allocate free domain
1429 * allocate page table entries for rmrr
1430 * if context not allocated for bus
1431 * allocate and init context
1432 * set present in root table for this bus
1433 * init context with domain, translation etc
1434 * endfor
1435 * endfor
1436 */
1437 for_each_rmrr_units(rmrr) {
1438 int i;
1439 for (i = 0; i < rmrr->devices_cnt; i++) {
1440 pdev = rmrr->devices[i];
1441 /* some BIOS lists non-exist devices in DMAR table */
1442 if (!pdev)
1443 continue;
1444 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1445 if (ret)
1446 printk(KERN_ERR
1447 "IOMMU: mapping reserved region failed\n");
1448 }
1449 }
1450
1451 /*
1452 * for each drhd
1453 * enable fault log
1454 * global invalidate context cache
1455 * global invalidate iotlb
1456 * enable translation
1457 */
1458 for_each_drhd_unit(drhd) {
1459 if (drhd->ignored)
1460 continue;
1461 iommu = drhd->iommu;
1462 sprintf (iommu->name, "dmar%d", unit++);
1463
1464 iommu_flush_write_buffer(iommu);
1465
1466 iommu_set_root_entry(iommu);
1467
1468 iommu_flush_context_global(iommu, 0);
1469 iommu_flush_iotlb_global(iommu, 0);
1470
1471 ret = iommu_enable_translation(iommu);
1472 if (ret)
1473 goto error;
1474 }
1475
1476 return 0;
1477error:
1478 for_each_drhd_unit(drhd) {
1479 if (drhd->ignored)
1480 continue;
1481 iommu = drhd->iommu;
1482 free_iommu(iommu);
1483 }
1484 return ret;
1485}
1486
1487static inline u64 aligned_size(u64 host_addr, size_t size)
1488{
1489 u64 addr;
1490 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1491 return PAGE_ALIGN_4K(addr);
1492}
1493
1494struct iova *
1495iommu_alloc_iova(struct dmar_domain *domain, void *host_addr, size_t size,
1496 u64 start, u64 end)
1497{
1498 u64 start_addr;
1499 struct iova *piova;
1500
1501 /* Make sure it's in range */
1502 if ((start > DOMAIN_MAX_ADDR(domain->gaw)) || end < start)
1503 return NULL;
1504
1505 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1506 start_addr = PAGE_ALIGN_4K(start);
1507 size = aligned_size((u64)host_addr, size);
1508 if (!size || (start_addr + size > end))
1509 return NULL;
1510
1511 piova = alloc_iova(&domain->iovad,
1512 size >> PAGE_SHIFT_4K, IOVA_PFN(end));
1513
1514 return piova;
1515}
1516
1517static dma_addr_t __intel_map_single(struct device *dev, void *addr,
1518 size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
1519{
1520 struct dmar_domain *domain;
1521 struct pci_dev *pdev = to_pci_dev(dev);
1522 int ret;
1523 int prot = 0;
1524 struct iova *iova = NULL;
1525 u64 start_addr;
1526
1527 addr = (void *)virt_to_phys(addr);
1528
1529 domain = get_domain_for_dev(pdev,
1530 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1531 if (!domain) {
1532 printk(KERN_ERR
1533 "Allocating domain for %s failed", pci_name(pdev));
1534 return 0;
1535 }
1536
1537 start_addr = IOVA_START_ADDR;
1538
1539 if (pdev->dma_mask <= DMA_32BIT_MASK) {
1540 iova = iommu_alloc_iova(domain, addr, size, start_addr,
1541 pdev->dma_mask);
1542 } else {
1543 /*
1544 * First try to allocate an io virtual address in
1545 * DMA_32BIT_MASK and if that fails then try allocating
1546 * from higer range
1547 */
1548 iova = iommu_alloc_iova(domain, addr, size, start_addr,
1549 DMA_32BIT_MASK);
1550 if (!iova)
1551 iova = iommu_alloc_iova(domain, addr, size, start_addr,
1552 pdev->dma_mask);
1553 }
1554
1555 if (!iova) {
1556 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1557 return 0;
1558 }
1559
1560 /* make sure context mapping is ok */
1561 if (unlikely(!domain_context_mapped(domain, pdev))) {
1562 ret = domain_context_mapping(domain, pdev);
1563 if (ret)
1564 goto error;
1565 }
1566
1567 /*
1568 * Check if DMAR supports zero-length reads on write only
1569 * mappings..
1570 */
1571 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1572 !cap_zlr(domain->iommu->cap))
1573 prot |= DMA_PTE_READ;
1574 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1575 prot |= DMA_PTE_WRITE;
1576 /*
1577 * addr - (addr + size) might be partial page, we should map the whole
1578 * page. Note: if two part of one page are separately mapped, we
1579 * might have two guest_addr mapping to the same host addr, but this
1580 * is not a big problem
1581 */
1582 ret = domain_page_mapping(domain, iova->pfn_lo << PAGE_SHIFT_4K,
1583 ((u64)addr) & PAGE_MASK_4K,
1584 (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K, prot);
1585 if (ret)
1586 goto error;
1587
1588 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1589 pci_name(pdev), size, (u64)addr,
1590 (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
1591 (u64)(iova->pfn_lo << PAGE_SHIFT_4K), dir);
1592
1593 *flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1594 *flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
1595 return (iova->pfn_lo << PAGE_SHIFT_4K) + ((u64)addr & (~PAGE_MASK_4K));
1596error:
1597 __free_iova(&domain->iovad, iova);
1598 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1599 pci_name(pdev), size, (u64)addr, dir);
1600 return 0;
1601}
1602
1603static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1604 size_t size, int dir)
1605{
1606 struct pci_dev *pdev = to_pci_dev(hwdev);
1607 dma_addr_t ret;
1608 struct dmar_domain *domain;
1609 u64 flush_addr;
1610 unsigned int flush_size;
1611
1612 BUG_ON(dir == DMA_NONE);
1613 if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1614 return virt_to_bus(addr);
1615
1616 ret = __intel_map_single(hwdev, addr, size,
1617 dir, &flush_addr, &flush_size);
1618 if (ret) {
1619 domain = find_domain(pdev);
1620 /* it's a non-present to present mapping */
1621 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
1622 flush_addr, flush_size >> PAGE_SHIFT_4K, 1))
1623 iommu_flush_write_buffer(domain->iommu);
1624 }
1625 return ret;
1626}
1627
1628static void __intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1629 size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
1630{
1631 struct dmar_domain *domain;
1632 struct pci_dev *pdev = to_pci_dev(dev);
1633 struct iova *iova;
1634
1635 domain = find_domain(pdev);
1636 BUG_ON(!domain);
1637
1638 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1639 if (!iova) {
1640 *flush_size = 0;
1641 return;
1642 }
1643 pr_debug("Device %s unmapping: %lx@%llx\n",
1644 pci_name(pdev),
1645 (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
1646 (u64)(iova->pfn_lo << PAGE_SHIFT_4K));
1647
1648 *flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1649 *flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
1650 /* clear the whole page, not just dev_addr - (dev_addr + size) */
1651 dma_pte_clear_range(domain, *flush_addr, *flush_addr + *flush_size);
1652 /* free page tables */
1653 dma_pte_free_pagetable(domain, *flush_addr, *flush_addr + *flush_size);
1654 /* free iova */
1655 __free_iova(&domain->iovad, iova);
1656}
1657
1658static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1659 size_t size, int dir)
1660{
1661 struct pci_dev *pdev = to_pci_dev(dev);
1662 struct dmar_domain *domain;
1663 u64 flush_addr;
1664 unsigned int flush_size;
1665
1666 if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1667 return;
1668
1669 domain = find_domain(pdev);
1670 __intel_unmap_single(dev, dev_addr, size,
1671 dir, &flush_addr, &flush_size);
1672 if (flush_size == 0)
1673 return;
1674 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, flush_addr,
1675 flush_size >> PAGE_SHIFT_4K, 0))
1676 iommu_flush_write_buffer(domain->iommu);
1677}
1678
1679static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1680 dma_addr_t *dma_handle, gfp_t flags)
1681{
1682 void *vaddr;
1683 int order;
1684
1685 size = PAGE_ALIGN_4K(size);
1686 order = get_order(size);
1687 flags &= ~(GFP_DMA | GFP_DMA32);
1688
1689 vaddr = (void *)__get_free_pages(flags, order);
1690 if (!vaddr)
1691 return NULL;
1692 memset(vaddr, 0, size);
1693
1694 *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1695 if (*dma_handle)
1696 return vaddr;
1697 free_pages((unsigned long)vaddr, order);
1698 return NULL;
1699}
1700
1701static void intel_free_coherent(struct device *hwdev, size_t size,
1702 void *vaddr, dma_addr_t dma_handle)
1703{
1704 int order;
1705
1706 size = PAGE_ALIGN_4K(size);
1707 order = get_order(size);
1708
1709 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1710 free_pages((unsigned long)vaddr, order);
1711}
1712
1713static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sg,
1714 int nelems, int dir)
1715{
1716 int i;
1717 struct pci_dev *pdev = to_pci_dev(hwdev);
1718 struct dmar_domain *domain;
1719 u64 flush_addr;
1720 unsigned int flush_size;
1721
1722 if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1723 return;
1724
1725 domain = find_domain(pdev);
1726 for (i = 0; i < nelems; i++, sg++)
1727 __intel_unmap_single(hwdev, sg->dma_address,
1728 sg->dma_length, dir, &flush_addr, &flush_size);
1729
1730 if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 0))
1731 iommu_flush_write_buffer(domain->iommu);
1732}
1733
1734#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset)
1735static int intel_nontranslate_map_sg(struct device *hddev,
1736 struct scatterlist *sg, int nelems, int dir)
1737{
1738 int i;
1739
1740 for (i = 0; i < nelems; i++) {
1741 struct scatterlist *s = &sg[i];
1742 BUG_ON(!s->page);
1743 s->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(s));
1744 s->dma_length = s->length;
1745 }
1746 return nelems;
1747}
1748
1749static int intel_map_sg(struct device *hwdev, struct scatterlist *sg,
1750 int nelems, int dir)
1751{
1752 void *addr;
1753 int i;
1754 dma_addr_t dma_handle;
1755 struct pci_dev *pdev = to_pci_dev(hwdev);
1756 struct dmar_domain *domain;
1757 u64 flush_addr;
1758 unsigned int flush_size;
1759
1760 BUG_ON(dir == DMA_NONE);
1761 if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1762 return intel_nontranslate_map_sg(hwdev, sg, nelems, dir);
1763
1764 for (i = 0; i < nelems; i++, sg++) {
1765 addr = SG_ENT_VIRT_ADDRESS(sg);
1766 dma_handle = __intel_map_single(hwdev, addr,
1767 sg->length, dir, &flush_addr, &flush_size);
1768 if (!dma_handle) {
1769 intel_unmap_sg(hwdev, sg - i, i, dir);
1770 sg[0].dma_length = 0;
1771 return 0;
1772 }
1773 sg->dma_address = dma_handle;
1774 sg->dma_length = sg->length;
1775 }
1776
1777 domain = find_domain(pdev);
1778
1779 /* it's a non-present to present mapping */
1780 if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 1))
1781 iommu_flush_write_buffer(domain->iommu);
1782 return nelems;
1783}
1784
1785static struct dma_mapping_ops intel_dma_ops = {
1786 .alloc_coherent = intel_alloc_coherent,
1787 .free_coherent = intel_free_coherent,
1788 .map_single = intel_map_single,
1789 .unmap_single = intel_unmap_single,
1790 .map_sg = intel_map_sg,
1791 .unmap_sg = intel_unmap_sg,
1792};
1793
1794static inline int iommu_domain_cache_init(void)
1795{
1796 int ret = 0;
1797
1798 iommu_domain_cache = kmem_cache_create("iommu_domain",
1799 sizeof(struct dmar_domain),
1800 0,
1801 SLAB_HWCACHE_ALIGN,
1802
1803 NULL);
1804 if (!iommu_domain_cache) {
1805 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
1806 ret = -ENOMEM;
1807 }
1808
1809 return ret;
1810}
1811
1812static inline int iommu_devinfo_cache_init(void)
1813{
1814 int ret = 0;
1815
1816 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
1817 sizeof(struct device_domain_info),
1818 0,
1819 SLAB_HWCACHE_ALIGN,
1820
1821 NULL);
1822 if (!iommu_devinfo_cache) {
1823 printk(KERN_ERR "Couldn't create devinfo cache\n");
1824 ret = -ENOMEM;
1825 }
1826
1827 return ret;
1828}
1829
1830static inline int iommu_iova_cache_init(void)
1831{
1832 int ret = 0;
1833
1834 iommu_iova_cache = kmem_cache_create("iommu_iova",
1835 sizeof(struct iova),
1836 0,
1837 SLAB_HWCACHE_ALIGN,
1838
1839 NULL);
1840 if (!iommu_iova_cache) {
1841 printk(KERN_ERR "Couldn't create iova cache\n");
1842 ret = -ENOMEM;
1843 }
1844
1845 return ret;
1846}
1847
1848static int __init iommu_init_mempool(void)
1849{
1850 int ret;
1851 ret = iommu_iova_cache_init();
1852 if (ret)
1853 return ret;
1854
1855 ret = iommu_domain_cache_init();
1856 if (ret)
1857 goto domain_error;
1858
1859 ret = iommu_devinfo_cache_init();
1860 if (!ret)
1861 return ret;
1862
1863 kmem_cache_destroy(iommu_domain_cache);
1864domain_error:
1865 kmem_cache_destroy(iommu_iova_cache);
1866
1867 return -ENOMEM;
1868}
1869
1870static void __init iommu_exit_mempool(void)
1871{
1872 kmem_cache_destroy(iommu_devinfo_cache);
1873 kmem_cache_destroy(iommu_domain_cache);
1874 kmem_cache_destroy(iommu_iova_cache);
1875
1876}
1877
1878void __init detect_intel_iommu(void)
1879{
1880 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
1881 return;
1882 if (early_dmar_detect()) {
1883 iommu_detected = 1;
1884 }
1885}
1886
1887static void __init init_no_remapping_devices(void)
1888{
1889 struct dmar_drhd_unit *drhd;
1890
1891 for_each_drhd_unit(drhd) {
1892 if (!drhd->include_all) {
1893 int i;
1894 for (i = 0; i < drhd->devices_cnt; i++)
1895 if (drhd->devices[i] != NULL)
1896 break;
1897 /* ignore DMAR unit if no pci devices exist */
1898 if (i == drhd->devices_cnt)
1899 drhd->ignored = 1;
1900 }
1901 }
1902
1903 if (dmar_map_gfx)
1904 return;
1905
1906 for_each_drhd_unit(drhd) {
1907 int i;
1908 if (drhd->ignored || drhd->include_all)
1909 continue;
1910
1911 for (i = 0; i < drhd->devices_cnt; i++)
1912 if (drhd->devices[i] &&
1913 !IS_GFX_DEVICE(drhd->devices[i]))
1914 break;
1915
1916 if (i < drhd->devices_cnt)
1917 continue;
1918
1919 /* bypass IOMMU if it is just for gfx devices */
1920 drhd->ignored = 1;
1921 for (i = 0; i < drhd->devices_cnt; i++) {
1922 if (!drhd->devices[i])
1923 continue;
1924 drhd->devices[i]->sysdata = DUMMY_DEVICE_DOMAIN_INFO;
1925 }
1926 }
1927}
1928
1929int __init intel_iommu_init(void)
1930{
1931 int ret = 0;
1932
1933 if (no_iommu || swiotlb || dmar_disabled)
1934 return -ENODEV;
1935
1936 if (dmar_table_init())
1937 return -ENODEV;
1938
1939 iommu_init_mempool();
1940 dmar_init_reserved_ranges();
1941
1942 init_no_remapping_devices();
1943
1944 ret = init_dmars();
1945 if (ret) {
1946 printk(KERN_ERR "IOMMU: dmar init failed\n");
1947 put_iova_domain(&reserved_iova_list);
1948 iommu_exit_mempool();
1949 return ret;
1950 }
1951 printk(KERN_INFO
1952 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
1953
1954 force_iommu = 1;
1955 dma_ops = &intel_dma_ops;
1956 return 0;
1957}
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
new file mode 100644
index 000000000000..71dda6b56ffa
--- /dev/null
+++ b/drivers/pci/intel-iommu.h
@@ -0,0 +1,318 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
19 */
20
21#ifndef _INTEL_IOMMU_H_
22#define _INTEL_IOMMU_H_
23
24#include <linux/types.h>
25#include <linux/msi.h>
26#include "iova.h"
27#include <linux/io.h>
28
29/*
30 * Intel IOMMU register specification per version 1.0 public spec.
31 */
32
/* Byte offsets of the IOMMU registers from the register base. */
#define DMAR_VER_REG		0x00	/* architecture version of this IOMMU */
#define DMAR_CAP_REG		0x08	/* hardware capabilities */
#define DMAR_ECAP_REG		0x10	/* extended capabilities */
#define DMAR_GCMD_REG		0x18	/* global command */
#define DMAR_GSTS_REG		0x1c	/* global status */
#define DMAR_RTADDR_REG		0x20	/* root entry table address */
#define DMAR_CCMD_REG		0x28	/* context command */
#define DMAR_FSTS_REG		0x34	/* fault status */
#define DMAR_FECTL_REG		0x38	/* fault control */
#define DMAR_FEDATA_REG		0x3c	/* fault event interrupt data */
#define DMAR_FEADDR_REG		0x40	/* fault event interrupt address */
#define DMAR_FEUADDR_REG	0x44	/* fault event upper address */
#define DMAR_AFLOG_REG		0x58	/* advanced fault control */
#define DMAR_PMEN_REG		0x64	/* protected memory region enable */
#define DMAR_PLMBASE_REG	0x68	/* PMRR low base address */
#define DMAR_PLMLIMIT_REG	0x6c	/* PMRR low limit */
#define DMAR_PHMBASE_REG	0x70	/* PMRR high base address */
#define DMAR_PHMLIMIT_REG	0x78	/* PMRR high limit */

#define OFFSET_STRIDE		(9)
53/*
54#define dmar_readl(dmar, reg) readl(dmar + reg)
55#define dmar_readq(dmar, reg) ({ \
56 u32 lo, hi; \
57 lo = readl(dmar + reg); \
58 hi = readl(dmar + reg + 4); \
59 (((u64) hi) << 32) + lo; })
60*/
61static inline u64 dmar_readq(void *addr)
62{
63 u32 lo, hi;
64 lo = readl(addr);
65 hi = readl(addr + 4);
66 return (((u64) hi) << 32) + lo;
67}
68
69static inline void dmar_writeq(void __iomem *addr, u64 val)
70{
71 writel((u32)val, addr);
72 writel((u32)(val >> 32), addr + 4);
73}
74
/* Major version is bits 7:4 of the version value, minor is bits 3:0. */
#define DMAR_VER_MAJOR(v)	(((v) >> 4) & 0xf)
#define DMAR_VER_MINOR(v)	((v) & 0xf)
77
/*
 * Decoding Capability Register
 *
 * Each macro extracts one field from the capability register value @c.
 */
#define cap_read_drain(c)	(((c) >> 55) & 1)
#define cap_write_drain(c)	(((c) >> 54) & 1)
#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
#define cap_pgsel_inv(c)	(((c) >> 39) & 1)

#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
/*
 * cap_super_page_val() yields an rvalue whose address cannot be taken, so
 * the field is copied into a local before it is handed to find_first_bit().
 */
#define cap_super_offset(c) ({						\
	unsigned long __sp = cap_super_page_val(c);			\
	(find_first_bit(&__sp, 4) * OFFSET_STRIDE) + 21; })

#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
#define cap_max_fault_reg_offset(c) \
	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)

#define cap_zlr(c)		(((c) >> 22) & 1)
#define cap_isoch(c)		(((c) >> 23) & 1)
#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
#define cap_caching_mode(c)	(((c) >> 7) & 1)
#define cap_phmr(c)		(((c) >> 6) & 1)
#define cap_plmr(c)		(((c) >> 5) & 1)
#define cap_rwbf(c)		(((c) >> 4) & 1)
#define cap_afl(c)		(((c) >> 3) & 1)
#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
/*
 * Extended Capability Register
 *
 * Each macro extracts one field from the extended capability value @e.
 */

/* bits 31:24, plus one */
#define ecap_niotlb_iunits(e)	((((e) >> 24) & 0xff) + 1)
/* bits 17:8, scaled by 16 */
#define ecap_iotlb_offset(e)	((((e) >> 8) & 0x3ff) * 16)
/* IOTLB offset plus 16 per invalidation unit */
#define ecap_max_iotlb_offset(e) \
	(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
/* bit 0 */
#define ecap_coherent(e)	((e) & 1)
114
115
/*
 * IOTLB_REG
 *
 * Bit and field definitions for the 64-bit IOTLB register value.  Macro
 * arguments are fully parenthesized so that compound expressions such as
 * DMA_TLB_IIRG(a | b) decode the whole value.
 */
#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
#define DMA_TLB_IIRG(type) (((type) >> 60) & 7)
#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32)
#define DMA_TLB_IVT (((u64)1) << 63)
#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
#define DMA_TLB_MAX_SIZE (0x3f)
128
/* GCMD_REG: 32-bit command bits */
#define DMA_GCMD_TE	((u32)1 << 31)
#define DMA_GCMD_SRTP	((u32)1 << 30)
#define DMA_GCMD_SFL	((u32)1 << 29)
#define DMA_GCMD_EAFL	((u32)1 << 28)
#define DMA_GCMD_WBF	((u32)1 << 27)

/* GSTS_REG: 32-bit status bits at the same positions as the commands */
#define DMA_GSTS_TES	((u32)1 << 31)
#define DMA_GSTS_RTPS	((u32)1 << 30)
#define DMA_GSTS_FLS	((u32)1 << 29)
#define DMA_GSTS_AFLS	((u32)1 << 28)
#define DMA_GSTS_WBFS	((u32)1 << 27)
142
/* CCMD_REG: bits and fields of the 64-bit context command value */
#define DMA_CCMD_ICC		((u64)1 << 63)
#define DMA_CCMD_GLOBAL_INVL	((u64)1 << 61)
#define DMA_CCMD_DOMAIN_INVL	((u64)2 << 61)
#define DMA_CCMD_DEVICE_INVL	((u64)3 << 61)
#define DMA_CCMD_FM(m)		((u64)((m) & 0x3) << 32)
#define DMA_CCMD_MASK_NOBIT	0
#define DMA_CCMD_MASK_1BIT	1
#define DMA_CCMD_MASK_2BIT	2
#define DMA_CCMD_MASK_3BIT	3
#define DMA_CCMD_SID(s)		((u64)((s) & 0xffff) << 16)
#define DMA_CCMD_DID(d)		((u64)((d) & 0xffff))
155
/* FECTL_REG */
#define DMA_FECTL_IM (((u32)1) << 31)

/* FSTS_REG */
#define DMA_FSTS_PPF ((u32)2)
#define DMA_FSTS_PFO ((u32)1)
#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)

/*
 * FRCD_REG, 32 bits access
 *
 * Arguments are parenthesized so that compound expressions passed to
 * these accessors are decoded as a whole.
 */
#define DMA_FRCD_F (((u32)1) << 31)
#define dma_frcd_type(d) (((d) >> 30) & 1)
#define dma_frcd_fault_reason(c) ((c) & 0xff)
#define dma_frcd_source_id(c) ((c) & 0xffff)
#define dma_frcd_page_addr(d) ((d) & (((u64)-1) << 12)) /* low 64 bit */
170
171/*
172 * 0: Present
173 * 1-11: Reserved
174 * 12-63: Context Ptr (12 - (haw-1))
175 * 64-127: Reserved
176 */
177struct root_entry {
178 u64 val;
179 u64 rsvd1;
180};
181#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
182static inline bool root_present(struct root_entry *root)
183{
184 return (root->val & 1);
185}
186static inline void set_root_present(struct root_entry *root)
187{
188 root->val |= 1;
189}
190static inline void set_root_value(struct root_entry *root, unsigned long value)
191{
192 root->val |= value & PAGE_MASK_4K;
193}
194
195struct context_entry;
196static inline struct context_entry *
197get_context_addr_from_root(struct root_entry *root)
198{
199 return (struct context_entry *)
200 (root_present(root)?phys_to_virt(
201 root->val & PAGE_MASK_4K):
202 NULL);
203}
204
205/*
206 * low 64 bits:
207 * 0: present
208 * 1: fault processing disable
209 * 2-3: translation type
210 * 12-63: address space root
211 * high 64 bits:
212 * 0-2: address width
213 * 3-6: aval
214 * 8-23: domain id
215 */
216struct context_entry {
217 u64 lo;
218 u64 hi;
219};
/* Field readers; @c is a context entry object, not a pointer. */
#define context_present(c)		((c).lo & 1)
#define context_fault_disable(c)	(((c).lo >> 1) & 1)
#define context_translation_type(c)	(((c).lo >> 2) & 3)
#define context_address_root(c)		((c).lo & PAGE_MASK_4K)
#define context_address_width(c)	((c).hi & 7)
#define context_domain_id(c)		(((c).hi >> 8) & 0xffff)

/*
 * Field writers.  Only the fault-enable and translation-type setters clear
 * their field first; the other setters OR into the entry.
 */
#define context_set_present(c)		do { (c).lo |= 1; } while (0)
/* clear bit 1 (fault processing disable) */
#define context_set_fault_enable(c) \
	do { (c).lo &= ~(u64)2; } while (0)
/* replace bits 3:2 with the low two bits of @val */
#define context_set_translation_type(c, val) \
	do { \
		(c).lo &= ~((u64)3 << 2); \
		(c).lo |= ((val) & 3) << 2; \
	} while (0)
#define CONTEXT_TT_MULTI_LEVEL 0
#define context_set_address_root(c, val) \
	do { (c).lo |= (val) & PAGE_MASK_4K; } while (0)
#define context_set_address_width(c, val) \
	do { (c).hi |= (val) & 7; } while (0)
#define context_set_domain_id(c, val) \
	do { (c).hi |= ((val) & 0xffff) << 8; } while (0)
#define context_clear_entry(c) \
	do { (c).lo = 0; (c).hi = 0; } while (0)
242
243/*
244 * 0: readable
245 * 1: writable
246 * 2-6: reserved
247 * 7: super page
248 * 8-11: available
249 * 12-63: Host physcial address
250 */
251struct dma_pte {
252 u64 val;
253};
/* Helpers below take a dma_pte object (not a pointer) as @p. */
#define dma_clear_pte(p)	do { (p).val = 0; } while (0)

#define DMA_PTE_READ (1)
#define DMA_PTE_WRITE (2)

#define dma_set_pte_readable(p)	do { (p).val |= DMA_PTE_READ; } while (0)
#define dma_set_pte_writable(p)	do { (p).val |= DMA_PTE_WRITE; } while (0)
/* replace the two permission bits with the low two bits of @prot */
#define dma_set_pte_prot(p, prot) \
	do { \
		(p).val = ((p).val & ~(DMA_PTE_READ | DMA_PTE_WRITE)) | \
			  ((prot) & 3); \
	} while (0)
#define dma_pte_addr(p)		((p).val & PAGE_MASK_4K)
/* ORs the masked address in; the existing address bits are not cleared */
#define dma_set_pte_addr(p, addr) \
	do { (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
/* present means readable or writable */
#define dma_pte_present(p) \
	(((p).val & (DMA_PTE_READ | DMA_PTE_WRITE)) != 0)
267
268struct intel_iommu;
269
270struct dmar_domain {
271 int id; /* domain id */
272 struct intel_iommu *iommu; /* back pointer to owning iommu */
273
274 struct list_head devices; /* all devices' list */
275 struct iova_domain iovad; /* iova's that belong to this domain */
276
277 struct dma_pte *pgd; /* virtual address */
278 spinlock_t mapping_lock; /* page table lock */
279 int gaw; /* max guest address width */
280
281 /* adjusted guest address width, 0 is level 2 30-bit */
282 int agaw;
283
284#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
285 int flags;
286};
287
288/* PCI domain-device relationship */
289struct device_domain_info {
290 struct list_head link; /* link to domain siblings */
291 struct list_head global; /* link to global list */
292 u8 bus; /* PCI bus numer */
293 u8 devfn; /* PCI devfn number */
294 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
295 struct dmar_domain *domain; /* pointer to domain */
296};
297
298extern int init_dmars(void);
299
300struct intel_iommu {
301 void __iomem *reg; /* Pointer to hardware regs, virtual addr */
302 u64 cap;
303 u64 ecap;
304 unsigned long *domain_ids; /* bitmap of domains */
305 struct dmar_domain **domains; /* ptr to domains */
306 int seg;
307 u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
308 spinlock_t lock; /* protect context, domain ids */
309 spinlock_t register_lock; /* protect register handling */
310 struct root_entry *root_entry; /* virtual address */
311
312 unsigned int irq;
313 unsigned char name[7]; /* Device Name */
314 struct msi_msg saved_msg;
315 struct sys_device sysdev;
316};
317
318#endif
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 8d3e0e38ca4d..7d683dc8ed1e 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -23,7 +23,14 @@
23 23
24#include <linux/acpi.h> 24#include <linux/acpi.h>
25#include <linux/types.h> 25#include <linux/types.h>
26#include <linux/msi.h>
26 27
28#ifdef CONFIG_DMAR
29struct intel_iommu;
30
31/* Intel IOMMU detection and initialization functions */
32extern void detect_intel_iommu(void);
33extern int intel_iommu_init(void);
27 34
28extern int dmar_table_init(void); 35extern int dmar_table_init(void);
29extern int early_dmar_detect(void); 36extern int early_dmar_detect(void);
@@ -49,4 +56,19 @@ struct dmar_rmrr_unit {
49 int devices_cnt; /* target device count */ 56 int devices_cnt; /* target device count */
50}; 57};
51 58
/* Walk every entry on the dmar_drhd_units list through its 'list' member. */
#define for_each_drhd_unit(unit) \
	list_for_each_entry(unit, &dmar_drhd_units, list)
/* Walk every entry on the dmar_rmrr_units list through its 'list' member. */
#define for_each_rmrr_units(unit) \
	list_for_each_entry(unit, &dmar_rmrr_units, list)
63#else
/* Stub used when CONFIG_DMAR is not defined: does nothing. */
static inline void detect_intel_iommu(void)
{
}
/* Stub used when CONFIG_DMAR is not defined: always returns -ENODEV. */
static inline int intel_iommu_init(void)
{
	const int err = -ENODEV;

	return err;
}
72
73#endif /* !CONFIG_DMAR */
52#endif /* __DMAR_H__ */ 74#endif /* __DMAR_H__ */