Diffstat (limited to 'drivers/iommu')
-rw-r--r--  drivers/iommu/Kconfig            |  110
-rw-r--r--  drivers/iommu/Makefile           |    5
-rw-r--r--  drivers/iommu/amd_iommu.c        | 2810
-rw-r--r--  drivers/iommu/amd_iommu_init.c   | 1574
-rw-r--r--  drivers/iommu/amd_iommu_proto.h  |   54
-rw-r--r--  drivers/iommu/amd_iommu_types.h  |  585
-rw-r--r--  drivers/iommu/dmar.c             | 1461
-rw-r--r--  drivers/iommu/intel-iommu.c      | 4016
-rw-r--r--  drivers/iommu/intr_remapping.c   |  797
-rw-r--r--  drivers/iommu/intr_remapping.h   |   17
-rw-r--r--  drivers/iommu/iommu.c            |  124
-rw-r--r--  drivers/iommu/iova.c             |  435
-rw-r--r--  drivers/iommu/msm_iommu.c        |  731
-rw-r--r--  drivers/iommu/msm_iommu_dev.c    |  422
14 files changed, 13141 insertions(+), 0 deletions(-)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
new file mode 100644
index 000000000000..b57b3fa492f3
--- /dev/null
+++ b/drivers/iommu/Kconfig
@@ -0,0 +1,110 @@
1# IOMMU_API always gets selected by whoever wants it.
2config IOMMU_API
3 bool
4
5menuconfig IOMMU_SUPPORT
6 bool "IOMMU Hardware Support"
7 default y
8 ---help---
9 Say Y here if you want to compile device drivers for IO Memory
10 Management Units into the kernel. These devices usually allow
11 remapping of DMA requests and/or interrupts from other devices
12 on the system.
13
14if IOMMU_SUPPORT
15
16# MSM IOMMU support
17config MSM_IOMMU
18 bool "MSM IOMMU Support"
19 depends on ARCH_MSM8X60 || ARCH_MSM8960
20 select IOMMU_API
21 help
22 Support for the IOMMUs found on certain Qualcomm SOCs.
23 These IOMMUs allow virtualization of the address space used by most
24 cores within the multimedia subsystem.
25
26 If unsure, say N here.
27
28config IOMMU_PGTABLES_L2
29 def_bool y
30 depends on MSM_IOMMU && MMU && SMP && CPU_DCACHE_DISABLE=n
31
32# AMD IOMMU support
33config AMD_IOMMU
34 bool "AMD IOMMU support"
35 select SWIOTLB
36 select PCI_MSI
37 select PCI_IOV
38 select IOMMU_API
39 depends on X86_64 && PCI && ACPI
40 ---help---
41 With this option you can enable support for AMD IOMMU hardware in
42 your system. An IOMMU is a hardware component which provides
43 remapping of DMA memory accesses from devices. With an AMD IOMMU you
44 can isolate the DMA memory of different devices and protect the
45 system from misbehaving device drivers or hardware.
46
47 You can find out whether your system has an AMD IOMMU by looking
48 in your BIOS for an option to enable it, or by checking for an
49 IVRS ACPI table.
50
51config AMD_IOMMU_STATS
52 bool "Export AMD IOMMU statistics to debugfs"
53 depends on AMD_IOMMU
54 select DEBUG_FS
55 ---help---
56 This option enables code in the AMD IOMMU driver to collect various
57 statistics about what is happening in the driver and export that
58 information to userspace via debugfs.
59 If unsure, say N.
60
61# Intel IOMMU support
62config DMAR
63 bool "Support for DMA Remapping Devices"
64 depends on PCI_MSI && ACPI && (X86 || IA64_GENERIC)
65 select IOMMU_API
66 help
67 DMA remapping (DMAR) device support enables independent address
68 translations for Direct Memory Access (DMA) from devices.
69 These DMA remapping devices are reported via ACPI tables
70 and include the PCI device scope covered by these DMA
71 remapping devices.
72
73config DMAR_DEFAULT_ON
74 def_bool y
75 prompt "Enable DMA Remapping Devices by default"
76 depends on DMAR
77 help
78 Selecting this option will enable a DMAR device at boot time if
79 one is found. If this option is not selected, DMAR support can
80 be enabled by passing intel_iommu=on to the kernel.
81
82config DMAR_BROKEN_GFX_WA
83 bool "Workaround broken graphics drivers (going away soon)"
84 depends on DMAR && BROKEN && X86
85 ---help---
86 Current graphics drivers tend to use physical addresses
87 for DMA and avoid using the DMA API. Setting this config
88 option permits the IOMMU driver to set up a unity map for
89 all OS-visible memory. Hence the driver can continue
90 to use physical addresses for DMA, at least until this
91 option is removed in the 2.6.32 kernel.
92
93config DMAR_FLOPPY_WA
94 def_bool y
95 depends on DMAR && X86
96 ---help---
97 Floppy disk drivers are known to bypass DMA API calls,
98 thereby failing to work when an IOMMU is enabled. This
99 workaround will set up a 1:1 mapping for the first
100 16 MiB to make the floppy (an ISA device) work.
101
102config INTR_REMAP
103 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
104 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
105 ---help---
106 Supports interrupt remapping for IO-APIC and MSI devices.
107 Say Y to use x2APIC mode on CPUs which support x2APIC
108 enhancements, or to support platforms whose CPUs have APIC IDs wider than 8 bits.
109
110endif # IOMMU_SUPPORT
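The IOMMU_API symbol above builds the generic IOMMU layer (drivers/iommu/iommu.c, listed in the diffstat) that in-kernel users such as KVM device assignment program against. Below is a minimal sketch of such a consumer; the helper example_map_one_page() is hypothetical, and the signatures of iommu_domain_alloc() and the order-based iommu_map() reflect this kernel generation and have changed in later releases, so treat it as an illustration rather than part of this patch.

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/iommu.h>

/* Hypothetical consumer of the generic IOMMU API (illustration only). */
static int example_map_one_page(struct device *dev, unsigned long iova,
				phys_addr_t paddr)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc();	/* later kernels take a bus argument */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* Map one 4kB page (gfp_order 0), readable and writable by the device. */
	ret = iommu_map(domain, iova, paddr, 0, IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	return 0;

out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}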
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
new file mode 100644
index 000000000000..4d4d77df7cac
--- /dev/null
+++ b/drivers/iommu/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_IOMMU_API) += iommu.o
2obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
3obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
4obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
5obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
new file mode 100644
index 000000000000..748eab063857
--- /dev/null
+++ b/drivers/iommu/amd_iommu.c
@@ -0,0 +1,2810 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/bitmap.h>
23#include <linux/slab.h>
24#include <linux/debugfs.h>
25#include <linux/scatterlist.h>
26#include <linux/dma-mapping.h>
27#include <linux/iommu-helper.h>
28#include <linux/iommu.h>
29#include <linux/delay.h>
30#include <linux/amd-iommu.h>
31#include <asm/proto.h>
32#include <asm/iommu.h>
33#include <asm/gart.h>
34#include <asm/dma.h>
35
36#include "amd_iommu_proto.h"
37#include "amd_iommu_types.h"
38
39#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
40
41#define LOOP_TIMEOUT 100000
42
43static DEFINE_RWLOCK(amd_iommu_devtable_lock);
44
45/* A list of preallocated protection domains */
46static LIST_HEAD(iommu_pd_list);
47static DEFINE_SPINLOCK(iommu_pd_list_lock);
48
49/* List of all available dev_data structures */
50static LIST_HEAD(dev_data_list);
51static DEFINE_SPINLOCK(dev_data_list_lock);
52
53/*
54 * Domain for untranslated devices - only allocated
55 * if iommu=pt passed on kernel cmd line.
56 */
57static struct protection_domain *pt_domain;
58
59static struct iommu_ops amd_iommu_ops;
60
61/*
62 * general struct to manage commands sent to an IOMMU
63 */
64struct iommu_cmd {
65 u32 data[4];
66};
67
68static void update_domain(struct protection_domain *domain);
69
70/****************************************************************************
71 *
72 * Helper functions
73 *
74 ****************************************************************************/
75
76static struct iommu_dev_data *alloc_dev_data(u16 devid)
77{
78 struct iommu_dev_data *dev_data;
79 unsigned long flags;
80
81 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
82 if (!dev_data)
83 return NULL;
84
85 dev_data->devid = devid;
86 atomic_set(&dev_data->bind, 0);
87
88 spin_lock_irqsave(&dev_data_list_lock, flags);
89 list_add_tail(&dev_data->dev_data_list, &dev_data_list);
90 spin_unlock_irqrestore(&dev_data_list_lock, flags);
91
92 return dev_data;
93}
94
95static void free_dev_data(struct iommu_dev_data *dev_data)
96{
97 unsigned long flags;
98
99 spin_lock_irqsave(&dev_data_list_lock, flags);
100 list_del(&dev_data->dev_data_list);
101 spin_unlock_irqrestore(&dev_data_list_lock, flags);
102
103 kfree(dev_data);
104}
105
106static struct iommu_dev_data *search_dev_data(u16 devid)
107{
108 struct iommu_dev_data *dev_data;
109 unsigned long flags;
110
111 spin_lock_irqsave(&dev_data_list_lock, flags);
112 list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
113 if (dev_data->devid == devid)
114 goto out_unlock;
115 }
116
117 dev_data = NULL;
118
119out_unlock:
120 spin_unlock_irqrestore(&dev_data_list_lock, flags);
121
122 return dev_data;
123}
124
125static struct iommu_dev_data *find_dev_data(u16 devid)
126{
127 struct iommu_dev_data *dev_data;
128
129 dev_data = search_dev_data(devid);
130
131 if (dev_data == NULL)
132 dev_data = alloc_dev_data(devid);
133
134 return dev_data;
135}
136
137static inline u16 get_device_id(struct device *dev)
138{
139 struct pci_dev *pdev = to_pci_dev(dev);
140
141 return calc_devid(pdev->bus->number, pdev->devfn);
142}
143
144static struct iommu_dev_data *get_dev_data(struct device *dev)
145{
146 return dev->archdata.iommu;
147}
148
149/*
150 * In this function the list of preallocated protection domains is traversed to
151 * find the domain for a specific device
152 */
153static struct dma_ops_domain *find_protection_domain(u16 devid)
154{
155 struct dma_ops_domain *entry, *ret = NULL;
156 unsigned long flags;
157 u16 alias = amd_iommu_alias_table[devid];
158
159 if (list_empty(&iommu_pd_list))
160 return NULL;
161
162 spin_lock_irqsave(&iommu_pd_list_lock, flags);
163
164 list_for_each_entry(entry, &iommu_pd_list, list) {
165 if (entry->target_dev == devid ||
166 entry->target_dev == alias) {
167 ret = entry;
168 break;
169 }
170 }
171
172 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
173
174 return ret;
175}
176
177/*
178 * This function checks if the driver got a valid device from the caller to
179 * avoid dereferencing invalid pointers.
180 */
181static bool check_device(struct device *dev)
182{
183 u16 devid;
184
185 if (!dev || !dev->dma_mask)
186 return false;
187
188 /* No device or no PCI device */
189 if (dev->bus != &pci_bus_type)
190 return false;
191
192 devid = get_device_id(dev);
193
194 /* Out of our scope? */
195 if (devid > amd_iommu_last_bdf)
196 return false;
197
198 if (amd_iommu_rlookup_table[devid] == NULL)
199 return false;
200
201 return true;
202}
203
204static int iommu_init_device(struct device *dev)
205{
206 struct iommu_dev_data *dev_data;
207 u16 alias;
208
209 if (dev->archdata.iommu)
210 return 0;
211
212 dev_data = find_dev_data(get_device_id(dev));
213 if (!dev_data)
214 return -ENOMEM;
215
216 alias = amd_iommu_alias_table[dev_data->devid];
217 if (alias != dev_data->devid) {
218 struct iommu_dev_data *alias_data;
219
220 alias_data = find_dev_data(alias);
221 if (alias_data == NULL) {
222 pr_err("AMD-Vi: Warning: Unhandled device %s\n",
223 dev_name(dev));
224 free_dev_data(dev_data);
225 return -ENOTSUPP;
226 }
227 dev_data->alias_data = alias_data;
228 }
229
230 dev->archdata.iommu = dev_data;
231
232 return 0;
233}
234
235static void iommu_ignore_device(struct device *dev)
236{
237 u16 devid, alias;
238
239 devid = get_device_id(dev);
240 alias = amd_iommu_alias_table[devid];
241
242 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
243 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
244
245 amd_iommu_rlookup_table[devid] = NULL;
246 amd_iommu_rlookup_table[alias] = NULL;
247}
248
249static void iommu_uninit_device(struct device *dev)
250{
251 /*
252 * Nothing to do here - we keep dev_data around for unplugged devices
253 * and reuse it when the device is re-plugged - not doing so would
254 * introduce a ton of races.
255 */
256}
257
258void __init amd_iommu_uninit_devices(void)
259{
260 struct iommu_dev_data *dev_data, *n;
261 struct pci_dev *pdev = NULL;
262
263 for_each_pci_dev(pdev) {
264
265 if (!check_device(&pdev->dev))
266 continue;
267
268 iommu_uninit_device(&pdev->dev);
269 }
270
271 /* Free all of our dev_data structures */
272 list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list)
273 free_dev_data(dev_data);
274}
275
276int __init amd_iommu_init_devices(void)
277{
278 struct pci_dev *pdev = NULL;
279 int ret = 0;
280
281 for_each_pci_dev(pdev) {
282
283 if (!check_device(&pdev->dev))
284 continue;
285
286 ret = iommu_init_device(&pdev->dev);
287 if (ret == -ENOTSUPP)
288 iommu_ignore_device(&pdev->dev);
289 else if (ret)
290 goto out_free;
291 }
292
293 return 0;
294
295out_free:
296
297 amd_iommu_uninit_devices();
298
299 return ret;
300}
301#ifdef CONFIG_AMD_IOMMU_STATS
302
303/*
304 * Initialization code for statistics collection
305 */
306
307DECLARE_STATS_COUNTER(compl_wait);
308DECLARE_STATS_COUNTER(cnt_map_single);
309DECLARE_STATS_COUNTER(cnt_unmap_single);
310DECLARE_STATS_COUNTER(cnt_map_sg);
311DECLARE_STATS_COUNTER(cnt_unmap_sg);
312DECLARE_STATS_COUNTER(cnt_alloc_coherent);
313DECLARE_STATS_COUNTER(cnt_free_coherent);
314DECLARE_STATS_COUNTER(cross_page);
315DECLARE_STATS_COUNTER(domain_flush_single);
316DECLARE_STATS_COUNTER(domain_flush_all);
317DECLARE_STATS_COUNTER(alloced_io_mem);
318DECLARE_STATS_COUNTER(total_map_requests);
319
320static struct dentry *stats_dir;
321static struct dentry *de_fflush;
322
323static void amd_iommu_stats_add(struct __iommu_counter *cnt)
324{
325 if (stats_dir == NULL)
326 return;
327
328 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
329 &cnt->value);
330}
331
332static void amd_iommu_stats_init(void)
333{
334 stats_dir = debugfs_create_dir("amd-iommu", NULL);
335 if (stats_dir == NULL)
336 return;
337
338 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
339 (u32 *)&amd_iommu_unmap_flush);
340
341 amd_iommu_stats_add(&compl_wait);
342 amd_iommu_stats_add(&cnt_map_single);
343 amd_iommu_stats_add(&cnt_unmap_single);
344 amd_iommu_stats_add(&cnt_map_sg);
345 amd_iommu_stats_add(&cnt_unmap_sg);
346 amd_iommu_stats_add(&cnt_alloc_coherent);
347 amd_iommu_stats_add(&cnt_free_coherent);
348 amd_iommu_stats_add(&cross_page);
349 amd_iommu_stats_add(&domain_flush_single);
350 amd_iommu_stats_add(&domain_flush_all);
351 amd_iommu_stats_add(&alloced_io_mem);
352 amd_iommu_stats_add(&total_map_requests);
353}
354
355#endif
356
357/****************************************************************************
358 *
359 * Interrupt handling functions
360 *
361 ****************************************************************************/
362
363static void dump_dte_entry(u16 devid)
364{
365 int i;
366
367 for (i = 0; i < 8; ++i)
368 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
369 amd_iommu_dev_table[devid].data[i]);
370}
371
372static void dump_command(unsigned long phys_addr)
373{
374 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
375 int i;
376
377 for (i = 0; i < 4; ++i)
378 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
379}
380
381static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
382{
383 u32 *event = __evt;
384 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
385 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
386 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
387 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
388 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
389
390 printk(KERN_ERR "AMD-Vi: Event logged [");
391
392 switch (type) {
393 case EVENT_TYPE_ILL_DEV:
394 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
395 "address=0x%016llx flags=0x%04x]\n",
396 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
397 address, flags);
398 dump_dte_entry(devid);
399 break;
400 case EVENT_TYPE_IO_FAULT:
401 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
402 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
403 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
404 domid, address, flags);
405 break;
406 case EVENT_TYPE_DEV_TAB_ERR:
407 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
408 "address=0x%016llx flags=0x%04x]\n",
409 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
410 address, flags);
411 break;
412 case EVENT_TYPE_PAGE_TAB_ERR:
413 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
414 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
415 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
416 domid, address, flags);
417 break;
418 case EVENT_TYPE_ILL_CMD:
419 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
420 dump_command(address);
421 break;
422 case EVENT_TYPE_CMD_HARD_ERR:
423 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
424 "flags=0x%04x]\n", address, flags);
425 break;
426 case EVENT_TYPE_IOTLB_INV_TO:
427 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
428 "address=0x%016llx]\n",
429 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
430 address);
431 break;
432 case EVENT_TYPE_INV_DEV_REQ:
433 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
434 "address=0x%016llx flags=0x%04x]\n",
435 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
436 address, flags);
437 break;
438 default:
439 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
440 }
441}
442
443static void iommu_poll_events(struct amd_iommu *iommu)
444{
445 u32 head, tail;
446 unsigned long flags;
447
448 spin_lock_irqsave(&iommu->lock, flags);
449
450 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
451 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
452
453 while (head != tail) {
454 iommu_print_event(iommu, iommu->evt_buf + head);
455 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
456 }
457
458 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
459
460 spin_unlock_irqrestore(&iommu->lock, flags);
461}
462
463irqreturn_t amd_iommu_int_thread(int irq, void *data)
464{
465 struct amd_iommu *iommu;
466
467 for_each_iommu(iommu)
468 iommu_poll_events(iommu);
469
470 return IRQ_HANDLED;
471}
472
473irqreturn_t amd_iommu_int_handler(int irq, void *data)
474{
475 return IRQ_WAKE_THREAD;
476}
477
478/****************************************************************************
479 *
480 * IOMMU command queuing functions
481 *
482 ****************************************************************************/
483
484static int wait_on_sem(volatile u64 *sem)
485{
486 int i = 0;
487
488 while (*sem == 0 && i < LOOP_TIMEOUT) {
489 udelay(1);
490 i += 1;
491 }
492
493 if (i == LOOP_TIMEOUT) {
494 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
495 return -EIO;
496 }
497
498 return 0;
499}
500
501static void copy_cmd_to_buffer(struct amd_iommu *iommu,
502 struct iommu_cmd *cmd,
503 u32 tail)
504{
505 u8 *target;
506
507 target = iommu->cmd_buf + tail;
508 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
509
510 /* Copy command to buffer */
511 memcpy(target, cmd, sizeof(*cmd));
512
513 /* Tell the IOMMU about it */
514 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
515}
516
517static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
518{
519 WARN_ON(address & 0x7ULL);
520
521 memset(cmd, 0, sizeof(*cmd));
522 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
523 cmd->data[1] = upper_32_bits(__pa(address));
524 cmd->data[2] = 1;
525 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
526}
527
528static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
529{
530 memset(cmd, 0, sizeof(*cmd));
531 cmd->data[0] = devid;
532 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
533}
534
535static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
536 size_t size, u16 domid, int pde)
537{
538 u64 pages;
539 int s;
540
541 pages = iommu_num_pages(address, size, PAGE_SIZE);
542 s = 0;
543
544 if (pages > 1) {
545 /*
546 * If we have to flush more than one page, flush all
547 * TLB entries for this domain
548 */
549 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
550 s = 1;
551 }
552
553 address &= PAGE_MASK;
554
555 memset(cmd, 0, sizeof(*cmd));
556 cmd->data[1] |= domid;
557 cmd->data[2] = lower_32_bits(address);
558 cmd->data[3] = upper_32_bits(address);
559 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
560 if (s) /* size bit - we flush more than one 4kb page */
561 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
562 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
563 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
564}
565
566static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
567 u64 address, size_t size)
568{
569 u64 pages;
570 int s;
571
572 pages = iommu_num_pages(address, size, PAGE_SIZE);
573 s = 0;
574
575 if (pages > 1) {
576 /*
577 * If we have to flush more than one page, flush all
578 * TLB entries for this domain
579 */
580 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
581 s = 1;
582 }
583
584 address &= PAGE_MASK;
585
586 memset(cmd, 0, sizeof(*cmd));
587 cmd->data[0] = devid;
588 cmd->data[0] |= (qdep & 0xff) << 24;
589 cmd->data[1] = devid;
590 cmd->data[2] = lower_32_bits(address);
591 cmd->data[3] = upper_32_bits(address);
592 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
593 if (s)
594 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
595}
596
597static void build_inv_all(struct iommu_cmd *cmd)
598{
599 memset(cmd, 0, sizeof(*cmd));
600 CMD_SET_TYPE(cmd, CMD_INV_ALL);
601}
602
603/*
604 * Writes the command to the IOMMU's command buffer and informs the
605 * hardware about the new command.
606 */
607static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
608{
609 u32 left, tail, head, next_tail;
610 unsigned long flags;
611
612 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
613
614again:
615 spin_lock_irqsave(&iommu->lock, flags);
616
617 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
618 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
619 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
620 left = (head - next_tail) % iommu->cmd_buf_size;
621
622 if (left <= 2) {
623 struct iommu_cmd sync_cmd;
624 volatile u64 sem = 0;
625 int ret;
626
627 build_completion_wait(&sync_cmd, (u64)&sem);
628 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
629
630 spin_unlock_irqrestore(&iommu->lock, flags);
631
632 if ((ret = wait_on_sem(&sem)) != 0)
633 return ret;
634
635 goto again;
636 }
637
638 copy_cmd_to_buffer(iommu, cmd, tail);
639
640 /* We need to sync now to make sure all commands are processed */
641 iommu->need_sync = true;
642
643 spin_unlock_irqrestore(&iommu->lock, flags);
644
645 return 0;
646}
647
648/*
649 * This function queues a completion wait command into the command
650 * buffer of an IOMMU
651 */
652static int iommu_completion_wait(struct amd_iommu *iommu)
653{
654 struct iommu_cmd cmd;
655 volatile u64 sem = 0;
656 int ret;
657
658 if (!iommu->need_sync)
659 return 0;
660
661 build_completion_wait(&cmd, (u64)&sem);
662
663 ret = iommu_queue_command(iommu, &cmd);
664 if (ret)
665 return ret;
666
667 return wait_on_sem(&sem);
668}
669
670static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
671{
672 struct iommu_cmd cmd;
673
674 build_inv_dte(&cmd, devid);
675
676 return iommu_queue_command(iommu, &cmd);
677}
678
679static void iommu_flush_dte_all(struct amd_iommu *iommu)
680{
681 u32 devid;
682
683 for (devid = 0; devid <= 0xffff; ++devid)
684 iommu_flush_dte(iommu, devid);
685
686 iommu_completion_wait(iommu);
687}
688
689/*
690 * This function uses heavy locking and may disable irqs for some time. But
691 * this is no issue because it is only called during resume.
692 */
693static void iommu_flush_tlb_all(struct amd_iommu *iommu)
694{
695 u32 dom_id;
696
697 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
698 struct iommu_cmd cmd;
699 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
700 dom_id, 1);
701 iommu_queue_command(iommu, &cmd);
702 }
703
704 iommu_completion_wait(iommu);
705}
706
707static void iommu_flush_all(struct amd_iommu *iommu)
708{
709 struct iommu_cmd cmd;
710
711 build_inv_all(&cmd);
712
713 iommu_queue_command(iommu, &cmd);
714 iommu_completion_wait(iommu);
715}
716
717void iommu_flush_all_caches(struct amd_iommu *iommu)
718{
719 if (iommu_feature(iommu, FEATURE_IA)) {
720 iommu_flush_all(iommu);
721 } else {
722 iommu_flush_dte_all(iommu);
723 iommu_flush_tlb_all(iommu);
724 }
725}
726
727/*
728 * Command send function for flushing on-device TLB
729 */
730static int device_flush_iotlb(struct iommu_dev_data *dev_data,
731 u64 address, size_t size)
732{
733 struct amd_iommu *iommu;
734 struct iommu_cmd cmd;
735 int qdep;
736
737 qdep = dev_data->ats.qdep;
738 iommu = amd_iommu_rlookup_table[dev_data->devid];
739
740 build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
741
742 return iommu_queue_command(iommu, &cmd);
743}
744
745/*
746 * Command send function for invalidating a device table entry
747 */
748static int device_flush_dte(struct iommu_dev_data *dev_data)
749{
750 struct amd_iommu *iommu;
751 int ret;
752
753 iommu = amd_iommu_rlookup_table[dev_data->devid];
754
755 ret = iommu_flush_dte(iommu, dev_data->devid);
756 if (ret)
757 return ret;
758
759 if (dev_data->ats.enabled)
760 ret = device_flush_iotlb(dev_data, 0, ~0UL);
761
762 return ret;
763}
764
765/*
766 * TLB invalidation function which is called from the mapping functions.
767 * It invalidates a single PTE if the range to flush is within a single
768 * page. Otherwise it flushes the whole TLB of the IOMMU.
769 */
770static void __domain_flush_pages(struct protection_domain *domain,
771 u64 address, size_t size, int pde)
772{
773 struct iommu_dev_data *dev_data;
774 struct iommu_cmd cmd;
775 int ret = 0, i;
776
777 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
778
779 for (i = 0; i < amd_iommus_present; ++i) {
780 if (!domain->dev_iommu[i])
781 continue;
782
783 /*
784 * Devices of this domain are behind this IOMMU
785 * We need a TLB flush
786 */
787 ret |= iommu_queue_command(amd_iommus[i], &cmd);
788 }
789
790 list_for_each_entry(dev_data, &domain->dev_list, list) {
791
792 if (!dev_data->ats.enabled)
793 continue;
794
795 ret |= device_flush_iotlb(dev_data, address, size);
796 }
797
798 WARN_ON(ret);
799}
800
801static void domain_flush_pages(struct protection_domain *domain,
802 u64 address, size_t size)
803{
804 __domain_flush_pages(domain, address, size, 0);
805}
806
807/* Flush the whole IO/TLB for a given protection domain */
808static void domain_flush_tlb(struct protection_domain *domain)
809{
810 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
811}
812
813/* Flush the whole IO/TLB for a given protection domain - including PDE */
814static void domain_flush_tlb_pde(struct protection_domain *domain)
815{
816 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
817}
818
819static void domain_flush_complete(struct protection_domain *domain)
820{
821 int i;
822
823 for (i = 0; i < amd_iommus_present; ++i) {
824 if (!domain->dev_iommu[i])
825 continue;
826
827 /*
828 * Devices of this domain are behind this IOMMU
829 * We need to wait for completion of all commands.
830 */
831 iommu_completion_wait(amd_iommus[i]);
832 }
833}
834
835
836/*
837 * This function flushes the DTEs for all devices in domain
838 */
839static void domain_flush_devices(struct protection_domain *domain)
840{
841 struct iommu_dev_data *dev_data;
842 unsigned long flags;
843
844 spin_lock_irqsave(&domain->lock, flags);
845
846 list_for_each_entry(dev_data, &domain->dev_list, list)
847 device_flush_dte(dev_data);
848
849 spin_unlock_irqrestore(&domain->lock, flags);
850}
851
852/****************************************************************************
853 *
854 * The functions below are used to create the page table mappings for
855 * unity mapped regions.
856 *
857 ****************************************************************************/
858
859/*
860 * This function is used to add another level to an IO page table. Adding
861 * another level increases the size of the address space by 9 bits, up
862 * to a maximum of 64 bits.
863 */
864static bool increase_address_space(struct protection_domain *domain,
865 gfp_t gfp)
866{
867 u64 *pte;
868
869 if (domain->mode == PAGE_MODE_6_LEVEL)
870 /* address space already 64 bit large */
871 return false;
872
873 pte = (void *)get_zeroed_page(gfp);
874 if (!pte)
875 return false;
876
877 *pte = PM_LEVEL_PDE(domain->mode,
878 virt_to_phys(domain->pt_root));
879 domain->pt_root = pte;
880 domain->mode += 1;
881 domain->updated = true;
882
883 return true;
884}
885
886static u64 *alloc_pte(struct protection_domain *domain,
887 unsigned long address,
888 unsigned long page_size,
889 u64 **pte_page,
890 gfp_t gfp)
891{
892 int level, end_lvl;
893 u64 *pte, *page;
894
895 BUG_ON(!is_power_of_2(page_size));
896
897 while (address > PM_LEVEL_SIZE(domain->mode))
898 increase_address_space(domain, gfp);
899
900 level = domain->mode - 1;
901 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
902 address = PAGE_SIZE_ALIGN(address, page_size);
903 end_lvl = PAGE_SIZE_LEVEL(page_size);
904
905 while (level > end_lvl) {
906 if (!IOMMU_PTE_PRESENT(*pte)) {
907 page = (u64 *)get_zeroed_page(gfp);
908 if (!page)
909 return NULL;
910 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
911 }
912
913 /* No level skipping support yet */
914 if (PM_PTE_LEVEL(*pte) != level)
915 return NULL;
916
917 level -= 1;
918
919 pte = IOMMU_PTE_PAGE(*pte);
920
921 if (pte_page && level == end_lvl)
922 *pte_page = pte;
923
924 pte = &pte[PM_LEVEL_INDEX(level, address)];
925 }
926
927 return pte;
928}
929
930/*
931 * This function checks if there is a PTE for a given dma address. If
932 * there is one, it returns the pointer to it.
933 */
934static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
935{
936 int level;
937 u64 *pte;
938
939 if (address > PM_LEVEL_SIZE(domain->mode))
940 return NULL;
941
942 level = domain->mode - 1;
943 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
944
945 while (level > 0) {
946
947 /* Not Present */
948 if (!IOMMU_PTE_PRESENT(*pte))
949 return NULL;
950
951 /* Large PTE */
952 if (PM_PTE_LEVEL(*pte) == 0x07) {
953 unsigned long pte_mask, __pte;
954
955 /*
956 * If we have a series of large PTEs, make
957 * sure to return a pointer to the first one.
958 */
959 pte_mask = PTE_PAGE_SIZE(*pte);
960 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
961 __pte = ((unsigned long)pte) & pte_mask;
962
963 return (u64 *)__pte;
964 }
965
966 /* No level skipping support yet */
967 if (PM_PTE_LEVEL(*pte) != level)
968 return NULL;
969
970 level -= 1;
971
972 /* Walk to the next level */
973 pte = IOMMU_PTE_PAGE(*pte);
974 pte = &pte[PM_LEVEL_INDEX(level, address)];
975 }
976
977 return pte;
978}
979
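/*
 * A minimal sketch (not part of this driver) of the per-level index math
 * that alloc_pte() and fetch_pte() above rely on. It assumes that
 * PM_LEVEL_INDEX() from amd_iommu_types.h selects 9 translation bits per
 * level above the 4kB page offset, i.e. level 0 covers address bits
 * 12..20, level 1 bits 21..29, level 2 bits 30..38, and so on up to the
 * six levels bounded by PAGE_MODE_6_LEVEL.
 */
static inline unsigned long example_level_index(int level, unsigned long address)
{
	return (address >> (12 + 9 * level)) & 0x1ffUL;
}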
980/*
981 * Generic mapping function. It maps a physical address into a DMA
982 * address space. It allocates the page table pages if necessary.
983 * In the future it can be extended to a generic mapping function
984 * supporting all features of AMD IOMMU page tables like level skipping
985 * and full 64 bit address spaces.
986 */
987static int iommu_map_page(struct protection_domain *dom,
988 unsigned long bus_addr,
989 unsigned long phys_addr,
990 int prot,
991 unsigned long page_size)
992{
993 u64 __pte, *pte;
994 int i, count;
995
996 if (!(prot & IOMMU_PROT_MASK))
997 return -EINVAL;
998
999 bus_addr = PAGE_ALIGN(bus_addr);
1000 phys_addr = PAGE_ALIGN(phys_addr);
1001 count = PAGE_SIZE_PTE_COUNT(page_size);
1002 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
1003
1004 for (i = 0; i < count; ++i)
1005 if (IOMMU_PTE_PRESENT(pte[i]))
1006 return -EBUSY;
1007
1008 if (page_size > PAGE_SIZE) {
1009 __pte = PAGE_SIZE_PTE(phys_addr, page_size);
1010 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
1011 } else
1012 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
1013
1014 if (prot & IOMMU_PROT_IR)
1015 __pte |= IOMMU_PTE_IR;
1016 if (prot & IOMMU_PROT_IW)
1017 __pte |= IOMMU_PTE_IW;
1018
1019 for (i = 0; i < count; ++i)
1020 pte[i] = __pte;
1021
1022 update_domain(dom);
1023
1024 return 0;
1025}
1026
1027static unsigned long iommu_unmap_page(struct protection_domain *dom,
1028 unsigned long bus_addr,
1029 unsigned long page_size)
1030{
1031 unsigned long long unmap_size, unmapped;
1032 u64 *pte;
1033
1034 BUG_ON(!is_power_of_2(page_size));
1035
1036 unmapped = 0;
1037
1038 while (unmapped < page_size) {
1039
1040 pte = fetch_pte(dom, bus_addr);
1041
1042 if (!pte) {
1043 /*
1044 * No PTE for this address
1045 * move forward in 4kb steps
1046 */
1047 unmap_size = PAGE_SIZE;
1048 } else if (PM_PTE_LEVEL(*pte) == 0) {
1049 /* 4kb PTE found for this address */
1050 unmap_size = PAGE_SIZE;
1051 *pte = 0ULL;
1052 } else {
1053 int count, i;
1054
1055 /* Large PTE found which maps this address */
1056 unmap_size = PTE_PAGE_SIZE(*pte);
1057 count = PAGE_SIZE_PTE_COUNT(unmap_size);
1058 for (i = 0; i < count; i++)
1059 pte[i] = 0ULL;
1060 }
1061
1062 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
1063 unmapped += unmap_size;
1064 }
1065
1066 BUG_ON(!is_power_of_2(unmapped));
1067
1068 return unmapped;
1069}
1070
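/*
 * A minimal usage sketch (not part of this driver): mapping and unmapping
 * a single 2 MiB large page with the two helpers above. iommu_map_page()
 * writes PAGE_SIZE_PTE_COUNT(page_size) identical large PTEs and
 * iommu_unmap_page() clears them again. example_map_2mb() and the chosen
 * page size are illustrative assumptions only; bus_addr and phys_addr are
 * assumed to be 2 MiB aligned.
 */
static int example_map_2mb(struct protection_domain *domain,
			   unsigned long bus_addr, unsigned long phys_addr)
{
	const unsigned long page_size = 2UL * 1024 * 1024;
	int ret;

	ret = iommu_map_page(domain, bus_addr, phys_addr,
			     IOMMU_PROT_IR | IOMMU_PROT_IW, page_size);
	if (ret)
		return ret;

	/* ... the device can now DMA to/from bus_addr ... */

	iommu_unmap_page(domain, bus_addr, page_size);

	return 0;
}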
1071/*
1072 * This function checks if a specific unity mapping entry is needed for
1073 * this specific IOMMU.
1074 */
1075static int iommu_for_unity_map(struct amd_iommu *iommu,
1076 struct unity_map_entry *entry)
1077{
1078 u16 bdf, i;
1079
1080 for (i = entry->devid_start; i <= entry->devid_end; ++i) {
1081 bdf = amd_iommu_alias_table[i];
1082 if (amd_iommu_rlookup_table[bdf] == iommu)
1083 return 1;
1084 }
1085
1086 return 0;
1087}
1088
1089/*
1090 * This function actually applies the mapping to the page table of the
1091 * dma_ops domain.
1092 */
1093static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
1094 struct unity_map_entry *e)
1095{
1096 u64 addr;
1097 int ret;
1098
1099 for (addr = e->address_start; addr < e->address_end;
1100 addr += PAGE_SIZE) {
1101 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
1102 PAGE_SIZE);
1103 if (ret)
1104 return ret;
1105 /*
1106 * if unity mapping is in aperture range mark the page
1107 * as allocated in the aperture
1108 */
1109 if (addr < dma_dom->aperture_size)
1110 __set_bit(addr >> PAGE_SHIFT,
1111 dma_dom->aperture[0]->bitmap);
1112 }
1113
1114 return 0;
1115}
1116
1117/*
1118 * Init the unity mappings for a specific IOMMU in the system
1119 *
1120 * Basically iterates over all unity mapping entries and applies them to
1121 * the default domain DMA of that IOMMU if necessary.
1122 */
1123static int iommu_init_unity_mappings(struct amd_iommu *iommu)
1124{
1125 struct unity_map_entry *entry;
1126 int ret;
1127
1128 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
1129 if (!iommu_for_unity_map(iommu, entry))
1130 continue;
1131 ret = dma_ops_unity_map(iommu->default_dom, entry);
1132 if (ret)
1133 return ret;
1134 }
1135
1136 return 0;
1137}
1138
1139/*
1140 * Inits the unity mappings required for a specific device
1141 */
1142static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
1143 u16 devid)
1144{
1145 struct unity_map_entry *e;
1146 int ret;
1147
1148 list_for_each_entry(e, &amd_iommu_unity_map, list) {
1149 if (!(devid >= e->devid_start && devid <= e->devid_end))
1150 continue;
1151 ret = dma_ops_unity_map(dma_dom, e);
1152 if (ret)
1153 return ret;
1154 }
1155
1156 return 0;
1157}
1158
1159/****************************************************************************
1160 *
1161 * The next functions belong to the address allocator for the dma_ops
1162 * interface functions. They work like the allocators in the other IOMMU
1163 * drivers. It's basically a bitmap which marks the allocated pages in
1164 * the aperture. Maybe it could be enhanced in the future to a more
1165 * efficient allocator.
1166 *
1167 ****************************************************************************/
1168
1169/*
1170 * The address allocator core functions.
1171 *
1172 * called with domain->lock held
1173 */
1174
1175/*
1176 * Used to reserve address ranges in the aperture (e.g. for exclusion
1177 * ranges).
1178 */
1179static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1180 unsigned long start_page,
1181 unsigned int pages)
1182{
1183 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1184
1185 if (start_page + pages > last_page)
1186 pages = last_page - start_page;
1187
1188 for (i = start_page; i < start_page + pages; ++i) {
1189 int index = i / APERTURE_RANGE_PAGES;
1190 int page = i % APERTURE_RANGE_PAGES;
1191 __set_bit(page, dom->aperture[index]->bitmap);
1192 }
1193}
1194
1195/*
1196 * This function is used to add a new aperture range to an existing
1197 * aperture in case of dma_ops domain allocation or address allocation
1198 * failure.
1199 */
1200static int alloc_new_range(struct dma_ops_domain *dma_dom,
1201 bool populate, gfp_t gfp)
1202{
1203 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
1204 struct amd_iommu *iommu;
1205 unsigned long i;
1206
1207#ifdef CONFIG_IOMMU_STRESS
1208 populate = false;
1209#endif
1210
1211 if (index >= APERTURE_MAX_RANGES)
1212 return -ENOMEM;
1213
1214 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
1215 if (!dma_dom->aperture[index])
1216 return -ENOMEM;
1217
1218 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
1219 if (!dma_dom->aperture[index]->bitmap)
1220 goto out_free;
1221
1222 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
1223
1224 if (populate) {
1225 unsigned long address = dma_dom->aperture_size;
1226 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
1227 u64 *pte, *pte_page;
1228
1229 for (i = 0; i < num_ptes; ++i) {
1230 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1231 &pte_page, gfp);
1232 if (!pte)
1233 goto out_free;
1234
1235 dma_dom->aperture[index]->pte_pages[i] = pte_page;
1236
1237 address += APERTURE_RANGE_SIZE / 64;
1238 }
1239 }
1240
1241 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1242
1243 /* Initialize the exclusion range if necessary */
1244 for_each_iommu(iommu) {
1245 if (iommu->exclusion_start &&
1246 iommu->exclusion_start >= dma_dom->aperture[index]->offset
1247 && iommu->exclusion_start < dma_dom->aperture_size) {
1248 unsigned long startpage;
1249 int pages = iommu_num_pages(iommu->exclusion_start,
1250 iommu->exclusion_length,
1251 PAGE_SIZE);
1252 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1253 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1254 }
1255 }
1256
1257 /*
1258 * Check for areas already mapped as present in the new aperture
1259 * range and mark those pages as reserved in the allocator. Such
1260 * mappings may already exist as a result of requested unity
1261 * mappings for devices.
1262 */
1263 for (i = dma_dom->aperture[index]->offset;
1264 i < dma_dom->aperture_size;
1265 i += PAGE_SIZE) {
1266 u64 *pte = fetch_pte(&dma_dom->domain, i);
1267 if (!pte || !IOMMU_PTE_PRESENT(*pte))
1268 continue;
1269
1270 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
1271 }
1272
1273 update_domain(&dma_dom->domain);
1274
1275 return 0;
1276
1277out_free:
1278 update_domain(&dma_dom->domain);
1279
1280 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
1281
1282 kfree(dma_dom->aperture[index]);
1283 dma_dom->aperture[index] = NULL;
1284
1285 return -ENOMEM;
1286}
1287
1288static unsigned long dma_ops_area_alloc(struct device *dev,
1289 struct dma_ops_domain *dom,
1290 unsigned int pages,
1291 unsigned long align_mask,
1292 u64 dma_mask,
1293 unsigned long start)
1294{
1295 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
1296 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
1297 int i = start >> APERTURE_RANGE_SHIFT;
1298 unsigned long boundary_size;
1299 unsigned long address = -1;
1300 unsigned long limit;
1301
1302 next_bit >>= PAGE_SHIFT;
1303
1304 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
1305 PAGE_SIZE) >> PAGE_SHIFT;
1306
1307 for (;i < max_index; ++i) {
1308 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
1309
1310 if (dom->aperture[i]->offset >= dma_mask)
1311 break;
1312
1313 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
1314 dma_mask >> PAGE_SHIFT);
1315
1316 address = iommu_area_alloc(dom->aperture[i]->bitmap,
1317 limit, next_bit, pages, 0,
1318 boundary_size, align_mask);
1319 if (address != -1) {
1320 address = dom->aperture[i]->offset +
1321 (address << PAGE_SHIFT);
1322 dom->next_address = address + (pages << PAGE_SHIFT);
1323 break;
1324 }
1325
1326 next_bit = 0;
1327 }
1328
1329 return address;
1330}
1331
1332static unsigned long dma_ops_alloc_addresses(struct device *dev,
1333 struct dma_ops_domain *dom,
1334 unsigned int pages,
1335 unsigned long align_mask,
1336 u64 dma_mask)
1337{
1338 unsigned long address;
1339
1340#ifdef CONFIG_IOMMU_STRESS
1341 dom->next_address = 0;
1342 dom->need_flush = true;
1343#endif
1344
1345 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1346 dma_mask, dom->next_address);
1347
1348 if (address == -1) {
1349 dom->next_address = 0;
1350 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1351 dma_mask, 0);
1352 dom->need_flush = true;
1353 }
1354
1355 if (unlikely(address == -1))
1356 address = DMA_ERROR_CODE;
1357
1358 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
1359
1360 return address;
1361}
1362
1363/*
1364 * The address free function.
1365 *
1366 * called with domain->lock held
1367 */
1368static void dma_ops_free_addresses(struct dma_ops_domain *dom,
1369 unsigned long address,
1370 unsigned int pages)
1371{
1372 unsigned i = address >> APERTURE_RANGE_SHIFT;
1373 struct aperture_range *range = dom->aperture[i];
1374
1375 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
1376
1377#ifdef CONFIG_IOMMU_STRESS
1378 if (i < 4)
1379 return;
1380#endif
1381
1382 if (address >= dom->next_address)
1383 dom->need_flush = true;
1384
1385 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
1386
1387 bitmap_clear(range->bitmap, address, pages);
1388
1389}
1390
1391/****************************************************************************
1392 *
1393 * The next functions belong to the domain allocation. A domain is
1394 * allocated for every IOMMU as the default domain. If device isolation
1395 * is enabled, every device gets its own domain. The most important thing
1396 * about domains is the page table mapping the DMA address space they
1397 * contain.
1398 *
1399 ****************************************************************************/
1400
1401/*
1402 * This function adds a protection domain to the global protection domain list
1403 */
1404static void add_domain_to_list(struct protection_domain *domain)
1405{
1406 unsigned long flags;
1407
1408 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1409 list_add(&domain->list, &amd_iommu_pd_list);
1410 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1411}
1412
1413/*
1414 * This function removes a protection domain from the global
1415 * protection domain list
1416 */
1417static void del_domain_from_list(struct protection_domain *domain)
1418{
1419 unsigned long flags;
1420
1421 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1422 list_del(&domain->list);
1423 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1424}
1425
1426static u16 domain_id_alloc(void)
1427{
1428 unsigned long flags;
1429 int id;
1430
1431 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1432 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1433 BUG_ON(id == 0);
1434 if (id > 0 && id < MAX_DOMAIN_ID)
1435 __set_bit(id, amd_iommu_pd_alloc_bitmap);
1436 else
1437 id = 0;
1438 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1439
1440 return id;
1441}
1442
1443static void domain_id_free(int id)
1444{
1445 unsigned long flags;
1446
1447 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1448 if (id > 0 && id < MAX_DOMAIN_ID)
1449 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
1450 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1451}
1452
1453static void free_pagetable(struct protection_domain *domain)
1454{
1455 int i, j;
1456 u64 *p1, *p2, *p3;
1457
1458 p1 = domain->pt_root;
1459
1460 if (!p1)
1461 return;
1462
1463 for (i = 0; i < 512; ++i) {
1464 if (!IOMMU_PTE_PRESENT(p1[i]))
1465 continue;
1466
1467 p2 = IOMMU_PTE_PAGE(p1[i]);
1468 for (j = 0; j < 512; ++j) {
1469 if (!IOMMU_PTE_PRESENT(p2[j]))
1470 continue;
1471 p3 = IOMMU_PTE_PAGE(p2[j]);
1472 free_page((unsigned long)p3);
1473 }
1474
1475 free_page((unsigned long)p2);
1476 }
1477
1478 free_page((unsigned long)p1);
1479
1480 domain->pt_root = NULL;
1481}
1482
1483/*
1484 * Free a domain, only used if something went wrong in the
1485 * allocation path and we need to free an already allocated page table
1486 */
1487static void dma_ops_domain_free(struct dma_ops_domain *dom)
1488{
1489 int i;
1490
1491 if (!dom)
1492 return;
1493
1494 del_domain_from_list(&dom->domain);
1495
1496 free_pagetable(&dom->domain);
1497
1498 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
1499 if (!dom->aperture[i])
1500 continue;
1501 free_page((unsigned long)dom->aperture[i]->bitmap);
1502 kfree(dom->aperture[i]);
1503 }
1504
1505 kfree(dom);
1506}
1507
1508/*
1509 * Allocates a new protection domain usable for the dma_ops functions.
1510 * It also initializes the page table and the address allocator data
1511 * structures required for the dma_ops interface
1512 */
1513static struct dma_ops_domain *dma_ops_domain_alloc(void)
1514{
1515 struct dma_ops_domain *dma_dom;
1516
1517 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
1518 if (!dma_dom)
1519 return NULL;
1520
1521 spin_lock_init(&dma_dom->domain.lock);
1522
1523 dma_dom->domain.id = domain_id_alloc();
1524 if (dma_dom->domain.id == 0)
1525 goto free_dma_dom;
1526 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1527 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1528 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1529 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1530 dma_dom->domain.priv = dma_dom;
1531 if (!dma_dom->domain.pt_root)
1532 goto free_dma_dom;
1533
1534 dma_dom->need_flush = false;
1535 dma_dom->target_dev = 0xffff;
1536
1537 add_domain_to_list(&dma_dom->domain);
1538
1539 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1540 goto free_dma_dom;
1541
1542 /*
1543 * mark the first page as allocated so we never return 0 as
1544 * a valid dma-address and can use 0 as an error value
1545 */
1546 dma_dom->aperture[0]->bitmap[0] = 1;
1547 dma_dom->next_address = 0;
1548
1549
1550 return dma_dom;
1551
1552free_dma_dom:
1553 dma_ops_domain_free(dma_dom);
1554
1555 return NULL;
1556}
1557
1558/*
1559 * little helper function to check whether a given protection domain is a
1560 * dma_ops domain
1561 */
1562static bool dma_ops_domain(struct protection_domain *domain)
1563{
1564 return domain->flags & PD_DMA_OPS_MASK;
1565}
1566
1567static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1568{
1569 u64 pte_root = virt_to_phys(domain->pt_root);
1570 u32 flags = 0;
1571
1572 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1573 << DEV_ENTRY_MODE_SHIFT;
1574 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1575
1576 if (ats)
1577 flags |= DTE_FLAG_IOTLB;
1578
1579 amd_iommu_dev_table[devid].data[3] |= flags;
1580 amd_iommu_dev_table[devid].data[2] = domain->id;
1581 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1582 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1583}
1584
1585static void clear_dte_entry(u16 devid)
1586{
1587 /* remove entry from the device table seen by the hardware */
1588 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1589 amd_iommu_dev_table[devid].data[1] = 0;
1590 amd_iommu_dev_table[devid].data[2] = 0;
1591
1592 amd_iommu_apply_erratum_63(devid);
1593}
1594
1595static void do_attach(struct iommu_dev_data *dev_data,
1596 struct protection_domain *domain)
1597{
1598 struct amd_iommu *iommu;
1599 bool ats;
1600
1601 iommu = amd_iommu_rlookup_table[dev_data->devid];
1602 ats = dev_data->ats.enabled;
1603
1604 /* Update data structures */
1605 dev_data->domain = domain;
1606 list_add(&dev_data->list, &domain->dev_list);
1607 set_dte_entry(dev_data->devid, domain, ats);
1608
1609 /* Do reference counting */
1610 domain->dev_iommu[iommu->index] += 1;
1611 domain->dev_cnt += 1;
1612
1613 /* Flush the DTE entry */
1614 device_flush_dte(dev_data);
1615}
1616
1617static void do_detach(struct iommu_dev_data *dev_data)
1618{
1619 struct amd_iommu *iommu;
1620
1621 iommu = amd_iommu_rlookup_table[dev_data->devid];
1622
1623 /* decrease reference counters */
1624 dev_data->domain->dev_iommu[iommu->index] -= 1;
1625 dev_data->domain->dev_cnt -= 1;
1626
1627 /* Update data structures */
1628 dev_data->domain = NULL;
1629 list_del(&dev_data->list);
1630 clear_dte_entry(dev_data->devid);
1631
1632 /* Flush the DTE entry */
1633 device_flush_dte(dev_data);
1634}
1635
1636/*
1637 * If a device is not yet associated with a domain, this function
1638 * associates it and makes it visible to the hardware
1639 */
1640static int __attach_device(struct iommu_dev_data *dev_data,
1641 struct protection_domain *domain)
1642{
1643 int ret;
1644
1645 /* lock domain */
1646 spin_lock(&domain->lock);
1647
1648 if (dev_data->alias_data != NULL) {
1649 struct iommu_dev_data *alias_data = dev_data->alias_data;
1650
1651 /* Some sanity checks */
1652 ret = -EBUSY;
1653 if (alias_data->domain != NULL &&
1654 alias_data->domain != domain)
1655 goto out_unlock;
1656
1657 if (dev_data->domain != NULL &&
1658 dev_data->domain != domain)
1659 goto out_unlock;
1660
1661 /* Do real assignment */
1662 if (alias_data->domain == NULL)
1663 do_attach(alias_data, domain);
1664
1665 atomic_inc(&alias_data->bind);
1666 }
1667
1668 if (dev_data->domain == NULL)
1669 do_attach(dev_data, domain);
1670
1671 atomic_inc(&dev_data->bind);
1672
1673 ret = 0;
1674
1675out_unlock:
1676
1677 /* ready */
1678 spin_unlock(&domain->lock);
1679
1680 return ret;
1681}
1682
1683/*
1684 * If a device is not yet associated with a domain, this function
1685 * associates it and makes it visible to the hardware
1686 */
1687static int attach_device(struct device *dev,
1688 struct protection_domain *domain)
1689{
1690 struct pci_dev *pdev = to_pci_dev(dev);
1691 struct iommu_dev_data *dev_data;
1692 unsigned long flags;
1693 int ret;
1694
1695 dev_data = get_dev_data(dev);
1696
1697 if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
1698 dev_data->ats.enabled = true;
1699 dev_data->ats.qdep = pci_ats_queue_depth(pdev);
1700 }
1701
1702 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1703 ret = __attach_device(dev_data, domain);
1704 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1705
1706 /*
1707 * We might boot into a crash-kernel here. The crashed kernel
1708 * left the caches in the IOMMU dirty. So we have to flush
1709 * here to evict all dirty stuff.
1710 */
1711 domain_flush_tlb_pde(domain);
1712
1713 return ret;
1714}
1715
1716/*
1717 * Removes a device from a protection domain (unlocked)
1718 */
1719static void __detach_device(struct iommu_dev_data *dev_data)
1720{
1721 struct protection_domain *domain;
1722 unsigned long flags;
1723
1724 BUG_ON(!dev_data->domain);
1725
1726 domain = dev_data->domain;
1727
1728 spin_lock_irqsave(&domain->lock, flags);
1729
1730 if (dev_data->alias_data != NULL) {
1731 struct iommu_dev_data *alias_data = dev_data->alias_data;
1732
1733 if (atomic_dec_and_test(&alias_data->bind))
1734 do_detach(alias_data);
1735 }
1736
1737 if (atomic_dec_and_test(&dev_data->bind))
1738 do_detach(dev_data);
1739
1740 spin_unlock_irqrestore(&domain->lock, flags);
1741
1742 /*
1743 * If we run in passthrough mode the device must be assigned to the
1744 * passthrough domain if it is detached from any other domain.
1745 * Make sure we can deassign from the pt_domain itself.
1746 */
1747 if (iommu_pass_through &&
1748 (dev_data->domain == NULL && domain != pt_domain))
1749 __attach_device(dev_data, pt_domain);
1750}
1751
1752/*
1753 * Removes a device from a protection domain (with devtable_lock held)
1754 */
1755static void detach_device(struct device *dev)
1756{
1757 struct iommu_dev_data *dev_data;
1758 unsigned long flags;
1759
1760 dev_data = get_dev_data(dev);
1761
1762 /* lock device table */
1763 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1764 __detach_device(dev_data);
1765 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1766
1767 if (dev_data->ats.enabled) {
1768 pci_disable_ats(to_pci_dev(dev));
1769 dev_data->ats.enabled = false;
1770 }
1771}
1772
1773/*
1774 * Find the protection domain structure for a given PCI device. This
1775 * will give us the pointer to the page table root for example.
1776 */
1777static struct protection_domain *domain_for_device(struct device *dev)
1778{
1779 struct iommu_dev_data *dev_data;
1780 struct protection_domain *dom = NULL;
1781 unsigned long flags;
1782
1783 dev_data = get_dev_data(dev);
1784
1785 if (dev_data->domain)
1786 return dev_data->domain;
1787
1788 if (dev_data->alias_data != NULL) {
1789 struct iommu_dev_data *alias_data = dev_data->alias_data;
1790
1791 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1792 if (alias_data->domain != NULL) {
1793 __attach_device(dev_data, alias_data->domain);
1794 dom = alias_data->domain;
1795 }
1796 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1797 }
1798
1799 return dom;
1800}
1801
1802static int device_change_notifier(struct notifier_block *nb,
1803 unsigned long action, void *data)
1804{
1805 struct device *dev = data;
1806 u16 devid;
1807 struct protection_domain *domain;
1808 struct dma_ops_domain *dma_domain;
1809 struct amd_iommu *iommu;
1810 unsigned long flags;
1811
1812 if (!check_device(dev))
1813 return 0;
1814
1815 devid = get_device_id(dev);
1816 iommu = amd_iommu_rlookup_table[devid];
1817
1818 switch (action) {
1819 case BUS_NOTIFY_UNBOUND_DRIVER:
1820
1821 domain = domain_for_device(dev);
1822
1823 if (!domain)
1824 goto out;
1825 if (iommu_pass_through)
1826 break;
1827 detach_device(dev);
1828 break;
1829 case BUS_NOTIFY_ADD_DEVICE:
1830
1831 iommu_init_device(dev);
1832
1833 domain = domain_for_device(dev);
1834
1835 /* allocate a protection domain if a device is added */
1836 dma_domain = find_protection_domain(devid);
1837 if (dma_domain)
1838 goto out;
1839 dma_domain = dma_ops_domain_alloc();
1840 if (!dma_domain)
1841 goto out;
1842 dma_domain->target_dev = devid;
1843
1844 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1845 list_add_tail(&dma_domain->list, &iommu_pd_list);
1846 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1847
1848 break;
1849 case BUS_NOTIFY_DEL_DEVICE:
1850
1851 iommu_uninit_device(dev);
1852
1853 default:
1854 goto out;
1855 }
1856
1857 iommu_completion_wait(iommu);
1858
1859out:
1860 return 0;
1861}
1862
1863static struct notifier_block device_nb = {
1864 .notifier_call = device_change_notifier,
1865};
1866
1867void amd_iommu_init_notifier(void)
1868{
1869 bus_register_notifier(&pci_bus_type, &device_nb);
1870}
1871
1872/*****************************************************************************
1873 *
1874 * The next functions belong to the dma_ops mapping/unmapping code.
1875 *
1876 *****************************************************************************/
1877
1878/*
1879 * In the dma_ops path we only have the struct device. This function
1880 * finds the corresponding IOMMU, the protection domain and the
1881 * requestor id for a given device.
1882 * If the device is not yet associated with a domain this is also done
1883 * in this function.
1884 */
1885static struct protection_domain *get_domain(struct device *dev)
1886{
1887 struct protection_domain *domain;
1888 struct dma_ops_domain *dma_dom;
1889 u16 devid = get_device_id(dev);
1890
1891 if (!check_device(dev))
1892 return ERR_PTR(-EINVAL);
1893
1894 domain = domain_for_device(dev);
1895 if (domain != NULL && !dma_ops_domain(domain))
1896 return ERR_PTR(-EBUSY);
1897
1898 if (domain != NULL)
1899 return domain;
1900
1901 /* Device not bound yet - bind it */
1902 dma_dom = find_protection_domain(devid);
1903 if (!dma_dom)
1904 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1905 attach_device(dev, &dma_dom->domain);
1906 DUMP_printk("Using protection domain %d for device %s\n",
1907 dma_dom->domain.id, dev_name(dev));
1908
1909 return &dma_dom->domain;
1910}
1911
1912static void update_device_table(struct protection_domain *domain)
1913{
1914 struct iommu_dev_data *dev_data;
1915
1916 list_for_each_entry(dev_data, &domain->dev_list, list)
1917 set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
1918}
1919
1920static void update_domain(struct protection_domain *domain)
1921{
1922 if (!domain->updated)
1923 return;
1924
1925 update_device_table(domain);
1926
1927 domain_flush_devices(domain);
1928 domain_flush_tlb_pde(domain);
1929
1930 domain->updated = false;
1931}
1932
1933/*
1934 * This function fetches the PTE for a given address in the aperture
1935 */
1936static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1937 unsigned long address)
1938{
1939 struct aperture_range *aperture;
1940 u64 *pte, *pte_page;
1941
1942 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1943 if (!aperture)
1944 return NULL;
1945
1946 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1947 if (!pte) {
1948 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
1949 GFP_ATOMIC);
1950 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1951 } else
1952 pte += PM_LEVEL_INDEX(0, address);
1953
1954 update_domain(&dom->domain);
1955
1956 return pte;
1957}
1958
1959/*
1960 * This is the generic map function. It maps one 4kb page at paddr to
1961 * the given address in the DMA address space for the domain.
1962 */
1963static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1964 unsigned long address,
1965 phys_addr_t paddr,
1966 int direction)
1967{
1968 u64 *pte, __pte;
1969
1970 WARN_ON(address > dom->aperture_size);
1971
1972 paddr &= PAGE_MASK;
1973
1974 pte = dma_ops_get_pte(dom, address);
1975 if (!pte)
1976 return DMA_ERROR_CODE;
1977
1978 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1979
1980 if (direction == DMA_TO_DEVICE)
1981 __pte |= IOMMU_PTE_IR;
1982 else if (direction == DMA_FROM_DEVICE)
1983 __pte |= IOMMU_PTE_IW;
1984 else if (direction == DMA_BIDIRECTIONAL)
1985 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
1986
1987 WARN_ON(*pte);
1988
1989 *pte = __pte;
1990
1991 return (dma_addr_t)address;
1992}
1993
1994/*
1995 * The generic unmapping function for one page in the DMA address space.
1996 */
1997static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1998 unsigned long address)
1999{
2000 struct aperture_range *aperture;
2001 u64 *pte;
2002
2003 if (address >= dom->aperture_size)
2004 return;
2005
2006 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
2007 if (!aperture)
2008 return;
2009
2010 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
2011 if (!pte)
2012 return;
2013
2014 pte += PM_LEVEL_INDEX(0, address);
2015
2016 WARN_ON(!*pte);
2017
2018 *pte = 0ULL;
2019}
2020
2021/*
2022 * This function contains common code for mapping of a physically
2023 * contiguous memory region into DMA address space. It is used by all
2024 * mapping functions provided with this IOMMU driver.
2025 * Must be called with the domain lock held.
2026 */
2027static dma_addr_t __map_single(struct device *dev,
2028 struct dma_ops_domain *dma_dom,
2029 phys_addr_t paddr,
2030 size_t size,
2031 int dir,
2032 bool align,
2033 u64 dma_mask)
2034{
2035 dma_addr_t offset = paddr & ~PAGE_MASK;
2036 dma_addr_t address, start, ret;
2037 unsigned int pages;
2038 unsigned long align_mask = 0;
2039 int i;
2040
2041 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
2042 paddr &= PAGE_MASK;
2043
2044 INC_STATS_COUNTER(total_map_requests);
2045
2046 if (pages > 1)
2047 INC_STATS_COUNTER(cross_page);
2048
2049 if (align)
2050 align_mask = (1UL << get_order(size)) - 1;
2051
2052retry:
2053 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
2054 dma_mask);
2055 if (unlikely(address == DMA_ERROR_CODE)) {
2056 /*
2057 * setting next_address here will let the address
2058 * allocator only scan the newly allocated range in the
2059 * first run. This is a small optimization.
2060 */
2061 dma_dom->next_address = dma_dom->aperture_size;
2062
2063 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
2064 goto out;
2065
2066 /*
2067 * aperture was successfully enlarged by 128 MB, try
2068 * allocation again
2069 */
2070 goto retry;
2071 }
2072
2073 start = address;
2074 for (i = 0; i < pages; ++i) {
2075 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
2076 if (ret == DMA_ERROR_CODE)
2077 goto out_unmap;
2078
2079 paddr += PAGE_SIZE;
2080 start += PAGE_SIZE;
2081 }
2082 address += offset;
2083
2084 ADD_STATS_COUNTER(alloced_io_mem, size);
2085
2086 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
2087 domain_flush_tlb(&dma_dom->domain);
2088 dma_dom->need_flush = false;
2089 } else if (unlikely(amd_iommu_np_cache))
2090 domain_flush_pages(&dma_dom->domain, address, size);
2091
2092out:
2093 return address;
2094
2095out_unmap:
2096
2097 for (--i; i >= 0; --i) {
2098 start -= PAGE_SIZE;
2099 dma_ops_domain_unmap(dma_dom, start);
2100 }
2101
2102 dma_ops_free_addresses(dma_dom, address, pages);
2103
2104 return DMA_ERROR_CODE;
2105}
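/*
 * Worked example (illustration, not part of the original file): when 'align'
 * is set and size is 64kb, get_order(64kb) is 4 with 4kb pages, so align_mask
 * above becomes 0xf and the address allocator hands back a page index that is
 * a multiple of 16, i.e. a 64kb-aligned DMA address.
 */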
2106
2107/*
2108 * Does the reverse of the __map_single function. Must be called with
2109 * the domain lock held too
2110 */
2111static void __unmap_single(struct dma_ops_domain *dma_dom,
2112 dma_addr_t dma_addr,
2113 size_t size,
2114 int dir)
2115{
2116 dma_addr_t flush_addr;
2117 dma_addr_t i, start;
2118 unsigned int pages;
2119
2120 if ((dma_addr == DMA_ERROR_CODE) ||
2121 (dma_addr + size > dma_dom->aperture_size))
2122 return;
2123
2124 flush_addr = dma_addr;
2125 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
2126 dma_addr &= PAGE_MASK;
2127 start = dma_addr;
2128
2129 for (i = 0; i < pages; ++i) {
2130 dma_ops_domain_unmap(dma_dom, start);
2131 start += PAGE_SIZE;
2132 }
2133
2134 SUB_STATS_COUNTER(alloced_io_mem, size);
2135
2136 dma_ops_free_addresses(dma_dom, dma_addr, pages);
2137
2138 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
2139 domain_flush_pages(&dma_dom->domain, flush_addr, size);
2140 dma_dom->need_flush = false;
2141 }
2142}
2143
2144/*
2145 * The exported map_single function for dma_ops.
2146 */
2147static dma_addr_t map_page(struct device *dev, struct page *page,
2148 unsigned long offset, size_t size,
2149 enum dma_data_direction dir,
2150 struct dma_attrs *attrs)
2151{
2152 unsigned long flags;
2153 struct protection_domain *domain;
2154 dma_addr_t addr;
2155 u64 dma_mask;
2156 phys_addr_t paddr = page_to_phys(page) + offset;
2157
2158 INC_STATS_COUNTER(cnt_map_single);
2159
2160 domain = get_domain(dev);
2161 if (PTR_ERR(domain) == -EINVAL)
2162 return (dma_addr_t)paddr;
2163 else if (IS_ERR(domain))
2164 return DMA_ERROR_CODE;
2165
2166 dma_mask = *dev->dma_mask;
2167
2168 spin_lock_irqsave(&domain->lock, flags);
2169
2170 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
2171 dma_mask);
2172 if (addr == DMA_ERROR_CODE)
2173 goto out;
2174
2175 domain_flush_complete(domain);
2176
2177out:
2178 spin_unlock_irqrestore(&domain->lock, flags);
2179
2180 return addr;
2181}
2182
2183/*
2184 * The exported unmap_single function for dma_ops.
2185 */
2186static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2187 enum dma_data_direction dir, struct dma_attrs *attrs)
2188{
2189 unsigned long flags;
2190 struct protection_domain *domain;
2191
2192 INC_STATS_COUNTER(cnt_unmap_single);
2193
2194 domain = get_domain(dev);
2195 if (IS_ERR(domain))
2196 return;
2197
2198 spin_lock_irqsave(&domain->lock, flags);
2199
2200 __unmap_single(domain->priv, dma_addr, size, dir);
2201
2202 domain_flush_complete(domain);
2203
2204 spin_unlock_irqrestore(&domain->lock, flags);
2205}
2206
2207/*
2208 * This is a special map_sg function which is used if we have to map a
2209 * device which is not handled by an AMD IOMMU in the system.
2210 */
2211static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
2212 int nelems, int dir)
2213{
2214 struct scatterlist *s;
2215 int i;
2216
2217 for_each_sg(sglist, s, nelems, i) {
2218 s->dma_address = (dma_addr_t)sg_phys(s);
2219 s->dma_length = s->length;
2220 }
2221
2222 return nelems;
2223}
2224
2225/*
2226 * The exported map_sg function for dma_ops (handles scatter-gather
2227 * lists).
2228 */
2229static int map_sg(struct device *dev, struct scatterlist *sglist,
2230 int nelems, enum dma_data_direction dir,
2231 struct dma_attrs *attrs)
2232{
2233 unsigned long flags;
2234 struct protection_domain *domain;
2235 int i;
2236 struct scatterlist *s;
2237 phys_addr_t paddr;
2238 int mapped_elems = 0;
2239 u64 dma_mask;
2240
2241 INC_STATS_COUNTER(cnt_map_sg);
2242
2243 domain = get_domain(dev);
2244 if (PTR_ERR(domain) == -EINVAL)
2245 return map_sg_no_iommu(dev, sglist, nelems, dir);
2246 else if (IS_ERR(domain))
2247 return 0;
2248
2249 dma_mask = *dev->dma_mask;
2250
2251 spin_lock_irqsave(&domain->lock, flags);
2252
2253 for_each_sg(sglist, s, nelems, i) {
2254 paddr = sg_phys(s);
2255
2256 s->dma_address = __map_single(dev, domain->priv,
2257 paddr, s->length, dir, false,
2258 dma_mask);
2259
2260 if (s->dma_address) {
2261 s->dma_length = s->length;
2262 mapped_elems++;
2263 } else
2264 goto unmap;
2265 }
2266
2267 domain_flush_complete(domain);
2268
2269out:
2270 spin_unlock_irqrestore(&domain->lock, flags);
2271
2272 return mapped_elems;
2273unmap:
2274 for_each_sg(sglist, s, mapped_elems, i) {
2275 if (s->dma_address)
2276 __unmap_single(domain->priv, s->dma_address,
2277 s->dma_length, dir);
2278 s->dma_address = s->dma_length = 0;
2279 }
2280
2281 mapped_elems = 0;
2282
2283 goto out;
2284}
2285
2286/*
2287 * The exported unmap_sg function for dma_ops (handles scatter-gather
2288 * lists).
2289 */
2290static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2291 int nelems, enum dma_data_direction dir,
2292 struct dma_attrs *attrs)
2293{
2294 unsigned long flags;
2295 struct protection_domain *domain;
2296 struct scatterlist *s;
2297 int i;
2298
2299 INC_STATS_COUNTER(cnt_unmap_sg);
2300
2301 domain = get_domain(dev);
2302 if (IS_ERR(domain))
2303 return;
2304
2305 spin_lock_irqsave(&domain->lock, flags);
2306
2307 for_each_sg(sglist, s, nelems, i) {
2308 __unmap_single(domain->priv, s->dma_address,
2309 s->dma_length, dir);
2310 s->dma_address = s->dma_length = 0;
2311 }
2312
2313 domain_flush_complete(domain);
2314
2315 spin_unlock_irqrestore(&domain->lock, flags);
2316}
2317
2318/*
2319 * The exported alloc_coherent function for dma_ops.
2320 */
2321static void *alloc_coherent(struct device *dev, size_t size,
2322 dma_addr_t *dma_addr, gfp_t flag)
2323{
2324 unsigned long flags;
2325 void *virt_addr;
2326 struct protection_domain *domain;
2327 phys_addr_t paddr;
2328 u64 dma_mask = dev->coherent_dma_mask;
2329
2330 INC_STATS_COUNTER(cnt_alloc_coherent);
2331
2332 domain = get_domain(dev);
2333 if (PTR_ERR(domain) == -EINVAL) {
2334 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2335 *dma_addr = __pa(virt_addr);
2336 return virt_addr;
2337 } else if (IS_ERR(domain))
2338 return NULL;
2339
2340 dma_mask = dev->coherent_dma_mask;
2341 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2342 flag |= __GFP_ZERO;
2343
2344 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2345 if (!virt_addr)
2346 return NULL;
2347
2348 paddr = virt_to_phys(virt_addr);
2349
2350 if (!dma_mask)
2351 dma_mask = *dev->dma_mask;
2352
2353 spin_lock_irqsave(&domain->lock, flags);
2354
2355 *dma_addr = __map_single(dev, domain->priv, paddr,
2356 size, DMA_BIDIRECTIONAL, true, dma_mask);
2357
2358 if (*dma_addr == DMA_ERROR_CODE) {
2359 spin_unlock_irqrestore(&domain->lock, flags);
2360 goto out_free;
2361 }
2362
2363 domain_flush_complete(domain);
2364
2365 spin_unlock_irqrestore(&domain->lock, flags);
2366
2367 return virt_addr;
2368
2369out_free:
2370
2371 free_pages((unsigned long)virt_addr, get_order(size));
2372
2373 return NULL;
2374}
2375
2376/*
2377 * The exported free_coherent function for dma_ops.
2378 */
2379static void free_coherent(struct device *dev, size_t size,
2380 void *virt_addr, dma_addr_t dma_addr)
2381{
2382 unsigned long flags;
2383 struct protection_domain *domain;
2384
2385 INC_STATS_COUNTER(cnt_free_coherent);
2386
2387 domain = get_domain(dev);
2388 if (IS_ERR(domain))
2389 goto free_mem;
2390
2391 spin_lock_irqsave(&domain->lock, flags);
2392
2393 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2394
2395 domain_flush_complete(domain);
2396
2397 spin_unlock_irqrestore(&domain->lock, flags);
2398
2399free_mem:
2400 free_pages((unsigned long)virt_addr, get_order(size));
2401}
2402
2403/*
2404 * This function is called by the DMA layer to find out if we can handle a
2405 * particular device. It is part of the dma_ops.
2406 */
2407static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2408{
2409 return check_device(dev);
2410}
2411
2412/*
2413 * The function for pre-allocating protection domains.
2414 *
2415 * If the driver core informed the DMA layer when a driver grabs a
2416 * device, we would not need to preallocate the protection domains
2417 * anymore. For now we have to.
2418 */
2419static void prealloc_protection_domains(void)
2420{
2421 struct pci_dev *dev = NULL;
2422 struct dma_ops_domain *dma_dom;
2423 u16 devid;
2424
2425 for_each_pci_dev(dev) {
2426
2427 /* Do we handle this device? */
2428 if (!check_device(&dev->dev))
2429 continue;
2430
2431 /* Is there already any domain for it? */
2432 if (domain_for_device(&dev->dev))
2433 continue;
2434
2435 devid = get_device_id(&dev->dev);
2436
2437 dma_dom = dma_ops_domain_alloc();
2438 if (!dma_dom)
2439 continue;
2440 init_unity_mappings_for_device(dma_dom, devid);
2441 dma_dom->target_dev = devid;
2442
2443 attach_device(&dev->dev, &dma_dom->domain);
2444
2445 list_add_tail(&dma_dom->list, &iommu_pd_list);
2446 }
2447}
2448
2449static struct dma_map_ops amd_iommu_dma_ops = {
2450 .alloc_coherent = alloc_coherent,
2451 .free_coherent = free_coherent,
2452 .map_page = map_page,
2453 .unmap_page = unmap_page,
2454 .map_sg = map_sg,
2455 .unmap_sg = unmap_sg,
2456 .dma_supported = amd_iommu_dma_supported,
2457};
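/*
 * Illustration only (hedged sketch, not part of the driver): once
 * device_dma_ops_init() below has installed amd_iommu_dma_ops in
 * dev->archdata.dma_ops, an ordinary driver's streaming DMA mapping ends up
 * in map_page()/unmap_page() above. The device and buffer are hypothetical.
 */
static void example_streaming_dma(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return;			/* map_page() returned DMA_ERROR_CODE */

	/* ... the device would DMA from 'handle' here ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
}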
2458
2459static unsigned device_dma_ops_init(void)
2460{
2461 struct pci_dev *pdev = NULL;
2462 unsigned unhandled = 0;
2463
2464 for_each_pci_dev(pdev) {
2465 if (!check_device(&pdev->dev)) {
2466 unhandled += 1;
2467 continue;
2468 }
2469
2470 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2471 }
2472
2473 return unhandled;
2474}
2475
2476/*
2477 * The function which glues the AMD IOMMU driver into dma_ops.
2478 */
2479
2480void __init amd_iommu_init_api(void)
2481{
2482 register_iommu(&amd_iommu_ops);
2483}
2484
2485int __init amd_iommu_init_dma_ops(void)
2486{
2487 struct amd_iommu *iommu;
2488 int ret, unhandled;
2489
2490 /*
2491 * first allocate a default protection domain for every IOMMU we
2492 * found in the system. Devices not assigned to any other
2493 * protection domain will be assigned to the default one.
2494 */
2495 for_each_iommu(iommu) {
2496 iommu->default_dom = dma_ops_domain_alloc();
2497 if (iommu->default_dom == NULL)
2498 return -ENOMEM;
2499 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
2500 ret = iommu_init_unity_mappings(iommu);
2501 if (ret)
2502 goto free_domains;
2503 }
2504
2505 /*
2506 * Pre-allocate the protection domains for each device.
2507 */
2508 prealloc_protection_domains();
2509
2510 iommu_detected = 1;
2511 swiotlb = 0;
2512
2513 /* Finally make this driver's dma_ops visible to the device drivers */
2514 unhandled = device_dma_ops_init();
2515 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2516 /* There are unhandled devices - initialize swiotlb for them */
2517 swiotlb = 1;
2518 }
2519
2520 amd_iommu_stats_init();
2521
2522 return 0;
2523
2524free_domains:
2525
2526 for_each_iommu(iommu) {
2527 if (iommu->default_dom)
2528 dma_ops_domain_free(iommu->default_dom);
2529 }
2530
2531 return ret;
2532}
2533
2534/*****************************************************************************
2535 *
2536 * The following functions belong to the exported interface of AMD IOMMU
2537 *
2538 * This interface allows access to lower level functions of the IOMMU
2539 * like protection domain handling and assignment of devices to domains
2540 * which is not possible with the dma_ops interface.
2541 *
2542 *****************************************************************************/
2543
2544static void cleanup_domain(struct protection_domain *domain)
2545{
2546 struct iommu_dev_data *dev_data, *next;
2547 unsigned long flags;
2548
2549 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2550
2551 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2552 __detach_device(dev_data);
2553 atomic_set(&dev_data->bind, 0);
2554 }
2555
2556 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2557}
2558
2559static void protection_domain_free(struct protection_domain *domain)
2560{
2561 if (!domain)
2562 return;
2563
2564 del_domain_from_list(domain);
2565
2566 if (domain->id)
2567 domain_id_free(domain->id);
2568
2569 kfree(domain);
2570}
2571
2572static struct protection_domain *protection_domain_alloc(void)
2573{
2574 struct protection_domain *domain;
2575
2576 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2577 if (!domain)
2578 return NULL;
2579
2580 spin_lock_init(&domain->lock);
2581 mutex_init(&domain->api_lock);
2582 domain->id = domain_id_alloc();
2583 if (!domain->id)
2584 goto out_err;
2585 INIT_LIST_HEAD(&domain->dev_list);
2586
2587 add_domain_to_list(domain);
2588
2589 return domain;
2590
2591out_err:
2592 kfree(domain);
2593
2594 return NULL;
2595}
2596
2597static int amd_iommu_domain_init(struct iommu_domain *dom)
2598{
2599 struct protection_domain *domain;
2600
2601 domain = protection_domain_alloc();
2602 if (!domain)
2603 goto out_free;
2604
2605 domain->mode = PAGE_MODE_3_LEVEL;
2606 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2607 if (!domain->pt_root)
2608 goto out_free;
2609
2610 dom->priv = domain;
2611
2612 return 0;
2613
2614out_free:
2615 protection_domain_free(domain);
2616
2617 return -ENOMEM;
2618}
2619
2620static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2621{
2622 struct protection_domain *domain = dom->priv;
2623
2624 if (!domain)
2625 return;
2626
2627 if (domain->dev_cnt > 0)
2628 cleanup_domain(domain);
2629
2630 BUG_ON(domain->dev_cnt != 0);
2631
2632 free_pagetable(domain);
2633
2634 protection_domain_free(domain);
2635
2636 dom->priv = NULL;
2637}
2638
2639static void amd_iommu_detach_device(struct iommu_domain *dom,
2640 struct device *dev)
2641{
2642 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2643 struct amd_iommu *iommu;
2644 u16 devid;
2645
2646 if (!check_device(dev))
2647 return;
2648
2649 devid = get_device_id(dev);
2650
2651 if (dev_data->domain != NULL)
2652 detach_device(dev);
2653
2654 iommu = amd_iommu_rlookup_table[devid];
2655 if (!iommu)
2656 return;
2657
2658 iommu_completion_wait(iommu);
2659}
2660
2661static int amd_iommu_attach_device(struct iommu_domain *dom,
2662 struct device *dev)
2663{
2664 struct protection_domain *domain = dom->priv;
2665 struct iommu_dev_data *dev_data;
2666 struct amd_iommu *iommu;
2667 int ret;
2668
2669 if (!check_device(dev))
2670 return -EINVAL;
2671
2672 dev_data = dev->archdata.iommu;
2673
2674 iommu = amd_iommu_rlookup_table[dev_data->devid];
2675 if (!iommu)
2676 return -EINVAL;
2677
2678 if (dev_data->domain)
2679 detach_device(dev);
2680
2681 ret = attach_device(dev, domain);
2682
2683 iommu_completion_wait(iommu);
2684
2685 return ret;
2686}
2687
2688static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2689 phys_addr_t paddr, int gfp_order, int iommu_prot)
2690{
2691 unsigned long page_size = 0x1000UL << gfp_order;
2692 struct protection_domain *domain = dom->priv;
2693 int prot = 0;
2694 int ret;
2695
2696 if (iommu_prot & IOMMU_READ)
2697 prot |= IOMMU_PROT_IR;
2698 if (iommu_prot & IOMMU_WRITE)
2699 prot |= IOMMU_PROT_IW;
2700
2701 mutex_lock(&domain->api_lock);
2702 ret = iommu_map_page(domain, iova, paddr, prot, page_size);
2703 mutex_unlock(&domain->api_lock);
2704
2705 return ret;
2706}
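/*
 * Worked example (illustration, not part of the original file): the page size
 * handed to iommu_map_page() above is 0x1000UL << gfp_order, so gfp_order == 0
 * maps a single 4kb page and gfp_order == 9 maps a 2 MB range
 * (0x1000 << 9 == 0x200000).
 */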
2707
2708static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2709 int gfp_order)
2710{
2711 struct protection_domain *domain = dom->priv;
2712 unsigned long page_size, unmap_size;
2713
2714 page_size = 0x1000UL << gfp_order;
2715
2716 mutex_lock(&domain->api_lock);
2717 unmap_size = iommu_unmap_page(domain, iova, page_size);
2718 mutex_unlock(&domain->api_lock);
2719
2720 domain_flush_tlb_pde(domain);
2721
2722 return get_order(unmap_size);
2723}
2724
2725static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2726 unsigned long iova)
2727{
2728 struct protection_domain *domain = dom->priv;
2729 unsigned long offset_mask;
2730 phys_addr_t paddr;
2731 u64 *pte, __pte;
2732
2733 pte = fetch_pte(domain, iova);
2734
2735 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2736 return 0;
2737
2738 if (PM_PTE_LEVEL(*pte) == 0)
2739 offset_mask = PAGE_SIZE - 1;
2740 else
2741 offset_mask = PTE_PAGE_SIZE(*pte) - 1;
2742
2743 __pte = *pte & PM_ADDR_MASK;
2744 paddr = (__pte & ~offset_mask) | (iova & offset_mask);
2745
2746 return paddr;
2747}
2748
2749static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2750 unsigned long cap)
2751{
2752 switch (cap) {
2753 case IOMMU_CAP_CACHE_COHERENCY:
2754 return 1;
2755 }
2756
2757 return 0;
2758}
2759
2760static struct iommu_ops amd_iommu_ops = {
2761 .domain_init = amd_iommu_domain_init,
2762 .domain_destroy = amd_iommu_domain_destroy,
2763 .attach_dev = amd_iommu_attach_device,
2764 .detach_dev = amd_iommu_detach_device,
2765 .map = amd_iommu_map,
2766 .unmap = amd_iommu_unmap,
2767 .iova_to_phys = amd_iommu_iova_to_phys,
2768 .domain_has_cap = amd_iommu_domain_has_cap,
2769};
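/*
 * Illustration only (hedged sketch, not part of the driver): with
 * amd_iommu_ops registered through register_iommu() in amd_iommu_init_api(),
 * a user of the generic IOMMU API (KVM device assignment, for example)
 * reaches the callbacks above roughly like this. The iommu_* prototypes are
 * assumed to match this kernel version; the device pointer and addresses are
 * hypothetical.
 */
static int example_iommu_api_usage(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc();		/* -> amd_iommu_domain_init() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);	/* -> amd_iommu_attach_device() */
	if (ret)
		goto out_free;

	/* map a single 4kb page (order 0): iova 0x100000 -> paddr 0x200000 */
	ret = iommu_map(domain, 0x100000, 0x200000, 0, IOMMU_READ | IOMMU_WRITE);
	if (!ret)
		iommu_unmap(domain, 0x100000, 0); /* -> amd_iommu_unmap() */

	iommu_detach_device(domain, dev);	/* -> amd_iommu_detach_device() */
out_free:
	iommu_domain_free(domain);
	return ret;
}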
2770
2771/*****************************************************************************
2772 *
2773 * The next functions do a basic initialization of IOMMU for pass through
2774 * mode
2775 *
2776 * In passthrough mode the IOMMU is initialized and enabled but not used for
2777 * DMA-API translation.
2778 *
2779 *****************************************************************************/
2780
2781int __init amd_iommu_init_passthrough(void)
2782{
2783 struct amd_iommu *iommu;
2784 struct pci_dev *dev = NULL;
2785 u16 devid;
2786
2787 /* allocate passthrough domain */
2788 pt_domain = protection_domain_alloc();
2789 if (!pt_domain)
2790 return -ENOMEM;
2791
2792 pt_domain->mode |= PAGE_MODE_NONE;
2793
2794 for_each_pci_dev(dev) {
2795 if (!check_device(&dev->dev))
2796 continue;
2797
2798 devid = get_device_id(&dev->dev);
2799
2800 iommu = amd_iommu_rlookup_table[devid];
2801 if (!iommu)
2802 continue;
2803
2804 attach_device(&dev->dev, pt_domain);
2805 }
2806
2807 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2808
2809 return 0;
2810}
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
new file mode 100644
index 000000000000..82d2410f4205
--- /dev/null
+++ b/drivers/iommu/amd_iommu_init.c
@@ -0,0 +1,1574 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/acpi.h>
22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
27#include <linux/amd-iommu.h>
28#include <asm/pci-direct.h>
29#include <asm/iommu.h>
30#include <asm/gart.h>
31#include <asm/x86_init.h>
32#include <asm/iommu_table.h>
33
34#include "amd_iommu_proto.h"
35#include "amd_iommu_types.h"
36
37/*
38 * definitions for the ACPI scanning code
39 */
40#define IVRS_HEADER_LENGTH 48
41
42#define ACPI_IVHD_TYPE 0x10
43#define ACPI_IVMD_TYPE_ALL 0x20
44#define ACPI_IVMD_TYPE 0x21
45#define ACPI_IVMD_TYPE_RANGE 0x22
46
47#define IVHD_DEV_ALL 0x01
48#define IVHD_DEV_SELECT 0x02
49#define IVHD_DEV_SELECT_RANGE_START 0x03
50#define IVHD_DEV_RANGE_END 0x04
51#define IVHD_DEV_ALIAS 0x42
52#define IVHD_DEV_ALIAS_RANGE 0x43
53#define IVHD_DEV_EXT_SELECT 0x46
54#define IVHD_DEV_EXT_SELECT_RANGE 0x47
55
56#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
57#define IVHD_FLAG_PASSPW_EN_MASK 0x02
58#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
59#define IVHD_FLAG_ISOC_EN_MASK 0x08
60
61#define IVMD_FLAG_EXCL_RANGE 0x08
62#define IVMD_FLAG_UNITY_MAP 0x01
63
64#define ACPI_DEVFLAG_INITPASS 0x01
65#define ACPI_DEVFLAG_EXTINT 0x02
66#define ACPI_DEVFLAG_NMI 0x04
67#define ACPI_DEVFLAG_SYSMGT1 0x10
68#define ACPI_DEVFLAG_SYSMGT2 0x20
69#define ACPI_DEVFLAG_LINT0 0x40
70#define ACPI_DEVFLAG_LINT1 0x80
71#define ACPI_DEVFLAG_ATSDIS 0x10000000
72
73/*
74 * ACPI table definitions
75 *
76 * These data structures are laid over the table to parse the important values
77 * out of it.
78 */
79
80/*
81 * structure describing one IOMMU in the ACPI table. Typically followed by one
82 * or more ivhd_entries.
83 */
84struct ivhd_header {
85 u8 type;
86 u8 flags;
87 u16 length;
88 u16 devid;
89 u16 cap_ptr;
90 u64 mmio_phys;
91 u16 pci_seg;
92 u16 info;
93 u32 reserved;
94} __attribute__((packed));
95
96/*
97 * A device entry describing which devices a specific IOMMU translates and
98 * which requestor ids they use.
99 */
100struct ivhd_entry {
101 u8 type;
102 u16 devid;
103 u8 flags;
104 u32 ext;
105} __attribute__((packed));
106
107/*
108 * An AMD IOMMU memory definition structure. It defines things like exclusion
109 * ranges for devices and regions that should be unity mapped.
110 */
111struct ivmd_header {
112 u8 type;
113 u8 flags;
114 u16 length;
115 u16 devid;
116 u16 aux;
117 u64 resv;
118 u64 range_start;
119 u64 range_length;
120} __attribute__((packed));
121
122bool amd_iommu_dump;
123
124static int __initdata amd_iommu_detected;
125static bool __initdata amd_iommu_disabled;
126
127u16 amd_iommu_last_bdf; /* largest PCI device id we have
128 to handle */
129LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
130 we find in ACPI */
131bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
132
133LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
134 system */
135
136/* Array to assign indices to IOMMUs */
137struct amd_iommu *amd_iommus[MAX_IOMMUS];
138int amd_iommus_present;
139
140/* IOMMUs have a non-present cache? */
141bool amd_iommu_np_cache __read_mostly;
142bool amd_iommu_iotlb_sup __read_mostly = true;
143
144/*
145 * The ACPI table parsing functions set this variable on an error
146 */
147static int __initdata amd_iommu_init_err;
148
149/*
150 * List of protection domains - used during resume
151 */
152LIST_HEAD(amd_iommu_pd_list);
153spinlock_t amd_iommu_pd_lock;
154
155/*
156 * Pointer to the device table which is shared by all AMD IOMMUs.
157 * It is indexed by the PCI device id or the HT unit id and contains
158 * information about the domain the device belongs to as well as the
159 * page table root pointer.
160 */
161struct dev_table_entry *amd_iommu_dev_table;
162
163/*
164 * The alias table is a driver specific data structure which contains the
165 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
166 * More than one device can share the same requestor id.
167 */
168u16 *amd_iommu_alias_table;
169
170/*
171 * The rlookup table is used to find the IOMMU which is responsible
172 * for a specific device. It is also indexed by the PCI device id.
173 */
174struct amd_iommu **amd_iommu_rlookup_table;
175
176/*
177 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
178 * to know which ones are already in use.
179 */
180unsigned long *amd_iommu_pd_alloc_bitmap;
181
182static u32 dev_table_size; /* size of the device table */
183static u32 alias_table_size; /* size of the alias table */
184static u32 rlookup_table_size; /* size of the rlookup table */
185
186/*
187 * This function flushes all internal caches of
188 * the IOMMU used by this driver.
189 */
190extern void iommu_flush_all_caches(struct amd_iommu *iommu);
191
192static inline void update_last_devid(u16 devid)
193{
194 if (devid > amd_iommu_last_bdf)
195 amd_iommu_last_bdf = devid;
196}
197
198static inline unsigned long tbl_size(int entry_size)
199{
200 unsigned shift = PAGE_SHIFT +
201 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
202
203 return 1UL << shift;
204}
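/*
 * Worked example (illustration, assuming a 32-byte device table entry):
 * with amd_iommu_last_bdf == 0xffff, (0xffff + 1) * 32 is 2 MB,
 * get_order(2 MB) is 9 with 4kb pages, so tbl_size() returns
 * 1UL << (12 + 9) == 2 MB.
 */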
205
206/* Access to l1 and l2 indexed register spaces */
207
208static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
209{
210 u32 val;
211
212 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
213 pci_read_config_dword(iommu->dev, 0xfc, &val);
214 return val;
215}
216
217static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
218{
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
220 pci_write_config_dword(iommu->dev, 0xfc, val);
221 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
222}
223
224static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
225{
226 u32 val;
227
228 pci_write_config_dword(iommu->dev, 0xf0, address);
229 pci_read_config_dword(iommu->dev, 0xf4, &val);
230 return val;
231}
232
233static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
234{
235 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
236 pci_write_config_dword(iommu->dev, 0xf4, val);
237}
238
239/****************************************************************************
240 *
241 * AMD IOMMU MMIO register space handling functions
242 *
243 * These functions are used to program the IOMMU device registers in
244 * MMIO space required for that driver.
245 *
246 ****************************************************************************/
247
248/*
249 * This function sets the exclusion range in the IOMMU. DMA accesses to the
250 * exclusion range are passed through untranslated
251 */
252static void iommu_set_exclusion_range(struct amd_iommu *iommu)
253{
254 u64 start = iommu->exclusion_start & PAGE_MASK;
255 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
256 u64 entry;
257
258 if (!iommu->exclusion_start)
259 return;
260
261 entry = start | MMIO_EXCL_ENABLE_MASK;
262 memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
263 &entry, sizeof(entry));
264
265 entry = limit;
266 memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
267 &entry, sizeof(entry));
268}
269
270/* Programs the physical address of the device table into the IOMMU hardware */
271static void __init iommu_set_device_table(struct amd_iommu *iommu)
272{
273 u64 entry;
274
275 BUG_ON(iommu->mmio_base == NULL);
276
277 entry = virt_to_phys(amd_iommu_dev_table);
278 entry |= (dev_table_size >> 12) - 1;
279 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
280 &entry, sizeof(entry));
281}
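/*
 * Worked example (illustration, not part of the original file): the low bits
 * or'ed into the base address above encode the table size in 4kb units minus
 * one, so a 2 MB device table stores (2 MB >> 12) - 1 == 511.
 */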
282
283/* Generic functions to enable/disable certain features of the IOMMU. */
284static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
285{
286 u32 ctrl;
287
288 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
289 ctrl |= (1 << bit);
290 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
291}
292
293static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
294{
295 u32 ctrl;
296
297 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
298 ctrl &= ~(1 << bit);
299 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
300}
301
302/* Function to enable the hardware */
303static void iommu_enable(struct amd_iommu *iommu)
304{
305 static const char * const feat_str[] = {
306 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
307 "IA", "GA", "HE", "PC", NULL
308 };
309 int i;
310
311 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
312 dev_name(&iommu->dev->dev), iommu->cap_ptr);
313
314 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
315 printk(KERN_CONT " extended features: ");
316 for (i = 0; feat_str[i]; ++i)
317 if (iommu_feature(iommu, (1ULL << i)))
318 printk(KERN_CONT " %s", feat_str[i]);
319 }
320 printk(KERN_CONT "\n");
321
322 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
323}
324
325static void iommu_disable(struct amd_iommu *iommu)
326{
327 /* Disable command buffer */
328 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
329
330 /* Disable event logging and event interrupts */
331 iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
332 iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
333
334 /* Disable IOMMU hardware itself */
335 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
336}
337
338/*
339 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
340 * the system has one.
341 */
342static u8 * __init iommu_map_mmio_space(u64 address)
343{
344 u8 *ret;
345
346 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
347 pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
348 address);
349 pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
350 return NULL;
351 }
352
353 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
354 if (ret != NULL)
355 return ret;
356
357 release_mem_region(address, MMIO_REGION_LENGTH);
358
359 return NULL;
360}
361
362static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
363{
364 if (iommu->mmio_base)
365 iounmap(iommu->mmio_base);
366 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
367}
368
369/****************************************************************************
370 *
371 * The functions below belong to the first pass of AMD IOMMU ACPI table
372 * parsing. In this pass we try to find out the highest device id this
373 * code has to handle. Based on this information the size of the shared data
374 * structures is determined later.
375 *
376 ****************************************************************************/
377
378/*
379 * This function calculates the length of a given IVHD entry
380 */
381static inline int ivhd_entry_length(u8 *ivhd)
382{
383 return 0x04 << (*ivhd >> 6);
384}
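/*
 * Worked example (illustration, not part of the original file): the top two
 * bits of the entry type select the entry size, 0x04 << (type >> 6).
 * IVHD_DEV_SELECT (0x02) is therefore a 4-byte entry, while IVHD_DEV_ALIAS
 * (0x42) has bit 6 set and is 8 bytes long.
 */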
385
386/*
387 * This function reads the last device id the IOMMU has to handle from the PCI
388 * capability header for this IOMMU
389 */
390static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
391{
392 u32 cap;
393
394 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
395 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
396
397 return 0;
398}
399
400/*
401 * After reading the highest device id from the IOMMU PCI capability header
402 * this function looks if there is a higher device id defined in the ACPI table
403 */
404static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
405{
406 u8 *p = (void *)h, *end = (void *)h;
407 struct ivhd_entry *dev;
408
409 p += sizeof(*h);
410 end += h->length;
411
412 find_last_devid_on_pci(PCI_BUS(h->devid),
413 PCI_SLOT(h->devid),
414 PCI_FUNC(h->devid),
415 h->cap_ptr);
416
417 while (p < end) {
418 dev = (struct ivhd_entry *)p;
419 switch (dev->type) {
420 case IVHD_DEV_SELECT:
421 case IVHD_DEV_RANGE_END:
422 case IVHD_DEV_ALIAS:
423 case IVHD_DEV_EXT_SELECT:
424 /* all the above subfield types refer to device ids */
425 update_last_devid(dev->devid);
426 break;
427 default:
428 break;
429 }
430 p += ivhd_entry_length(p);
431 }
432
433 WARN_ON(p != end);
434
435 return 0;
436}
437
438/*
439 * Iterate over all IVHD entries in the ACPI table and find the highest device
440 * id which we need to handle. This is the first of three functions which parse
441 * the ACPI table. So we check the checksum here.
442 */
443static int __init find_last_devid_acpi(struct acpi_table_header *table)
444{
445 int i;
446 u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
447 struct ivhd_header *h;
448
449 /*
450 * Validate checksum here so we don't need to do it when
451 * we actually parse the table
452 */
453 for (i = 0; i < table->length; ++i)
454 checksum += p[i];
455 if (checksum != 0) {
456 /* ACPI table corrupt */
457 amd_iommu_init_err = -ENODEV;
458 return 0;
459 }
460
461 p += IVRS_HEADER_LENGTH;
462
463 end += table->length;
464 while (p < end) {
465 h = (struct ivhd_header *)p;
466 switch (h->type) {
467 case ACPI_IVHD_TYPE:
468 find_last_devid_from_ivhd(h);
469 break;
470 default:
471 break;
472 }
473 p += h->length;
474 }
475 WARN_ON(p != end);
476
477 return 0;
478}
479
480/****************************************************************************
481 *
482 * The following functions belong to the code path which parses the ACPI table
483 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
484 * data structures, initialize the device/alias/rlookup table and also
485 * basically initialize the hardware.
486 *
487 ****************************************************************************/
488
489/*
490 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
491 * write commands to that buffer later and the IOMMU will execute them
492 * asynchronously
493 */
494static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
495{
496 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
497 get_order(CMD_BUFFER_SIZE));
498
499 if (cmd_buf == NULL)
500 return NULL;
501
502 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
503
504 return cmd_buf;
505}
506
507/*
508 * This function resets the command buffer if the IOMMU stopped fetching
509 * commands from it.
510 */
511void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
512{
513 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
514
515 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
516 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
517
518 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
519}
520
521/*
522 * This function writes the command buffer address to the hardware and
523 * enables it.
524 */
525static void iommu_enable_command_buffer(struct amd_iommu *iommu)
526{
527 u64 entry;
528
529 BUG_ON(iommu->cmd_buf == NULL);
530
531 entry = (u64)virt_to_phys(iommu->cmd_buf);
532 entry |= MMIO_CMD_SIZE_512;
533
534 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
535 &entry, sizeof(entry));
536
537 amd_iommu_reset_cmd_buffer(iommu);
538 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
539}
540
541static void __init free_command_buffer(struct amd_iommu *iommu)
542{
543 free_pages((unsigned long)iommu->cmd_buf,
544 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
545}
546
547/* allocates the memory where the IOMMU will log its events to */
548static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
549{
550 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
551 get_order(EVT_BUFFER_SIZE));
552
553 if (iommu->evt_buf == NULL)
554 return NULL;
555
556 iommu->evt_buf_size = EVT_BUFFER_SIZE;
557
558 return iommu->evt_buf;
559}
560
561static void iommu_enable_event_buffer(struct amd_iommu *iommu)
562{
563 u64 entry;
564
565 BUG_ON(iommu->evt_buf == NULL);
566
567 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
568
569 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
570 &entry, sizeof(entry));
571
572 /* set head and tail to zero manually */
573 writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
574 writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
575
576 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
577}
578
579static void __init free_event_buffer(struct amd_iommu *iommu)
580{
581 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
582}
583
584/* sets a specific bit in the device table entry. */
585static void set_dev_entry_bit(u16 devid, u8 bit)
586{
587 int i = (bit >> 5) & 0x07;
588 int _bit = bit & 0x1f;
589
590 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
591}
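/*
 * Worked example (illustration, not part of the original file): the device
 * table entry is treated as eight 32-bit words here. For a hypothetical bit
 * number 98, i = (98 >> 5) & 0x07 == 3 and _bit = 98 & 0x1f == 2, so bit 2
 * of data[3] gets set.
 */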
592
593static int get_dev_entry_bit(u16 devid, u8 bit)
594{
595 int i = (bit >> 5) & 0x07;
596 int _bit = bit & 0x1f;
597
598 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
599}
600
601
602void amd_iommu_apply_erratum_63(u16 devid)
603{
604 int sysmgt;
605
606 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
607 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
608
609 if (sysmgt == 0x01)
610 set_dev_entry_bit(devid, DEV_ENTRY_IW);
611}
612
613/* Writes the specific IOMMU for a device into the rlookup table */
614static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
615{
616 amd_iommu_rlookup_table[devid] = iommu;
617}
618
619/*
620 * This function takes the device specific flags read from the ACPI
621 * table and sets up the device table entry with that information
622 */
623static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
624 u16 devid, u32 flags, u32 ext_flags)
625{
626 if (flags & ACPI_DEVFLAG_INITPASS)
627 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
628 if (flags & ACPI_DEVFLAG_EXTINT)
629 set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
630 if (flags & ACPI_DEVFLAG_NMI)
631 set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
632 if (flags & ACPI_DEVFLAG_SYSMGT1)
633 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
634 if (flags & ACPI_DEVFLAG_SYSMGT2)
635 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
636 if (flags & ACPI_DEVFLAG_LINT0)
637 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
638 if (flags & ACPI_DEVFLAG_LINT1)
639 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
640
641 amd_iommu_apply_erratum_63(devid);
642
643 set_iommu_for_device(iommu, devid);
644}
645
646/*
647 * Reads the device exclusion range from ACPI and initializes the IOMMU
648 * with it
649 */
650static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
651{
652 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
653
654 if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
655 return;
656
657 if (iommu) {
658 /*
659 * We can only configure exclusion ranges per IOMMU, not
660 * per device. But we can enable the exclusion range per
661 * device. This is done here
662 */
663 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
664 iommu->exclusion_start = m->range_start;
665 iommu->exclusion_length = m->range_length;
666 }
667}
668
669/*
670 * This function reads some important data from the IOMMU PCI space and
671 * initializes the driver data structure with it. It reads the hardware
672 * capabilities and the first/last device entries
673 */
674static void __init init_iommu_from_pci(struct amd_iommu *iommu)
675{
676 int cap_ptr = iommu->cap_ptr;
677 u32 range, misc, low, high;
678 int i, j;
679
680 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
681 &iommu->cap);
682 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
683 &range);
684 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
685 &misc);
686
687 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
688 MMIO_GET_FD(range));
689 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
690 MMIO_GET_LD(range));
691 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
692
693 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
694 amd_iommu_iotlb_sup = false;
695
696 /* read extended feature bits */
697 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
698 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
699
700 iommu->features = ((u64)high << 32) | low;
701
702 if (!is_rd890_iommu(iommu->dev))
703 return;
704
705 /*
706 * Some rd890 systems may not be fully reconfigured by the BIOS, so
707 * it's necessary for us to store this information so it can be
708 * reprogrammed on resume
709 */
710
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
712 &iommu->stored_addr_lo);
713 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
714 &iommu->stored_addr_hi);
715
716 /* Low bit locks writes to configuration space */
717 iommu->stored_addr_lo &= ~1;
718
719 for (i = 0; i < 6; i++)
720 for (j = 0; j < 0x12; j++)
721 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
722
723 for (i = 0; i < 0x83; i++)
724 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
725}
726
727/*
728 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
729 * initializes the hardware and our data structures with it.
730 */
731static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
732 struct ivhd_header *h)
733{
734 u8 *p = (u8 *)h;
735 u8 *end = p, flags = 0;
736 u16 devid = 0, devid_start = 0, devid_to = 0;
737 u32 dev_i, ext_flags = 0;
738 bool alias = false;
739 struct ivhd_entry *e;
740
741 /*
742 * First save the recommended feature enable bits from ACPI
743 */
744 iommu->acpi_flags = h->flags;
745
746 /*
747 * Done. Now parse the device entries
748 */
749 p += sizeof(struct ivhd_header);
750 end += h->length;
751
752
753 while (p < end) {
754 e = (struct ivhd_entry *)p;
755 switch (e->type) {
756 case IVHD_DEV_ALL:
757
758 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
759 " last device %02x:%02x.%x flags: %02x\n",
760 PCI_BUS(iommu->first_device),
761 PCI_SLOT(iommu->first_device),
762 PCI_FUNC(iommu->first_device),
763 PCI_BUS(iommu->last_device),
764 PCI_SLOT(iommu->last_device),
765 PCI_FUNC(iommu->last_device),
766 e->flags);
767
768 for (dev_i = iommu->first_device;
769 dev_i <= iommu->last_device; ++dev_i)
770 set_dev_entry_from_acpi(iommu, dev_i,
771 e->flags, 0);
772 break;
773 case IVHD_DEV_SELECT:
774
775 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
776 "flags: %02x\n",
777 PCI_BUS(e->devid),
778 PCI_SLOT(e->devid),
779 PCI_FUNC(e->devid),
780 e->flags);
781
782 devid = e->devid;
783 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
784 break;
785 case IVHD_DEV_SELECT_RANGE_START:
786
787 DUMP_printk(" DEV_SELECT_RANGE_START\t "
788 "devid: %02x:%02x.%x flags: %02x\n",
789 PCI_BUS(e->devid),
790 PCI_SLOT(e->devid),
791 PCI_FUNC(e->devid),
792 e->flags);
793
794 devid_start = e->devid;
795 flags = e->flags;
796 ext_flags = 0;
797 alias = false;
798 break;
799 case IVHD_DEV_ALIAS:
800
801 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
802 "flags: %02x devid_to: %02x:%02x.%x\n",
803 PCI_BUS(e->devid),
804 PCI_SLOT(e->devid),
805 PCI_FUNC(e->devid),
806 e->flags,
807 PCI_BUS(e->ext >> 8),
808 PCI_SLOT(e->ext >> 8),
809 PCI_FUNC(e->ext >> 8));
810
811 devid = e->devid;
812 devid_to = e->ext >> 8;
813 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
814 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
815 amd_iommu_alias_table[devid] = devid_to;
816 break;
817 case IVHD_DEV_ALIAS_RANGE:
818
819 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
820 "devid: %02x:%02x.%x flags: %02x "
821 "devid_to: %02x:%02x.%x\n",
822 PCI_BUS(e->devid),
823 PCI_SLOT(e->devid),
824 PCI_FUNC(e->devid),
825 e->flags,
826 PCI_BUS(e->ext >> 8),
827 PCI_SLOT(e->ext >> 8),
828 PCI_FUNC(e->ext >> 8));
829
830 devid_start = e->devid;
831 flags = e->flags;
832 devid_to = e->ext >> 8;
833 ext_flags = 0;
834 alias = true;
835 break;
836 case IVHD_DEV_EXT_SELECT:
837
838 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
839 "flags: %02x ext: %08x\n",
840 PCI_BUS(e->devid),
841 PCI_SLOT(e->devid),
842 PCI_FUNC(e->devid),
843 e->flags, e->ext);
844
845 devid = e->devid;
846 set_dev_entry_from_acpi(iommu, devid, e->flags,
847 e->ext);
848 break;
849 case IVHD_DEV_EXT_SELECT_RANGE:
850
851 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
852 "%02x:%02x.%x flags: %02x ext: %08x\n",
853 PCI_BUS(e->devid),
854 PCI_SLOT(e->devid),
855 PCI_FUNC(e->devid),
856 e->flags, e->ext);
857
858 devid_start = e->devid;
859 flags = e->flags;
860 ext_flags = e->ext;
861 alias = false;
862 break;
863 case IVHD_DEV_RANGE_END:
864
865 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
866 PCI_BUS(e->devid),
867 PCI_SLOT(e->devid),
868 PCI_FUNC(e->devid));
869
870 devid = e->devid;
871 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
872 if (alias) {
873 amd_iommu_alias_table[dev_i] = devid_to;
874 set_dev_entry_from_acpi(iommu,
875 devid_to, flags, ext_flags);
876 }
877 set_dev_entry_from_acpi(iommu, dev_i,
878 flags, ext_flags);
879 }
880 break;
881 default:
882 break;
883 }
884
885 p += ivhd_entry_length(p);
886 }
887}
888
889/* Initializes the device->iommu mapping for the driver */
890static int __init init_iommu_devices(struct amd_iommu *iommu)
891{
892 u32 i;
893
894 for (i = iommu->first_device; i <= iommu->last_device; ++i)
895 set_iommu_for_device(iommu, i);
896
897 return 0;
898}
899
900static void __init free_iommu_one(struct amd_iommu *iommu)
901{
902 free_command_buffer(iommu);
903 free_event_buffer(iommu);
904 iommu_unmap_mmio_space(iommu);
905}
906
907static void __init free_iommu_all(void)
908{
909 struct amd_iommu *iommu, *next;
910
911 for_each_iommu_safe(iommu, next) {
912 list_del(&iommu->list);
913 free_iommu_one(iommu);
914 kfree(iommu);
915 }
916}
917
918/*
919 * This function glues the initialization function for one IOMMU
920 * together and also allocates the command buffer and programs the
921 * hardware. It does NOT enable the IOMMU. This is done afterwards.
922 */
923static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
924{
925 spin_lock_init(&iommu->lock);
926
927 /* Add IOMMU to internal data structures */
928 list_add_tail(&iommu->list, &amd_iommu_list);
929 iommu->index = amd_iommus_present++;
930
931 if (unlikely(iommu->index >= MAX_IOMMUS)) {
932 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
933 return -ENOSYS;
934 }
935
936 /* Index is fine - add IOMMU to the array */
937 amd_iommus[iommu->index] = iommu;
938
939 /*
940 * Copy data from ACPI table entry to the iommu struct
941 */
942 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
943 if (!iommu->dev)
944 return 1;
945
946 iommu->cap_ptr = h->cap_ptr;
947 iommu->pci_seg = h->pci_seg;
948 iommu->mmio_phys = h->mmio_phys;
949 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
950 if (!iommu->mmio_base)
951 return -ENOMEM;
952
953 iommu->cmd_buf = alloc_command_buffer(iommu);
954 if (!iommu->cmd_buf)
955 return -ENOMEM;
956
957 iommu->evt_buf = alloc_event_buffer(iommu);
958 if (!iommu->evt_buf)
959 return -ENOMEM;
960
961 iommu->int_enabled = false;
962
963 init_iommu_from_pci(iommu);
964 init_iommu_from_acpi(iommu, h);
965 init_iommu_devices(iommu);
966
967 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
968 amd_iommu_np_cache = true;
969
970 return pci_enable_device(iommu->dev);
971}
972
973/*
974 * Iterates over all IOMMU entries in the ACPI table, allocates the
975 * IOMMU structure and initializes it with init_iommu_one()
976 */
977static int __init init_iommu_all(struct acpi_table_header *table)
978{
979 u8 *p = (u8 *)table, *end = (u8 *)table;
980 struct ivhd_header *h;
981 struct amd_iommu *iommu;
982 int ret;
983
984 end += table->length;
985 p += IVRS_HEADER_LENGTH;
986
987 while (p < end) {
988 h = (struct ivhd_header *)p;
989 switch (*p) {
990 case ACPI_IVHD_TYPE:
991
992 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
993 "seg: %d flags: %01x info %04x\n",
994 PCI_BUS(h->devid), PCI_SLOT(h->devid),
995 PCI_FUNC(h->devid), h->cap_ptr,
996 h->pci_seg, h->flags, h->info);
997 DUMP_printk(" mmio-addr: %016llx\n",
998 h->mmio_phys);
999
1000 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
1001 if (iommu == NULL) {
1002 amd_iommu_init_err = -ENOMEM;
1003 return 0;
1004 }
1005
1006 ret = init_iommu_one(iommu, h);
1007 if (ret) {
1008 amd_iommu_init_err = ret;
1009 return 0;
1010 }
1011 break;
1012 default:
1013 break;
1014 }
1015 p += h->length;
1016
1017 }
1018 WARN_ON(p != end);
1019
1020 return 0;
1021}
1022
1023/****************************************************************************
1024 *
1025 * The following functions initialize the MSI interrupts for all IOMMUs
1026 * in the system. It's a bit challenging because there could be multiple
1027 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
1028 * pci_dev.
1029 *
1030 ****************************************************************************/
1031
1032static int iommu_setup_msi(struct amd_iommu *iommu)
1033{
1034 int r;
1035
1036 if (pci_enable_msi(iommu->dev))
1037 return 1;
1038
1039 r = request_threaded_irq(iommu->dev->irq,
1040 amd_iommu_int_handler,
1041 amd_iommu_int_thread,
1042 0, "AMD-Vi",
1043 iommu->dev);
1044
1045 if (r) {
1046 pci_disable_msi(iommu->dev);
1047 return 1;
1048 }
1049
1050 iommu->int_enabled = true;
1051 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
1052
1053 return 0;
1054}
1055
1056static int iommu_init_msi(struct amd_iommu *iommu)
1057{
1058 if (iommu->int_enabled)
1059 return 0;
1060
1061 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
1062 return iommu_setup_msi(iommu);
1063
1064 return 1;
1065}
1066
1067/****************************************************************************
1068 *
1069 * The next functions belong to the third pass of parsing the ACPI
1070 * table. In this last pass the memory mapping requirements are
1071 * gathered (like exclusion and unity mapping ranges).
1072 *
1073 ****************************************************************************/
1074
1075static void __init free_unity_maps(void)
1076{
1077 struct unity_map_entry *entry, *next;
1078
1079 list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
1080 list_del(&entry->list);
1081 kfree(entry);
1082 }
1083}
1084
1085/* called when we find an exclusion range definition in ACPI */
1086static int __init init_exclusion_range(struct ivmd_header *m)
1087{
1088 int i;
1089
1090 switch (m->type) {
1091 case ACPI_IVMD_TYPE:
1092 set_device_exclusion_range(m->devid, m);
1093 break;
1094 case ACPI_IVMD_TYPE_ALL:
1095 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1096 set_device_exclusion_range(i, m);
1097 break;
1098 case ACPI_IVMD_TYPE_RANGE:
1099 for (i = m->devid; i <= m->aux; ++i)
1100 set_device_exclusion_range(i, m);
1101 break;
1102 default:
1103 break;
1104 }
1105
1106 return 0;
1107}
1108
1109/* called for unity map ACPI definition */
1110static int __init init_unity_map_range(struct ivmd_header *m)
1111{
1112 struct unity_map_entry *e = NULL;
1113 char *s;
1114
1115 e = kzalloc(sizeof(*e), GFP_KERNEL);
1116 if (e == NULL)
1117 return -ENOMEM;
1118
1119 switch (m->type) {
1120 default:
1121 kfree(e);
1122 return 0;
1123 case ACPI_IVMD_TYPE:
1124 s = "IVMD_TYPE\t\t\t";
1125 e->devid_start = e->devid_end = m->devid;
1126 break;
1127 case ACPI_IVMD_TYPE_ALL:
1128 s = "IVMD_TYPE_ALL\t\t";
1129 e->devid_start = 0;
1130 e->devid_end = amd_iommu_last_bdf;
1131 break;
1132 case ACPI_IVMD_TYPE_RANGE:
1133 s = "IVMD_TYPE_RANGE\t\t";
1134 e->devid_start = m->devid;
1135 e->devid_end = m->aux;
1136 break;
1137 }
1138 e->address_start = PAGE_ALIGN(m->range_start);
1139 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
1140 e->prot = m->flags >> 1;
1141
1142 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
1143 " range_start: %016llx range_end: %016llx flags: %x\n", s,
1144 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
1145 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
1146 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
1147 e->address_start, e->address_end, m->flags);
1148
1149 list_add_tail(&e->list, &amd_iommu_unity_map);
1150
1151 return 0;
1152}
1153
1154/* iterates over all memory definitions we find in the ACPI table */
1155static int __init init_memory_definitions(struct acpi_table_header *table)
1156{
1157 u8 *p = (u8 *)table, *end = (u8 *)table;
1158 struct ivmd_header *m;
1159
1160 end += table->length;
1161 p += IVRS_HEADER_LENGTH;
1162
1163 while (p < end) {
1164 m = (struct ivmd_header *)p;
1165 if (m->flags & IVMD_FLAG_EXCL_RANGE)
1166 init_exclusion_range(m);
1167 else if (m->flags & IVMD_FLAG_UNITY_MAP)
1168 init_unity_map_range(m);
1169
1170 p += m->length;
1171 }
1172
1173 return 0;
1174}
1175
1176/*
1177 * Init the device table to not allow DMA access for devices and
1178 * suppress all page faults
1179 */
1180static void init_device_table(void)
1181{
1182 u32 devid;
1183
1184 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1185 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
1186 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
1187 }
1188}
1189
1190static void iommu_init_flags(struct amd_iommu *iommu)
1191{
1192 iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
1193 iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
1194 iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
1195
1196 iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
1197 iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
1198 iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
1199
1200 iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
1201 iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
1202 iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
1203
1204 iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
1205 iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
1206 iommu_feature_disable(iommu, CONTROL_ISOC_EN);
1207
1208 /*
1209 * make IOMMU memory accesses cache coherent
1210 */
1211 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1212}
1213
1214static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1215{
1216 int i, j;
1217 u32 ioc_feature_control;
1218 struct pci_dev *pdev = NULL;
1219
1220 /* RD890 BIOSes may not have completely reconfigured the iommu */
1221 if (!is_rd890_iommu(iommu->dev))
1222 return;
1223
1224 /*
1225 * First, we need to ensure that the iommu is enabled. This is
1226 * controlled by a register in the northbridge
1227 */
1228 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1229
1230 if (!pdev)
1231 return;
1232
1233 /* Select Northbridge indirect register 0x75 and enable writing */
1234 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1235 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1236
1237 /* Enable the iommu */
1238 if (!(ioc_feature_control & 0x1))
1239 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1240
1241 pci_dev_put(pdev);
1242
1243 /* Restore the iommu BAR */
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1245 iommu->stored_addr_lo);
1246 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1247 iommu->stored_addr_hi);
1248
1249 /* Restore the l1 indirect regs for each of the 6 l1s */
1250 for (i = 0; i < 6; i++)
1251 for (j = 0; j < 0x12; j++)
1252 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1253
1254 /* Restore the l2 indirect regs */
1255 for (i = 0; i < 0x83; i++)
1256 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1257
1258 /* Lock PCI setup registers */
1259 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1260 iommu->stored_addr_lo | 1);
1261}
1262
1263/*
1264 * This function finally enables all IOMMUs found in the system after
1265 * they have been initialized
1266 */
1267static void enable_iommus(void)
1268{
1269 struct amd_iommu *iommu;
1270
1271 for_each_iommu(iommu) {
1272 iommu_disable(iommu);
1273 iommu_init_flags(iommu);
1274 iommu_set_device_table(iommu);
1275 iommu_enable_command_buffer(iommu);
1276 iommu_enable_event_buffer(iommu);
1277 iommu_set_exclusion_range(iommu);
1278 iommu_init_msi(iommu);
1279 iommu_enable(iommu);
1280 iommu_flush_all_caches(iommu);
1281 }
1282}
1283
1284static void disable_iommus(void)
1285{
1286 struct amd_iommu *iommu;
1287
1288 for_each_iommu(iommu)
1289 iommu_disable(iommu);
1290}
1291
1292/*
1293 * Suspend/Resume support
1294 * disable suspend until real resume implemented
1295 */
1296
1297static void amd_iommu_resume(void)
1298{
1299 struct amd_iommu *iommu;
1300
1301 for_each_iommu(iommu)
1302 iommu_apply_resume_quirks(iommu);
1303
1304 /* re-load the hardware */
1305 enable_iommus();
1306
1307 /*
1308 * we have to flush after the IOMMUs are enabled because a
1309 * disabled IOMMU will never execute the commands we send
1310 */
1311 for_each_iommu(iommu)
1312 iommu_flush_all_caches(iommu);
1313}
1314
1315static int amd_iommu_suspend(void)
1316{
1317 /* disable IOMMUs to go out of the way for BIOS */
1318 disable_iommus();
1319
1320 return 0;
1321}
1322
1323static struct syscore_ops amd_iommu_syscore_ops = {
1324 .suspend = amd_iommu_suspend,
1325 .resume = amd_iommu_resume,
1326};
1327
1328/*
1329 * This is the core init function for AMD IOMMU hardware in the system.
1330 * This function is called from the generic x86 DMA layer initialization
1331 * code.
1332 *
1333 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1334 * three times:
1335 *
1336 * 1 pass) Find the highest PCI device id the driver has to handle.
1337 * Based on this information the size of the data structures that
1338 * need to be allocated is determined.
1339 *
1340 * 2 pass) Initialize the data structures just allocated with the
1341 * information in the ACPI table about available AMD IOMMUs
1342 * in the system. It also maps the PCI devices in the
1343 * system to specific IOMMUs
1344 *
1345 * 3 pass) After the basic data structures are allocated and
1346 * initialized we update them with information about memory
1347 * remapping requirements parsed out of the ACPI table in
1348 * this last pass.
1349 *
1350 * After that the hardware is initialized and ready to go. In the last
1351 * step we do some Linux specific things like registering the driver in
1352 * the dma_ops interface and initializing the suspend/resume support
1353 * functions. Finally it prints some information about the AMD IOMMUs and
1354 * the driver state, and enables the hardware.
1355 */
1356static int __init amd_iommu_init(void)
1357{
1358 int i, ret = 0;
1359
1360 /*
1361 * First parse ACPI tables to find the largest Bus/Dev/Func
1362	 * we need to handle. Based on this information the shared data
1363 * structures for the IOMMUs in the system will be allocated
1364 */
1365 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1366 return -ENODEV;
1367
1368 ret = amd_iommu_init_err;
1369 if (ret)
1370 goto out;
1371
1372 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1373 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1374 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
1375
1376 ret = -ENOMEM;
1377
1378 /* Device table - directly used by all IOMMUs */
1379 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1380 get_order(dev_table_size));
1381 if (amd_iommu_dev_table == NULL)
1382 goto out;
1383
1384 /*
1385 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
1386	 * IOMMU sees for that device
1387 */
1388 amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
1389 get_order(alias_table_size));
1390 if (amd_iommu_alias_table == NULL)
1391 goto free;
1392
1393 /* IOMMU rlookup table - find the IOMMU for a specific device */
1394 amd_iommu_rlookup_table = (void *)__get_free_pages(
1395 GFP_KERNEL | __GFP_ZERO,
1396 get_order(rlookup_table_size));
1397 if (amd_iommu_rlookup_table == NULL)
1398 goto free;
1399
1400 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1401 GFP_KERNEL | __GFP_ZERO,
1402 get_order(MAX_DOMAIN_ID/8));
1403 if (amd_iommu_pd_alloc_bitmap == NULL)
1404 goto free;
1405
1406 /* init the device table */
1407 init_device_table();
1408
1409 /*
1410	 * let all alias entries point to themselves
1411 */
1412 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1413 amd_iommu_alias_table[i] = i;
1414
1415 /*
1416	 * never allocate domain 0 because it's used as the non-allocated and
1417 * error value placeholder
1418 */
1419 amd_iommu_pd_alloc_bitmap[0] = 1;
1420
1421 spin_lock_init(&amd_iommu_pd_lock);
1422
1423 /*
1424	 * now that the data structures are allocated and basically initialized,
1425	 * start the real ACPI table scan
1426 */
1427 ret = -ENODEV;
1428 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1429 goto free;
1430
1431 if (amd_iommu_init_err) {
1432 ret = amd_iommu_init_err;
1433 goto free;
1434 }
1435
1436 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1437 goto free;
1438
1439 if (amd_iommu_init_err) {
1440 ret = amd_iommu_init_err;
1441 goto free;
1442 }
1443
1444 ret = amd_iommu_init_devices();
1445 if (ret)
1446 goto free;
1447
1448 enable_iommus();
1449
1450 if (iommu_pass_through)
1451 ret = amd_iommu_init_passthrough();
1452 else
1453 ret = amd_iommu_init_dma_ops();
1454
1455 if (ret)
1456 goto free_disable;
1457
1458 amd_iommu_init_api();
1459
1460 amd_iommu_init_notifier();
1461
1462 register_syscore_ops(&amd_iommu_syscore_ops);
1463
1464 if (iommu_pass_through)
1465 goto out;
1466
1467 if (amd_iommu_unmap_flush)
1468 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1469 else
1470 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1471
1472 x86_platform.iommu_shutdown = disable_iommus;
1473out:
1474 return ret;
1475
1476free_disable:
1477 disable_iommus();
1478
1479free:
1480 amd_iommu_uninit_devices();
1481
1482 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1483 get_order(MAX_DOMAIN_ID/8));
1484
1485 free_pages((unsigned long)amd_iommu_rlookup_table,
1486 get_order(rlookup_table_size));
1487
1488 free_pages((unsigned long)amd_iommu_alias_table,
1489 get_order(alias_table_size));
1490
1491 free_pages((unsigned long)amd_iommu_dev_table,
1492 get_order(dev_table_size));
1493
1494 free_iommu_all();
1495
1496 free_unity_maps();
1497
1498#ifdef CONFIG_GART_IOMMU
1499 /*
1500 * We failed to initialize the AMD IOMMU - try fallback to GART
1501 * if possible.
1502 */
1503 gart_iommu_init();
1504
1505#endif
1506
1507 goto out;
1508}
1509
1510/****************************************************************************
1511 *
1512 * Early detect code. This code runs at IOMMU detection time in the DMA
1513 * layer. It just checks whether there is an IVRS ACPI table to detect AMD
1514 * IOMMUs
1515 *
1516 ****************************************************************************/
1517static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1518{
1519 return 0;
1520}
1521
1522int __init amd_iommu_detect(void)
1523{
1524 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1525 return -ENODEV;
1526
1527 if (amd_iommu_disabled)
1528 return -ENODEV;
1529
1530 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1531 iommu_detected = 1;
1532 amd_iommu_detected = 1;
1533 x86_init.iommu.iommu_init = amd_iommu_init;
1534
1535 /* Make sure ACS will be enabled */
1536 pci_request_acs();
1537 return 1;
1538 }
1539 return -ENODEV;
1540}
1541
1542/****************************************************************************
1543 *
1544 * Parsing functions for the AMD IOMMU specific kernel command line
1545 * options.
1546 *
1547 ****************************************************************************/
1548
1549static int __init parse_amd_iommu_dump(char *str)
1550{
1551 amd_iommu_dump = true;
1552
1553 return 1;
1554}
1555
1556static int __init parse_amd_iommu_options(char *str)
1557{
1558 for (; *str; ++str) {
1559 if (strncmp(str, "fullflush", 9) == 0)
1560 amd_iommu_unmap_flush = true;
1561 if (strncmp(str, "off", 3) == 0)
1562 amd_iommu_disabled = true;
1563 }
1564
1565 return 1;
1566}
1567
1568__setup("amd_iommu_dump", parse_amd_iommu_dump);
1569__setup("amd_iommu=", parse_amd_iommu_options);
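/*
 * Illustrative usage only, sketched from the string matching in the two
 * parsers above (not an authoritative list of options): booting with
 *
 *   amd_iommu_dump       sets amd_iommu_dump so DUMP_printk() emits output
 *   amd_iommu=fullflush  sets amd_iommu_unmap_flush (IO/TLB flush on unmap)
 *   amd_iommu=off        sets amd_iommu_disabled, so amd_iommu_detect()
 *                        returns -ENODEV
 */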
1570
1571IOMMU_INIT_FINISH(amd_iommu_detect,
1572 gart_iommu_hole_init,
1573 0,
1574 0);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
new file mode 100644
index 000000000000..7ffaa64410b0
--- /dev/null
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H
21
22#include "amd_iommu_types.h"
23
24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
27extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
28extern void amd_iommu_apply_erratum_63(u16 devid);
29extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
30extern int amd_iommu_init_devices(void);
31extern void amd_iommu_uninit_devices(void);
32extern void amd_iommu_init_notifier(void);
33extern void amd_iommu_init_api(void);
34#ifndef CONFIG_AMD_IOMMU_STATS
35
36static inline void amd_iommu_stats_init(void) { }
37
38#endif /* !CONFIG_AMD_IOMMU_STATS */
39
40static inline bool is_rd890_iommu(struct pci_dev *pdev)
41{
42 return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
43 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
44}
45
46static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
47{
48 if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
49 return false;
50
51 return !!(iommu->features & f);
52}
53
54#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
new file mode 100644
index 000000000000..5b9c5075e81a
--- /dev/null
+++ b/drivers/iommu/amd_iommu_types.h
@@ -0,0 +1,585 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
21#define _ASM_X86_AMD_IOMMU_TYPES_H
22
23#include <linux/types.h>
24#include <linux/mutex.h>
25#include <linux/list.h>
26#include <linux/spinlock.h>
27
28/*
29 * Maximum number of IOMMUs supported
30 */
31#define MAX_IOMMUS 32
32
33/*
34 * some size calculation constants
35 */
36#define DEV_TABLE_ENTRY_SIZE 32
37#define ALIAS_TABLE_ENTRY_SIZE 2
38#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
39
40/* Length of the MMIO region for the AMD IOMMU */
41#define MMIO_REGION_LENGTH 0x4000
42
43/* Capability offsets used by the driver */
44#define MMIO_CAP_HDR_OFFSET 0x00
45#define MMIO_RANGE_OFFSET 0x0c
46#define MMIO_MISC_OFFSET 0x10
47
48/* Masks, shifts and macros to parse the device range capability */
49#define MMIO_RANGE_LD_MASK 0xff000000
50#define MMIO_RANGE_FD_MASK 0x00ff0000
51#define MMIO_RANGE_BUS_MASK 0x0000ff00
52#define MMIO_RANGE_LD_SHIFT 24
53#define MMIO_RANGE_FD_SHIFT 16
54#define MMIO_RANGE_BUS_SHIFT 8
55#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
56#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
57#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
58#define MMIO_MSI_NUM(x) ((x) & 0x1f)
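/*
 * Worked example (illustrative only): for a range capability value of
 * 0x12340500, MMIO_GET_BUS() yields 0x05, MMIO_GET_FD() yields 0x34 and
 * MMIO_GET_LD() yields 0x12, i.e. the bus plus the first and last device
 * handled by that IOMMU (see first_device/last_device in struct amd_iommu
 * below).
 */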
59
60/* Flag masks for the AMD IOMMU exclusion range */
61#define MMIO_EXCL_ENABLE_MASK 0x01ULL
62#define MMIO_EXCL_ALLOW_MASK 0x02ULL
63
64/* Used offsets into the MMIO space */
65#define MMIO_DEV_TABLE_OFFSET 0x0000
66#define MMIO_CMD_BUF_OFFSET 0x0008
67#define MMIO_EVT_BUF_OFFSET 0x0010
68#define MMIO_CONTROL_OFFSET 0x0018
69#define MMIO_EXCL_BASE_OFFSET 0x0020
70#define MMIO_EXCL_LIMIT_OFFSET 0x0028
71#define MMIO_EXT_FEATURES 0x0030
72#define MMIO_CMD_HEAD_OFFSET 0x2000
73#define MMIO_CMD_TAIL_OFFSET 0x2008
74#define MMIO_EVT_HEAD_OFFSET 0x2010
75#define MMIO_EVT_TAIL_OFFSET 0x2018
76#define MMIO_STATUS_OFFSET 0x2020
77
78
79/* Extended Feature Bits */
80#define FEATURE_PREFETCH (1ULL<<0)
81#define FEATURE_PPR (1ULL<<1)
82#define FEATURE_X2APIC (1ULL<<2)
83#define FEATURE_NX (1ULL<<3)
84#define FEATURE_GT (1ULL<<4)
85#define FEATURE_IA (1ULL<<6)
86#define FEATURE_GA (1ULL<<7)
87#define FEATURE_HE (1ULL<<8)
88#define FEATURE_PC (1ULL<<9)
89
90/* MMIO status bits */
91#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
92
93/* event logging constants */
94#define EVENT_ENTRY_SIZE 0x10
95#define EVENT_TYPE_SHIFT 28
96#define EVENT_TYPE_MASK 0xf
97#define EVENT_TYPE_ILL_DEV 0x1
98#define EVENT_TYPE_IO_FAULT 0x2
99#define EVENT_TYPE_DEV_TAB_ERR 0x3
100#define EVENT_TYPE_PAGE_TAB_ERR 0x4
101#define EVENT_TYPE_ILL_CMD 0x5
102#define EVENT_TYPE_CMD_HARD_ERR 0x6
103#define EVENT_TYPE_IOTLB_INV_TO 0x7
104#define EVENT_TYPE_INV_DEV_REQ 0x8
105#define EVENT_DEVID_MASK 0xffff
106#define EVENT_DEVID_SHIFT 0
107#define EVENT_DOMID_MASK 0xffff
108#define EVENT_DOMID_SHIFT 0
109#define EVENT_FLAGS_MASK 0xfff
110#define EVENT_FLAGS_SHIFT 0x10
111
112/* feature control bits */
113#define CONTROL_IOMMU_EN 0x00ULL
114#define CONTROL_HT_TUN_EN 0x01ULL
115#define CONTROL_EVT_LOG_EN 0x02ULL
116#define CONTROL_EVT_INT_EN 0x03ULL
117#define CONTROL_COMWAIT_EN 0x04ULL
118#define CONTROL_PASSPW_EN 0x08ULL
119#define CONTROL_RESPASSPW_EN 0x09ULL
120#define CONTROL_COHERENT_EN 0x0aULL
121#define CONTROL_ISOC_EN 0x0bULL
122#define CONTROL_CMDBUF_EN 0x0cULL
123#define CONTROL_PPFLOG_EN 0x0dULL
124#define CONTROL_PPFINT_EN 0x0eULL
125
126/* command specific defines */
127#define CMD_COMPL_WAIT 0x01
128#define CMD_INV_DEV_ENTRY 0x02
129#define CMD_INV_IOMMU_PAGES 0x03
130#define CMD_INV_IOTLB_PAGES 0x04
131#define CMD_INV_ALL 0x08
132
133#define CMD_COMPL_WAIT_STORE_MASK 0x01
134#define CMD_COMPL_WAIT_INT_MASK 0x02
135#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
136#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
137
138#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
139
140/* macros and definitions for device table entries */
141#define DEV_ENTRY_VALID 0x00
142#define DEV_ENTRY_TRANSLATION 0x01
143#define DEV_ENTRY_IR 0x3d
144#define DEV_ENTRY_IW 0x3e
145#define DEV_ENTRY_NO_PAGE_FAULT 0x62
146#define DEV_ENTRY_EX 0x67
147#define DEV_ENTRY_SYSMGT1 0x68
148#define DEV_ENTRY_SYSMGT2 0x69
149#define DEV_ENTRY_INIT_PASS 0xb8
150#define DEV_ENTRY_EINT_PASS 0xb9
151#define DEV_ENTRY_NMI_PASS 0xba
152#define DEV_ENTRY_LINT0_PASS 0xbe
153#define DEV_ENTRY_LINT1_PASS 0xbf
154#define DEV_ENTRY_MODE_MASK 0x07
155#define DEV_ENTRY_MODE_SHIFT 0x09
156
157/* constants to configure the command buffer */
158#define CMD_BUFFER_SIZE 8192
159#define CMD_BUFFER_UNINITIALIZED 1
160#define CMD_BUFFER_ENTRIES 512
161#define MMIO_CMD_SIZE_SHIFT 56
162#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
163
164/* constants for event buffer handling */
165#define EVT_BUFFER_SIZE 8192 /* 512 entries */
166#define EVT_LEN_MASK (0x9ULL << 56)
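/*
 * Size arithmetic (illustrative only): 8192 / 512 = 16 bytes per command
 * buffer entry, and the event buffer likewise holds 8192 / 0x10 = 512
 * entries; the 0x9 in the two length encodings above presumably stands
 * for 2^9 = 512 entries.
 */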
167
168#define PAGE_MODE_NONE 0x00
169#define PAGE_MODE_1_LEVEL 0x01
170#define PAGE_MODE_2_LEVEL 0x02
171#define PAGE_MODE_3_LEVEL 0x03
172#define PAGE_MODE_4_LEVEL 0x04
173#define PAGE_MODE_5_LEVEL 0x05
174#define PAGE_MODE_6_LEVEL 0x06
175
176#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
177#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
178 ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
179 (0xffffffffffffffffULL))
180#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
181#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
182#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
183 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
184#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
185
186#define PM_MAP_4k 0
187#define PM_ADDR_MASK 0x000ffffffffff000ULL
188#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
189 (~((1ULL << (12 + ((lvl) * 9))) - 1)))
190#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
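/*
 * Worked example (illustrative only): PM_LEVEL_SHIFT(3) = 12 + 3*9 = 39,
 * so PM_LEVEL_SIZE(3) = (1ULL << 39) - 1 and a paging mode of three
 * levels covers a 512GB IO virtual address space; PM_LEVEL_INDEX() then
 * extracts nine address bits per level as the page table index.
 */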
191
192/*
193 * Returns the page table level to use for a given page size
194 * Pagesize is expected to be a power-of-two
195 */
196#define PAGE_SIZE_LEVEL(pagesize) \
197 ((__ffs(pagesize) - 12) / 9)
198/*
199 * Returns the number of ptes to use for a given page size
200 * Pagesize is expected to be a power-of-two
201 */
202#define PAGE_SIZE_PTE_COUNT(pagesize) \
203 (1ULL << ((__ffs(pagesize) - 12) % 9))
204
205/*
206 * Aligns a given io-virtual address to a given page size
207 * Pagesize is expected to be a power-of-two
208 */
209#define PAGE_SIZE_ALIGN(address, pagesize) \
210 ((address) & ~((pagesize) - 1))
211/*
212 * Creates an IOMMU PTE for an address and a given pagesize
213 * The PTE has no permission bits set
214 * Pagesize is expected to be a power-of-two larger than 4096
215 */
216#define PAGE_SIZE_PTE(address, pagesize) \
217 (((address) | ((pagesize) - 1)) & \
218 (~(pagesize >> 1)) & PM_ADDR_MASK)
219
220/*
221 * Takes a PTE value with mode=0x07 and returns the page size it maps
222 */
223#define PTE_PAGE_SIZE(pte) \
224 (1ULL << (1 + ffz(((pte) | 0xfffULL))))
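/*
 * Worked example for the macros above (illustrative only), using a 32K
 * (0x8000) page on a 32K-aligned address (__ffs(0x8000) = 15):
 *
 *   PAGE_SIZE_LEVEL(0x8000)     = (15 - 12) / 9        = 0
 *   PAGE_SIZE_PTE_COUNT(0x8000) = 1 << ((15 - 12) % 9) = 8 PTEs
 *   PAGE_SIZE_PTE(addr, 0x8000) sets bits 12 and 13 but clears bit 14,
 *                               encoding the page size in the low bits
 *   PTE_PAGE_SIZE(pte)          = 1 << (1 + 14)        = 0x8000 again
 */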
225
226#define IOMMU_PTE_P (1ULL << 0)
227#define IOMMU_PTE_TV (1ULL << 1)
228#define IOMMU_PTE_U (1ULL << 59)
229#define IOMMU_PTE_FC (1ULL << 60)
230#define IOMMU_PTE_IR (1ULL << 61)
231#define IOMMU_PTE_IW (1ULL << 62)
232
233#define DTE_FLAG_IOTLB 0x01
234
235#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
236#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
237#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
238#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
239
240#define IOMMU_PROT_MASK 0x03
241#define IOMMU_PROT_IR 0x01
242#define IOMMU_PROT_IW 0x02
243
244/* IOMMU capabilities */
245#define IOMMU_CAP_IOTLB 24
246#define IOMMU_CAP_NPCACHE 26
247#define IOMMU_CAP_EFR 27
248
249#define MAX_DOMAIN_ID 65536
250
251/* FIXME: move this macro to <linux/pci.h> */
252#define PCI_BUS(x) (((x) >> 8) & 0xff)
253
254/* Protection domain flags */
255#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
256#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
257 domain for an IOMMU */
258#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
259 translation */
260
261extern bool amd_iommu_dump;
262#define DUMP_printk(format, arg...) \
263 do { \
264 if (amd_iommu_dump) \
265 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
266	} while (0)
267
268/* global flag if IOMMUs cache non-present entries */
269extern bool amd_iommu_np_cache;
270/* Only true if all IOMMUs support device IOTLBs */
271extern bool amd_iommu_iotlb_sup;
272
273/*
274 * Make iterating over all IOMMUs easier
275 */
276#define for_each_iommu(iommu) \
277 list_for_each_entry((iommu), &amd_iommu_list, list)
278#define for_each_iommu_safe(iommu, next) \
279 list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
280
281#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
282#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
283#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
284#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
285#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
286#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
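/*
 * Worked example (illustrative only): one aperture range covers
 * 1 << 27 = 128MB. With 64 leaf pte_pages per range (see struct
 * aperture_range below) and 512 4K PTEs per leaf page, 64 * 512 * 4K
 * = 128MB, so APERTURE_PAGE_INDEX() (address bits 21-26) selects the
 * leaf page within a range.
 */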
287
288/*
289 * This structure contains generic data for IOMMU protection domains
290 * independent of their use.
291 */
292struct protection_domain {
293 struct list_head list; /* for list of all protection domains */
294 struct list_head dev_list; /* List of all devices in this domain */
295 spinlock_t lock; /* mostly used to lock the page table*/
296 struct mutex api_lock; /* protect page tables in the iommu-api path */
297 u16 id; /* the domain id written to the device table */
298 int mode; /* paging mode (0-6 levels) */
299 u64 *pt_root; /* page table root pointer */
300 unsigned long flags; /* flags to find out type of domain */
301 bool updated; /* complete domain flush required */
302 unsigned dev_cnt; /* devices assigned to this domain */
303 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
304 void *priv; /* private data */
305
306};
307
308/*
309 * This struct contains device specific data for the IOMMU
310 */
311struct iommu_dev_data {
312 struct list_head list; /* For domain->dev_list */
313 struct list_head dev_data_list; /* For global dev_data_list */
314 struct iommu_dev_data *alias_data;/* The alias dev_data */
315 struct protection_domain *domain; /* Domain the device is bound to */
316	atomic_t bind;			  /* Domain attach reference count */
317 u16 devid; /* PCI Device ID */
318 struct {
319 bool enabled;
320 int qdep;
321 } ats; /* ATS state */
322};
323
324/*
325 * For dynamic growth the aperture size is split into ranges of 128MB of
326 * DMA address space each. This struct represents one such range.
327 */
328struct aperture_range {
329
330 /* address allocation bitmap */
331 unsigned long *bitmap;
332
333 /*
334 * Array of PTE pages for the aperture. In this array we save all the
335 * leaf pages of the domain page table used for the aperture. This way
336 * we don't need to walk the page table to find a specific PTE. We can
337 * just calculate its address in constant time.
338 */
339 u64 *pte_pages[64];
340
341 unsigned long offset;
342};
343
344/*
345 * Data container for a dma_ops specific protection domain
346 */
347struct dma_ops_domain {
348 struct list_head list;
349
350 /* generic protection domain information */
351 struct protection_domain domain;
352
353 /* size of the aperture for the mappings */
354 unsigned long aperture_size;
355
356 /* address we start to search for free addresses */
357 unsigned long next_address;
358
359 /* address space relevant data */
360 struct aperture_range *aperture[APERTURE_MAX_RANGES];
361
362 /* This will be set to true when TLB needs to be flushed */
363 bool need_flush;
364
365 /*
366 * if this is a preallocated domain, keep the device for which it was
367 * preallocated in this variable
368 */
369 u16 target_dev;
370};
371
372/*
373 * Structure where we save information about one hardware AMD IOMMU in the
374 * system.
375 */
376struct amd_iommu {
377 struct list_head list;
378
379 /* Index within the IOMMU array */
380 int index;
381
382 /* locks the accesses to the hardware */
383 spinlock_t lock;
384
385 /* Pointer to PCI device of this IOMMU */
386 struct pci_dev *dev;
387
388 /* physical address of MMIO space */
389 u64 mmio_phys;
390 /* virtual address of MMIO space */
391 u8 *mmio_base;
392
393 /* capabilities of that IOMMU read from ACPI */
394 u32 cap;
395
396 /* flags read from acpi table */
397 u8 acpi_flags;
398
399 /* Extended features */
400 u64 features;
401
402 /*
403 * Capability pointer. There could be more than one IOMMU per PCI
404	 * device function if there is more than one AMD IOMMU capability
405	 * pointer.
406 */
407 u16 cap_ptr;
408
409 /* pci domain of this IOMMU */
410 u16 pci_seg;
411
412 /* first device this IOMMU handles. read from PCI */
413 u16 first_device;
414 /* last device this IOMMU handles. read from PCI */
415 u16 last_device;
416
417 /* start of exclusion range of that IOMMU */
418 u64 exclusion_start;
419 /* length of exclusion range of that IOMMU */
420 u64 exclusion_length;
421
422 /* command buffer virtual address */
423 u8 *cmd_buf;
424 /* size of command buffer */
425 u32 cmd_buf_size;
426
427 /* size of event buffer */
428 u32 evt_buf_size;
429 /* event buffer virtual address */
430 u8 *evt_buf;
431 /* MSI number for event interrupt */
432 u16 evt_msi_num;
433
434 /* true if interrupts for this IOMMU are already enabled */
435 bool int_enabled;
436
437	/* if set, we need to send a completion wait command */
438 bool need_sync;
439
440 /* default dma_ops domain for that IOMMU */
441 struct dma_ops_domain *default_dom;
442
443 /*
444 * We can't rely on the BIOS to restore all values on reinit, so we
445 * need to stash them
446 */
447
448 /* The iommu BAR */
449 u32 stored_addr_lo;
450 u32 stored_addr_hi;
451
452 /*
453 * Each iommu has 6 l1s, each of which is documented as having 0x12
454 * registers
455 */
456 u32 stored_l1[6][0x12];
457
458 /* The l2 indirect registers */
459 u32 stored_l2[0x83];
460};
461
462/*
463 * List with all IOMMUs in the system. This list is not locked because it is
464 * only written and read at driver initialization or suspend time
465 */
466extern struct list_head amd_iommu_list;
467
468/*
469 * Array with pointers to each IOMMU struct
470 * The indices are referenced in the protection domains
471 */
472extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
473
474/* Number of IOMMUs present in the system */
475extern int amd_iommus_present;
476
477/*
478 * Declarations for the global list of all protection domains
479 */
480extern spinlock_t amd_iommu_pd_lock;
481extern struct list_head amd_iommu_pd_list;
482
483/*
484 * Structure defining one entry in the device table
485 */
486struct dev_table_entry {
487 u32 data[8];
488};
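/*
 * Size check (illustrative only): 8 * sizeof(u32) = 32 bytes per entry,
 * matching DEV_TABLE_ENTRY_SIZE above, so the device table needs 32
 * bytes for every device id up to amd_iommu_last_bdf.
 */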
489
490/*
491 * One entry for unity mappings parsed out of the ACPI table.
492 */
493struct unity_map_entry {
494 struct list_head list;
495
496	/* starting device id this entry is used for (inclusive) */
497	u16 devid_start;
498	/* end device id this entry is used for (inclusive) */
499	u16 devid_end;
500
501	/* start address to unity map (inclusive) */
502	u64 address_start;
503	/* end address to unity map (inclusive) */
504 u64 address_end;
505
506 /* required protection */
507 int prot;
508};
509
510/*
511 * List of all unity mappings. It is not locked because at runtime it is only
512 * read. It is created at ACPI table parsing time.
513 */
514extern struct list_head amd_iommu_unity_map;
515
516/*
517 * Data structures for device handling
518 */
519
520/*
521 * Device table used by hardware. Read and write accesses by software are
522 * locked with the amd_iommu_pd_table lock.
523 */
524extern struct dev_table_entry *amd_iommu_dev_table;
525
526/*
527 * Alias table to map requestor ids to device ids. Not locked because it is
528 * only read at runtime.
529 */
530extern u16 *amd_iommu_alias_table;
531
532/*
533 * Reverse lookup table to find the IOMMU which translates a specific device.
534 */
535extern struct amd_iommu **amd_iommu_rlookup_table;
536
537/* size of the dma_ops aperture as power of 2 */
538extern unsigned amd_iommu_aperture_order;
539
540/* largest PCI device id we expect translation requests for */
541extern u16 amd_iommu_last_bdf;
542
543/* allocation bitmap for domain ids */
544extern unsigned long *amd_iommu_pd_alloc_bitmap;
545
546/*
547 * If true, the addresses will be flushed at unmap time, not when
548 * they are reused
549 */
550extern bool amd_iommu_unmap_flush;
551
552/* takes bus and device/function and returns the device id
553 * FIXME: should that be in generic PCI code? */
554static inline u16 calc_devid(u8 bus, u8 devfn)
555{
556 return (((u16)bus) << 8) | devfn;
557}
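/*
 * Worked example (illustrative only): for bus 0x01, device 0x02,
 * function 0x3 the devfn is (0x02 << 3) | 0x3 = 0x13, so
 * calc_devid(0x01, 0x13) = 0x0113 and PCI_BUS(0x0113) recovers bus 0x01.
 */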
558
559#ifdef CONFIG_AMD_IOMMU_STATS
560
561struct __iommu_counter {
562 char *name;
563 struct dentry *dent;
564 u64 value;
565};
566
567#define DECLARE_STATS_COUNTER(nm) \
568 static struct __iommu_counter nm = { \
569 .name = #nm, \
570 }
571
572#define INC_STATS_COUNTER(name) name.value += 1
573#define ADD_STATS_COUNTER(name, x) name.value += (x)
574#define SUB_STATS_COUNTER(name, x) name.value -= (x)
575
576#else /* CONFIG_AMD_IOMMU_STATS */
577
578#define DECLARE_STATS_COUNTER(name)
579#define INC_STATS_COUNTER(name)
580#define ADD_STATS_COUNTER(name, x)
581#define SUB_STATS_COUNTER(name, x)
582
583#endif /* CONFIG_AMD_IOMMU_STATS */
584
585#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
new file mode 100644
index 000000000000..3dc9befa5aec
--- /dev/null
+++ b/drivers/iommu/dmar.c
@@ -0,0 +1,1461 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 *
22 * This file implements early detection/parsing of Remapping Devices
23 * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
24 * tables.
25 *
26 * These routines are used by both DMA-remapping and Interrupt-remapping
27 */
28
29#include <linux/pci.h>
30#include <linux/dmar.h>
31#include <linux/iova.h>
32#include <linux/intel-iommu.h>
33#include <linux/timer.h>
34#include <linux/irq.h>
35#include <linux/interrupt.h>
36#include <linux/tboot.h>
37#include <linux/dmi.h>
38#include <linux/slab.h>
39#include <asm/iommu_table.h>
40
41#define PREFIX "DMAR: "
42
43/* No locks are needed as DMA remapping hardware unit
44 * list is constructed at boot time and hotplug of
45 * these units are not supported by the architecture.
46 */
47LIST_HEAD(dmar_drhd_units);
48
49static struct acpi_table_header * __initdata dmar_tbl;
50static acpi_size dmar_tbl_size;
51
52static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
53{
54 /*
55	 * add INCLUDE_ALL at the tail, so scanning the list will find it at
56 * the very end.
57 */
58 if (drhd->include_all)
59 list_add_tail(&drhd->list, &dmar_drhd_units);
60 else
61 list_add(&drhd->list, &dmar_drhd_units);
62}
63
64static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope,
65 struct pci_dev **dev, u16 segment)
66{
67 struct pci_bus *bus;
68 struct pci_dev *pdev = NULL;
69 struct acpi_dmar_pci_path *path;
70 int count;
71
72 bus = pci_find_bus(segment, scope->bus);
73 path = (struct acpi_dmar_pci_path *)(scope + 1);
74 count = (scope->length - sizeof(struct acpi_dmar_device_scope))
75 / sizeof(struct acpi_dmar_pci_path);
76
77 while (count) {
78 if (pdev)
79 pci_dev_put(pdev);
80 /*
81		 * Some BIOSes list non-existent devices in the DMAR table, just
82		 * ignore them
83 */
84 if (!bus) {
85 printk(KERN_WARNING
86 PREFIX "Device scope bus [%d] not found\n",
87 scope->bus);
88 break;
89 }
90 pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn));
91 if (!pdev) {
92 printk(KERN_WARNING PREFIX
93 "Device scope device [%04x:%02x:%02x.%02x] not found\n",
94 segment, bus->number, path->dev, path->fn);
95 break;
96 }
97 path ++;
98 count --;
99 bus = pdev->subordinate;
100 }
101 if (!pdev) {
102 printk(KERN_WARNING PREFIX
103 "Device scope device [%04x:%02x:%02x.%02x] not found\n",
104 segment, scope->bus, path->dev, path->fn);
105 *dev = NULL;
106 return 0;
107 }
108 if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && \
109 pdev->subordinate) || (scope->entry_type == \
110 ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) {
111 pci_dev_put(pdev);
112 printk(KERN_WARNING PREFIX
113 "Device scope type does not match for %s\n",
114 pci_name(pdev));
115 return -EINVAL;
116 }
117 *dev = pdev;
118 return 0;
119}
120
121static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt,
122 struct pci_dev ***devices, u16 segment)
123{
124 struct acpi_dmar_device_scope *scope;
125 void * tmp = start;
126 int index;
127 int ret;
128
129 *cnt = 0;
130 while (start < end) {
131 scope = start;
132 if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
133 scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
134 (*cnt)++;
135 else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
136 printk(KERN_WARNING PREFIX
137 "Unsupported device scope\n");
138 }
139 start += scope->length;
140 }
141 if (*cnt == 0)
142 return 0;
143
144 *devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL);
145 if (!*devices)
146 return -ENOMEM;
147
148 start = tmp;
149 index = 0;
150 while (start < end) {
151 scope = start;
152 if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
153 scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) {
154 ret = dmar_parse_one_dev_scope(scope,
155 &(*devices)[index], segment);
156 if (ret) {
157 kfree(*devices);
158 return ret;
159 }
160 index ++;
161 }
162 start += scope->length;
163 }
164
165 return 0;
166}
167
168/**
169 * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
170 * structure which uniquely represents one DMA remapping hardware unit
171 * present in the platform
172 */
173static int __init
174dmar_parse_one_drhd(struct acpi_dmar_header *header)
175{
176 struct acpi_dmar_hardware_unit *drhd;
177 struct dmar_drhd_unit *dmaru;
178 int ret = 0;
179
180 drhd = (struct acpi_dmar_hardware_unit *)header;
181 dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
182 if (!dmaru)
183 return -ENOMEM;
184
185 dmaru->hdr = header;
186 dmaru->reg_base_addr = drhd->address;
187 dmaru->segment = drhd->segment;
188 dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
189
190 ret = alloc_iommu(dmaru);
191 if (ret) {
192 kfree(dmaru);
193 return ret;
194 }
195 dmar_register_drhd_unit(dmaru);
196 return 0;
197}
198
199static int __init dmar_parse_dev(struct dmar_drhd_unit *dmaru)
200{
201 struct acpi_dmar_hardware_unit *drhd;
202 int ret = 0;
203
204 drhd = (struct acpi_dmar_hardware_unit *) dmaru->hdr;
205
206 if (dmaru->include_all)
207 return 0;
208
209 ret = dmar_parse_dev_scope((void *)(drhd + 1),
210 ((void *)drhd) + drhd->header.length,
211 &dmaru->devices_cnt, &dmaru->devices,
212 drhd->segment);
213 if (ret) {
214 list_del(&dmaru->list);
215 kfree(dmaru);
216 }
217 return ret;
218}
219
220#ifdef CONFIG_DMAR
221LIST_HEAD(dmar_rmrr_units);
222
223static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
224{
225 list_add(&rmrr->list, &dmar_rmrr_units);
226}
227
228
229static int __init
230dmar_parse_one_rmrr(struct acpi_dmar_header *header)
231{
232 struct acpi_dmar_reserved_memory *rmrr;
233 struct dmar_rmrr_unit *rmrru;
234
235 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
236 if (!rmrru)
237 return -ENOMEM;
238
239 rmrru->hdr = header;
240 rmrr = (struct acpi_dmar_reserved_memory *)header;
241 rmrru->base_address = rmrr->base_address;
242 rmrru->end_address = rmrr->end_address;
243
244 dmar_register_rmrr_unit(rmrru);
245 return 0;
246}
247
248static int __init
249rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
250{
251 struct acpi_dmar_reserved_memory *rmrr;
252 int ret;
253
254 rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
255 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
256 ((void *)rmrr) + rmrr->header.length,
257 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
258
259 if (ret || (rmrru->devices_cnt == 0)) {
260 list_del(&rmrru->list);
261 kfree(rmrru);
262 }
263 return ret;
264}
265
266static LIST_HEAD(dmar_atsr_units);
267
268static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
269{
270 struct acpi_dmar_atsr *atsr;
271 struct dmar_atsr_unit *atsru;
272
273 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
274 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
275 if (!atsru)
276 return -ENOMEM;
277
278 atsru->hdr = hdr;
279 atsru->include_all = atsr->flags & 0x1;
280
281 list_add(&atsru->list, &dmar_atsr_units);
282
283 return 0;
284}
285
286static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
287{
288 int rc;
289 struct acpi_dmar_atsr *atsr;
290
291 if (atsru->include_all)
292 return 0;
293
294 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
295 rc = dmar_parse_dev_scope((void *)(atsr + 1),
296 (void *)atsr + atsr->header.length,
297 &atsru->devices_cnt, &atsru->devices,
298 atsr->segment);
299 if (rc || !atsru->devices_cnt) {
300 list_del(&atsru->list);
301 kfree(atsru);
302 }
303
304 return rc;
305}
306
307int dmar_find_matched_atsr_unit(struct pci_dev *dev)
308{
309 int i;
310 struct pci_bus *bus;
311 struct acpi_dmar_atsr *atsr;
312 struct dmar_atsr_unit *atsru;
313
314 dev = pci_physfn(dev);
315
316 list_for_each_entry(atsru, &dmar_atsr_units, list) {
317 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
318 if (atsr->segment == pci_domain_nr(dev->bus))
319 goto found;
320 }
321
322 return 0;
323
324found:
325 for (bus = dev->bus; bus; bus = bus->parent) {
326 struct pci_dev *bridge = bus->self;
327
328 if (!bridge || !pci_is_pcie(bridge) ||
329 bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
330 return 0;
331
332 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
333 for (i = 0; i < atsru->devices_cnt; i++)
334 if (atsru->devices[i] == bridge)
335 return 1;
336 break;
337 }
338 }
339
340 if (atsru->include_all)
341 return 1;
342
343 return 0;
344}
345#endif
346
347#ifdef CONFIG_ACPI_NUMA
348static int __init
349dmar_parse_one_rhsa(struct acpi_dmar_header *header)
350{
351 struct acpi_dmar_rhsa *rhsa;
352 struct dmar_drhd_unit *drhd;
353
354 rhsa = (struct acpi_dmar_rhsa *)header;
355 for_each_drhd_unit(drhd) {
356 if (drhd->reg_base_addr == rhsa->base_address) {
357 int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
358
359 if (!node_online(node))
360 node = -1;
361 drhd->iommu->node = node;
362 return 0;
363 }
364 }
365 WARN_TAINT(
366 1, TAINT_FIRMWARE_WORKAROUND,
367 "Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
368 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
369 drhd->reg_base_addr,
370 dmi_get_system_info(DMI_BIOS_VENDOR),
371 dmi_get_system_info(DMI_BIOS_VERSION),
372 dmi_get_system_info(DMI_PRODUCT_VERSION));
373
374 return 0;
375}
376#endif
377
378static void __init
379dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
380{
381 struct acpi_dmar_hardware_unit *drhd;
382 struct acpi_dmar_reserved_memory *rmrr;
383 struct acpi_dmar_atsr *atsr;
384 struct acpi_dmar_rhsa *rhsa;
385
386 switch (header->type) {
387 case ACPI_DMAR_TYPE_HARDWARE_UNIT:
388 drhd = container_of(header, struct acpi_dmar_hardware_unit,
389 header);
390 printk (KERN_INFO PREFIX
391 "DRHD base: %#016Lx flags: %#x\n",
392 (unsigned long long)drhd->address, drhd->flags);
393 break;
394 case ACPI_DMAR_TYPE_RESERVED_MEMORY:
395 rmrr = container_of(header, struct acpi_dmar_reserved_memory,
396 header);
397 printk (KERN_INFO PREFIX
398 "RMRR base: %#016Lx end: %#016Lx\n",
399 (unsigned long long)rmrr->base_address,
400 (unsigned long long)rmrr->end_address);
401 break;
402 case ACPI_DMAR_TYPE_ATSR:
403 atsr = container_of(header, struct acpi_dmar_atsr, header);
404 printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
405 break;
406 case ACPI_DMAR_HARDWARE_AFFINITY:
407 rhsa = container_of(header, struct acpi_dmar_rhsa, header);
408 printk(KERN_INFO PREFIX "RHSA base: %#016Lx proximity domain: %#x\n",
409 (unsigned long long)rhsa->base_address,
410 rhsa->proximity_domain);
411 break;
412 }
413}
414
415/**
416 * dmar_table_detect - checks to see if the platform supports DMAR devices
417 */
418static int __init dmar_table_detect(void)
419{
420 acpi_status status = AE_OK;
421
422	/* if we can find the DMAR table, then there are DMAR devices */
423 status = acpi_get_table_with_size(ACPI_SIG_DMAR, 0,
424 (struct acpi_table_header **)&dmar_tbl,
425 &dmar_tbl_size);
426
427 if (ACPI_SUCCESS(status) && !dmar_tbl) {
428 printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
429 status = AE_NOT_FOUND;
430 }
431
432 return (ACPI_SUCCESS(status) ? 1 : 0);
433}
434
435/**
436 * parse_dmar_table - parses the DMA reporting table
437 */
438static int __init
439parse_dmar_table(void)
440{
441 struct acpi_table_dmar *dmar;
442 struct acpi_dmar_header *entry_header;
443 int ret = 0;
444
445 /*
446	 * Do it again; the earlier dmar_tbl mapping could have been done with the
447	 * fixed map.
448 */
449 dmar_table_detect();
450
451 /*
452	 * ACPI tables may not be DMA protected by tboot, so use the DMAR copy
453	 * that SINIT saved in SinitMleData in the TXT heap (which is DMA protected)
454 */
455 dmar_tbl = tboot_get_dmar_table(dmar_tbl);
456
457 dmar = (struct acpi_table_dmar *)dmar_tbl;
458 if (!dmar)
459 return -ENODEV;
460
461 if (dmar->width < PAGE_SHIFT - 1) {
462 printk(KERN_WARNING PREFIX "Invalid DMAR haw\n");
463 return -EINVAL;
464 }
465
466 printk (KERN_INFO PREFIX "Host address width %d\n",
467 dmar->width + 1);
468
469 entry_header = (struct acpi_dmar_header *)(dmar + 1);
470 while (((unsigned long)entry_header) <
471 (((unsigned long)dmar) + dmar_tbl->length)) {
472 /* Avoid looping forever on bad ACPI tables */
473 if (entry_header->length == 0) {
474 printk(KERN_WARNING PREFIX
475 "Invalid 0-length structure\n");
476 ret = -EINVAL;
477 break;
478 }
479
480 dmar_table_print_dmar_entry(entry_header);
481
482 switch (entry_header->type) {
483 case ACPI_DMAR_TYPE_HARDWARE_UNIT:
484 ret = dmar_parse_one_drhd(entry_header);
485 break;
486 case ACPI_DMAR_TYPE_RESERVED_MEMORY:
487#ifdef CONFIG_DMAR
488 ret = dmar_parse_one_rmrr(entry_header);
489#endif
490 break;
491 case ACPI_DMAR_TYPE_ATSR:
492#ifdef CONFIG_DMAR
493 ret = dmar_parse_one_atsr(entry_header);
494#endif
495 break;
496 case ACPI_DMAR_HARDWARE_AFFINITY:
497#ifdef CONFIG_ACPI_NUMA
498 ret = dmar_parse_one_rhsa(entry_header);
499#endif
500 break;
501 default:
502 printk(KERN_WARNING PREFIX
503 "Unknown DMAR structure type %d\n",
504 entry_header->type);
505 ret = 0; /* for forward compatibility */
506 break;
507 }
508 if (ret)
509 break;
510
511 entry_header = ((void *)entry_header + entry_header->length);
512 }
513 return ret;
514}
515
516static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
517 struct pci_dev *dev)
518{
519 int index;
520
521 while (dev) {
522 for (index = 0; index < cnt; index++)
523 if (dev == devices[index])
524 return 1;
525
526 /* Check our parent */
527 dev = dev->bus->self;
528 }
529
530 return 0;
531}
532
533struct dmar_drhd_unit *
534dmar_find_matched_drhd_unit(struct pci_dev *dev)
535{
536 struct dmar_drhd_unit *dmaru = NULL;
537 struct acpi_dmar_hardware_unit *drhd;
538
539 dev = pci_physfn(dev);
540
541 list_for_each_entry(dmaru, &dmar_drhd_units, list) {
542 drhd = container_of(dmaru->hdr,
543 struct acpi_dmar_hardware_unit,
544 header);
545
546 if (dmaru->include_all &&
547 drhd->segment == pci_domain_nr(dev->bus))
548 return dmaru;
549
550 if (dmar_pci_device_match(dmaru->devices,
551 dmaru->devices_cnt, dev))
552 return dmaru;
553 }
554
555 return NULL;
556}
557
558int __init dmar_dev_scope_init(void)
559{
560 struct dmar_drhd_unit *drhd, *drhd_n;
561 int ret = -ENODEV;
562
563 list_for_each_entry_safe(drhd, drhd_n, &dmar_drhd_units, list) {
564 ret = dmar_parse_dev(drhd);
565 if (ret)
566 return ret;
567 }
568
569#ifdef CONFIG_DMAR
570 {
571 struct dmar_rmrr_unit *rmrr, *rmrr_n;
572 struct dmar_atsr_unit *atsr, *atsr_n;
573
574 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
575 ret = rmrr_parse_dev(rmrr);
576 if (ret)
577 return ret;
578 }
579
580 list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
581 ret = atsr_parse_dev(atsr);
582 if (ret)
583 return ret;
584 }
585 }
586#endif
587
588 return ret;
589}
590
591
592int __init dmar_table_init(void)
593{
594 static int dmar_table_initialized;
595 int ret;
596
597 if (dmar_table_initialized)
598 return 0;
599
600 dmar_table_initialized = 1;
601
602 ret = parse_dmar_table();
603 if (ret) {
604 if (ret != -ENODEV)
605 printk(KERN_INFO PREFIX "parse DMAR table failure.\n");
606 return ret;
607 }
608
609 if (list_empty(&dmar_drhd_units)) {
610 printk(KERN_INFO PREFIX "No DMAR devices found\n");
611 return -ENODEV;
612 }
613
614#ifdef CONFIG_DMAR
615 if (list_empty(&dmar_rmrr_units))
616 printk(KERN_INFO PREFIX "No RMRR found\n");
617
618 if (list_empty(&dmar_atsr_units))
619 printk(KERN_INFO PREFIX "No ATSR found\n");
620#endif
621
622 return 0;
623}
624
625static void warn_invalid_dmar(u64 addr, const char *message)
626{
627 WARN_TAINT_ONCE(
628 1, TAINT_FIRMWARE_WORKAROUND,
629 "Your BIOS is broken; DMAR reported at address %llx%s!\n"
630 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
631 addr, message,
632 dmi_get_system_info(DMI_BIOS_VENDOR),
633 dmi_get_system_info(DMI_BIOS_VERSION),
634 dmi_get_system_info(DMI_PRODUCT_VERSION));
635}
636
637int __init check_zero_address(void)
638{
639 struct acpi_table_dmar *dmar;
640 struct acpi_dmar_header *entry_header;
641 struct acpi_dmar_hardware_unit *drhd;
642
643 dmar = (struct acpi_table_dmar *)dmar_tbl;
644 entry_header = (struct acpi_dmar_header *)(dmar + 1);
645
646 while (((unsigned long)entry_header) <
647 (((unsigned long)dmar) + dmar_tbl->length)) {
648 /* Avoid looping forever on bad ACPI tables */
649 if (entry_header->length == 0) {
650 printk(KERN_WARNING PREFIX
651 "Invalid 0-length structure\n");
652 return 0;
653 }
654
655 if (entry_header->type == ACPI_DMAR_TYPE_HARDWARE_UNIT) {
656 void __iomem *addr;
657 u64 cap, ecap;
658
659 drhd = (void *)entry_header;
660 if (!drhd->address) {
661 warn_invalid_dmar(0, "");
662 goto failed;
663 }
664
665 addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
666			if (!addr) {
667				printk(KERN_WARNING "IOMMU: can't validate: %llx\n", drhd->address);
668 goto failed;
669 }
670 cap = dmar_readq(addr + DMAR_CAP_REG);
671 ecap = dmar_readq(addr + DMAR_ECAP_REG);
672 early_iounmap(addr, VTD_PAGE_SIZE);
673 if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
674 warn_invalid_dmar(drhd->address,
675 " returns all ones");
676 goto failed;
677 }
678 }
679
680 entry_header = ((void *)entry_header + entry_header->length);
681 }
682 return 1;
683
684failed:
685#ifdef CONFIG_DMAR
686 dmar_disabled = 1;
687#endif
688 return 0;
689}
690
691int __init detect_intel_iommu(void)
692{
693 int ret;
694
695 ret = dmar_table_detect();
696 if (ret)
697 ret = check_zero_address();
698 {
699#ifdef CONFIG_INTR_REMAP
700 struct acpi_table_dmar *dmar;
701
702 dmar = (struct acpi_table_dmar *) dmar_tbl;
703 if (ret && cpu_has_x2apic && dmar->flags & 0x1)
704 printk(KERN_INFO
705 "Queued invalidation will be enabled to support "
706 "x2apic and Intr-remapping.\n");
707#endif
708#ifdef CONFIG_DMAR
709 if (ret && !no_iommu && !iommu_detected && !dmar_disabled) {
710 iommu_detected = 1;
711 /* Make sure ACS will be enabled */
712 pci_request_acs();
713 }
714#endif
715#ifdef CONFIG_X86
716 if (ret)
717 x86_init.iommu.iommu_init = intel_iommu_init;
718#endif
719 }
720 early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
721 dmar_tbl = NULL;
722
723 return ret ? 1 : -ENODEV;
724}
725
726
727int alloc_iommu(struct dmar_drhd_unit *drhd)
728{
729 struct intel_iommu *iommu;
730 int map_size;
731 u32 ver;
732 static int iommu_allocated = 0;
733 int agaw = 0;
734 int msagaw = 0;
735
736 if (!drhd->reg_base_addr) {
737 warn_invalid_dmar(0, "");
738 return -EINVAL;
739 }
740
741 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
742 if (!iommu)
743 return -ENOMEM;
744
745 iommu->seq_id = iommu_allocated++;
746 sprintf (iommu->name, "dmar%d", iommu->seq_id);
747
748 iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
749 if (!iommu->reg) {
750 printk(KERN_ERR "IOMMU: can't map the region\n");
751 goto error;
752 }
753 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
754 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
755
756 if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
757 warn_invalid_dmar(drhd->reg_base_addr, " returns all ones");
758 goto err_unmap;
759 }
760
761#ifdef CONFIG_DMAR
762 agaw = iommu_calculate_agaw(iommu);
763 if (agaw < 0) {
764 printk(KERN_ERR
765 "Cannot get a valid agaw for iommu (seq_id = %d)\n",
766 iommu->seq_id);
767 goto err_unmap;
768 }
769 msagaw = iommu_calculate_max_sagaw(iommu);
770 if (msagaw < 0) {
771 printk(KERN_ERR
772 "Cannot get a valid max agaw for iommu (seq_id = %d)\n",
773 iommu->seq_id);
774 goto err_unmap;
775 }
776#endif
777 iommu->agaw = agaw;
778 iommu->msagaw = msagaw;
779
780 iommu->node = -1;
781
782 /* the registers might be more than one page */
783 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
784 cap_max_fault_reg_offset(iommu->cap));
785 map_size = VTD_PAGE_ALIGN(map_size);
786 if (map_size > VTD_PAGE_SIZE) {
787 iounmap(iommu->reg);
788 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
789 if (!iommu->reg) {
790 printk(KERN_ERR "IOMMU: can't map the region\n");
791 goto error;
792 }
793 }
794
795 ver = readl(iommu->reg + DMAR_VER_REG);
796 pr_info("IOMMU %d: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
797 iommu->seq_id,
798 (unsigned long long)drhd->reg_base_addr,
799 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
800 (unsigned long long)iommu->cap,
801 (unsigned long long)iommu->ecap);
802
803 spin_lock_init(&iommu->register_lock);
804
805 drhd->iommu = iommu;
806 return 0;
807
808 err_unmap:
809 iounmap(iommu->reg);
810 error:
811 kfree(iommu);
812 return -1;
813}
814
815void free_iommu(struct intel_iommu *iommu)
816{
817 if (!iommu)
818 return;
819
820#ifdef CONFIG_DMAR
821 free_dmar_iommu(iommu);
822#endif
823
824 if (iommu->reg)
825 iounmap(iommu->reg);
826 kfree(iommu);
827}
828
829/*
830 * Reclaim all the submitted descriptors which have completed their work.
831 */
832static inline void reclaim_free_desc(struct q_inval *qi)
833{
834 while (qi->desc_status[qi->free_tail] == QI_DONE ||
835 qi->desc_status[qi->free_tail] == QI_ABORT) {
836 qi->desc_status[qi->free_tail] = QI_FREE;
837 qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
838 qi->free_cnt++;
839 }
840}
841
842static int qi_check_fault(struct intel_iommu *iommu, int index)
843{
844 u32 fault;
845 int head, tail;
846 struct q_inval *qi = iommu->qi;
847 int wait_index = (index + 1) % QI_LENGTH;
848
849 if (qi->desc_status[wait_index] == QI_ABORT)
850 return -EAGAIN;
851
852 fault = readl(iommu->reg + DMAR_FSTS_REG);
853
854 /*
855 * If IQE happens, the head points to the descriptor associated
856 * with the error. No new descriptors are fetched until the IQE
857 * is cleared.
858 */
859 if (fault & DMA_FSTS_IQE) {
860 head = readl(iommu->reg + DMAR_IQH_REG);
861 if ((head >> DMAR_IQ_SHIFT) == index) {
862 printk(KERN_ERR "VT-d detected invalid descriptor: "
863 "low=%llx, high=%llx\n",
864 (unsigned long long)qi->desc[index].low,
865 (unsigned long long)qi->desc[index].high);
866 memcpy(&qi->desc[index], &qi->desc[wait_index],
867 sizeof(struct qi_desc));
868 __iommu_flush_cache(iommu, &qi->desc[index],
869 sizeof(struct qi_desc));
870 writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
871 return -EINVAL;
872 }
873 }
874
875 /*
876 * If ITE happens, all pending wait_desc commands are aborted.
877 * No new descriptors are fetched until the ITE is cleared.
878 */
879 if (fault & DMA_FSTS_ITE) {
880 head = readl(iommu->reg + DMAR_IQH_REG);
881 head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
882 head |= 1;
883 tail = readl(iommu->reg + DMAR_IQT_REG);
884 tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
885
886 writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
887
888 do {
889 if (qi->desc_status[head] == QI_IN_USE)
890 qi->desc_status[head] = QI_ABORT;
891 head = (head - 2 + QI_LENGTH) % QI_LENGTH;
892 } while (head != tail);
893
894 if (qi->desc_status[wait_index] == QI_ABORT)
895 return -EAGAIN;
896 }
897
898 if (fault & DMA_FSTS_ICE)
899 writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
900
901 return 0;
902}
903
904/*
905 * Submit the queued invalidation descriptor to the remapping
906 * hardware unit and wait for its completion.
907 */
908int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
909{
910 int rc;
911 struct q_inval *qi = iommu->qi;
912 struct qi_desc *hw, wait_desc;
913 int wait_index, index;
914 unsigned long flags;
915
916 if (!qi)
917 return 0;
918
919 hw = qi->desc;
920
921restart:
922 rc = 0;
923
924 spin_lock_irqsave(&qi->q_lock, flags);
925 while (qi->free_cnt < 3) {
926 spin_unlock_irqrestore(&qi->q_lock, flags);
927 cpu_relax();
928 spin_lock_irqsave(&qi->q_lock, flags);
929 }
930
931 index = qi->free_head;
932 wait_index = (index + 1) % QI_LENGTH;
933
934 qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
935
936 hw[index] = *desc;
937
938 wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) |
939 QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
940 wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
941
942 hw[wait_index] = wait_desc;
943
944 __iommu_flush_cache(iommu, &hw[index], sizeof(struct qi_desc));
945 __iommu_flush_cache(iommu, &hw[wait_index], sizeof(struct qi_desc));
946
947 qi->free_head = (qi->free_head + 2) % QI_LENGTH;
948 qi->free_cnt -= 2;
949
950 /*
951 * update the HW tail register indicating the presence of
952 * new descriptors.
953 */
954 writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
955
956 while (qi->desc_status[wait_index] != QI_DONE) {
957		 * We will leave the interrupts disabled, to prevent the interrupt
958		 * context from queueing another cmd while a cmd is already submitted
959		 * and waiting for completion on this cpu. This is to avoid
960 * and waiting for completion on this cpu. This is to avoid
961 * a deadlock where the interrupt context can wait indefinitely
962 * for free slots in the queue.
963 */
964 rc = qi_check_fault(iommu, index);
965 if (rc)
966 break;
967
968 spin_unlock(&qi->q_lock);
969 cpu_relax();
970 spin_lock(&qi->q_lock);
971 }
972
973 qi->desc_status[index] = QI_DONE;
974
975 reclaim_free_desc(qi);
976 spin_unlock_irqrestore(&qi->q_lock, flags);
977
978 if (rc == -EAGAIN)
979 goto restart;
980
981 return rc;
982}
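/*
 * Bookkeeping sketch (illustrative only): every qi_submit_sync() call
 * consumes two slots of the QI_LENGTH-entry ring - one for the caller's
 * descriptor and one for the wait descriptor - so free_head advances by
 * two (mod QI_LENGTH) and free_cnt drops by two until
 * reclaim_free_desc() returns completed slots to the free pool.
 */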
983
984/*
985 * Flush the global interrupt entry cache.
986 */
987void qi_global_iec(struct intel_iommu *iommu)
988{
989 struct qi_desc desc;
990
991 desc.low = QI_IEC_TYPE;
992 desc.high = 0;
993
994 /* should never fail */
995 qi_submit_sync(&desc, iommu);
996}
997
998void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
999 u64 type)
1000{
1001 struct qi_desc desc;
1002
1003 desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
1004 | QI_CC_GRAN(type) | QI_CC_TYPE;
1005 desc.high = 0;
1006
1007 qi_submit_sync(&desc, iommu);
1008}
1009
1010void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1011 unsigned int size_order, u64 type)
1012{
1013 u8 dw = 0, dr = 0;
1014
1015 struct qi_desc desc;
1016 int ih = 0;
1017
1018 if (cap_write_drain(iommu->cap))
1019 dw = 1;
1020
1021 if (cap_read_drain(iommu->cap))
1022 dr = 1;
1023
1024 desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
1025 | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
1026 desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
1027 | QI_IOTLB_AM(size_order);
1028
1029 qi_submit_sync(&desc, iommu);
1030}
1031
1032void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
1033 u64 addr, unsigned mask)
1034{
1035 struct qi_desc desc;
1036
1037 if (mask) {
1038 BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
1039 addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
1040 desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
1041 } else
1042 desc.high = QI_DEV_IOTLB_ADDR(addr);
1043
1044 if (qdep >= QI_DEV_IOTLB_MAX_INVS)
1045 qdep = 0;
1046
1047 desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
1048 QI_DIOTLB_TYPE;
1049
1050 qi_submit_sync(&desc, iommu);
1051}
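The mask/addr handling above follows the PCIe ATS invalidation size encoding: the request covers 2^mask pages, and the size is signalled through the low address bits the function ORs in rather than a separate length field. A minimal illustrative sketch (the wrapper name and the IOVA are made up; a real caller takes qdep from pci_ats_queue_depth(), as iommu_flush_dev_iotlb() does later in intel-iommu.c):

static void __maybe_unused example_flush_dev_iotlb(struct intel_iommu *iommu,
						   u16 sid, u16 qdep)
{
	/*
	 * Invalidate 4 pages (mask = 2) starting at IOVA 0x40000, which
	 * satisfies the 16KiB alignment the BUG_ON above demands. Inside
	 * qi_flush_dev_iotlb() the address becomes
	 *   0x40000 | ((1 << (12 + 2 - 1)) - 1) = 0x41fff
	 * so bit 12 is set and bit 13 is clear; the position of the lowest
	 * clear bit tells the endpoint the range spans 2^mask = 4 pages.
	 */
	qi_flush_dev_iotlb(iommu, sid, qdep, 0x40000, 2);
}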
1052
1053/*
1054 * Disable Queued Invalidation interface.
1055 */
1056void dmar_disable_qi(struct intel_iommu *iommu)
1057{
1058 unsigned long flags;
1059 u32 sts;
1060 cycles_t start_time = get_cycles();
1061
1062 if (!ecap_qis(iommu->ecap))
1063 return;
1064
1065 spin_lock_irqsave(&iommu->register_lock, flags);
1066
1067 sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
1068 if (!(sts & DMA_GSTS_QIES))
1069 goto end;
1070
1071 /*
1072 * Give a chance to HW to complete the pending invalidation requests.
1073 */
1074 while ((readl(iommu->reg + DMAR_IQT_REG) !=
1075 readl(iommu->reg + DMAR_IQH_REG)) &&
1076 (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
1077 cpu_relax();
1078
1079 iommu->gcmd &= ~DMA_GCMD_QIE;
1080 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1081
1082 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
1083 !(sts & DMA_GSTS_QIES), sts);
1084end:
1085 spin_unlock_irqrestore(&iommu->register_lock, flags);
1086}
1087
1088/*
1089 * Enable queued invalidation.
1090 */
1091static void __dmar_enable_qi(struct intel_iommu *iommu)
1092{
1093 u32 sts;
1094 unsigned long flags;
1095 struct q_inval *qi = iommu->qi;
1096
1097 qi->free_head = qi->free_tail = 0;
1098 qi->free_cnt = QI_LENGTH;
1099
1100 spin_lock_irqsave(&iommu->register_lock, flags);
1101
1102 /* write zero to the tail reg */
1103 writel(0, iommu->reg + DMAR_IQT_REG);
1104
1105 dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
1106
1107 iommu->gcmd |= DMA_GCMD_QIE;
1108 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1109
 1110 /* Make sure hardware completes it */
1111 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
1112
1113 spin_unlock_irqrestore(&iommu->register_lock, flags);
1114}
1115
1116/*
1117 * Enable Queued Invalidation interface. This is a must to support
1118 * interrupt-remapping. Also used by DMA-remapping, which replaces
1119 * register based IOTLB invalidation.
1120 */
1121int dmar_enable_qi(struct intel_iommu *iommu)
1122{
1123 struct q_inval *qi;
1124 struct page *desc_page;
1125
1126 if (!ecap_qis(iommu->ecap))
1127 return -ENOENT;
1128
1129 /*
1130 * queued invalidation is already setup and enabled.
1131 */
1132 if (iommu->qi)
1133 return 0;
1134
1135 iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
1136 if (!iommu->qi)
1137 return -ENOMEM;
1138
1139 qi = iommu->qi;
1140
1141
1142 desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, 0);
1143 if (!desc_page) {
1144 kfree(qi);
1145 iommu->qi = 0;
1146 return -ENOMEM;
1147 }
1148
1149 qi->desc = page_address(desc_page);
1150
1151 qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC);
1152 if (!qi->desc_status) {
1153 free_page((unsigned long) qi->desc);
1154 kfree(qi);
1155 iommu->qi = 0;
1156 return -ENOMEM;
1157 }
1158
1159 qi->free_head = qi->free_tail = 0;
1160 qi->free_cnt = QI_LENGTH;
1161
1162 spin_lock_init(&qi->q_lock);
1163
1164 __dmar_enable_qi(iommu);
1165
1166 return 0;
1167}
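The single zeroed page allocated above backs the whole invalidation ring: assuming the 16-byte struct qi_desc and QI_LENGTH of 256 from linux/intel-iommu.h of this era, 256 descriptors fill exactly one 4KiB page, and since qi_submit_sync() consumes two slots per call (the request plus its wait descriptor) at most 128 submissions fit before free_cnt makes callers spin. A compile-time sketch of that arithmetic:

static inline void qi_ring_fits_one_page(void)
{
	/* 256 descriptors * 16 bytes each == 4096 bytes == one VT-d page. */
	BUILD_BUG_ON(QI_LENGTH * sizeof(struct qi_desc) != VTD_PAGE_SIZE);
}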
1168
 1169/* iommu interrupt handling. Most of it is MSI-like. */
1170
1171enum faulttype {
1172 DMA_REMAP,
1173 INTR_REMAP,
1174 UNKNOWN,
1175};
1176
1177static const char *dma_remap_fault_reasons[] =
1178{
1179 "Software",
1180 "Present bit in root entry is clear",
1181 "Present bit in context entry is clear",
1182 "Invalid context entry",
1183 "Access beyond MGAW",
1184 "PTE Write access is not set",
1185 "PTE Read access is not set",
1186 "Next page table ptr is invalid",
1187 "Root table address invalid",
1188 "Context table ptr is invalid",
1189 "non-zero reserved fields in RTP",
1190 "non-zero reserved fields in CTP",
1191 "non-zero reserved fields in PTE",
1192};
1193
1194static const char *intr_remap_fault_reasons[] =
1195{
1196 "Detected reserved fields in the decoded interrupt-remapped request",
1197 "Interrupt index exceeded the interrupt-remapping table size",
1198 "Present field in the IRTE entry is clear",
1199 "Error accessing interrupt-remapping table pointed by IRTA_REG",
1200 "Detected reserved fields in the IRTE entry",
1201 "Blocked a compatibility format interrupt request",
1202 "Blocked an interrupt request due to source-id verification failure",
1203};
1204
1205#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1206
1207const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
1208{
 1209 if (fault_reason >= 0x20 && (fault_reason - 0x20 <
 1210 ARRAY_SIZE(intr_remap_fault_reasons))) {
1211 *fault_type = INTR_REMAP;
1212 return intr_remap_fault_reasons[fault_reason - 0x20];
1213 } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
1214 *fault_type = DMA_REMAP;
1215 return dma_remap_fault_reasons[fault_reason];
1216 } else {
1217 *fault_type = UNKNOWN;
1218 return "Unknown";
1219 }
1220}
1221
1222void dmar_msi_unmask(struct irq_data *data)
1223{
1224 struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
1225 unsigned long flag;
1226
1227 /* unmask it */
1228 spin_lock_irqsave(&iommu->register_lock, flag);
1229 writel(0, iommu->reg + DMAR_FECTL_REG);
 1230 /* Read a reg to force flush the posted write */
1231 readl(iommu->reg + DMAR_FECTL_REG);
1232 spin_unlock_irqrestore(&iommu->register_lock, flag);
1233}
1234
1235void dmar_msi_mask(struct irq_data *data)
1236{
1237 unsigned long flag;
1238 struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
1239
1240 /* mask it */
1241 spin_lock_irqsave(&iommu->register_lock, flag);
1242 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
 1243 /* Read a reg to force flush the posted write */
1244 readl(iommu->reg + DMAR_FECTL_REG);
1245 spin_unlock_irqrestore(&iommu->register_lock, flag);
1246}
1247
1248void dmar_msi_write(int irq, struct msi_msg *msg)
1249{
1250 struct intel_iommu *iommu = irq_get_handler_data(irq);
1251 unsigned long flag;
1252
1253 spin_lock_irqsave(&iommu->register_lock, flag);
1254 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1255 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1256 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1257 spin_unlock_irqrestore(&iommu->register_lock, flag);
1258}
1259
1260void dmar_msi_read(int irq, struct msi_msg *msg)
1261{
1262 struct intel_iommu *iommu = irq_get_handler_data(irq);
1263 unsigned long flag;
1264
1265 spin_lock_irqsave(&iommu->register_lock, flag);
1266 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1267 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1268 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1269 spin_unlock_irqrestore(&iommu->register_lock, flag);
1270}
1271
1272static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
1273 u8 fault_reason, u16 source_id, unsigned long long addr)
1274{
1275 const char *reason;
1276 int fault_type;
1277
1278 reason = dmar_get_fault_reason(fault_reason, &fault_type);
1279
1280 if (fault_type == INTR_REMAP)
 1281 printk(KERN_ERR "INTR-REMAP: Request device [%02x:%02x.%d] "
1282 "fault index %llx\n"
1283 "INTR-REMAP:[fault reason %02d] %s\n",
1284 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1285 PCI_FUNC(source_id & 0xFF), addr >> 48,
1286 fault_reason, reason);
1287 else
1288 printk(KERN_ERR
1289 "DMAR:[%s] Request device [%02x:%02x.%d] "
 1290 "fault addr %llx\n"
1291 "DMAR:[fault reason %02d] %s\n",
1292 (type ? "DMA Read" : "DMA Write"),
1293 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1294 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1295 return 0;
1296}
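source_id is the 16-bit PCI requester ID, bus number in the high byte and devfn in the low byte, the same packing the driver uses later when it builds sid = info->bus << 8 | info->devfn. Decoding a hypothetical value:

	/*
	 * source_id = 0x0310 (hypothetical):
	 *   bus   = 0x0310 >> 8    = 0x03
	 *   devfn = 0x0310 & 0xff  = 0x10
	 *   slot  = PCI_SLOT(0x10) = 0x10 >> 3 = 2
	 *   func  = PCI_FUNC(0x10) = 0x10 & 7  = 0
	 * so the printk above reports device 03:02.0.
	 */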
1297
1298#define PRIMARY_FAULT_REG_LEN (16)
1299irqreturn_t dmar_fault(int irq, void *dev_id)
1300{
1301 struct intel_iommu *iommu = dev_id;
1302 int reg, fault_index;
1303 u32 fault_status;
1304 unsigned long flag;
1305
1306 spin_lock_irqsave(&iommu->register_lock, flag);
1307 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1308 if (fault_status)
1309 printk(KERN_ERR "DRHD: handling fault status reg %x\n",
1310 fault_status);
1311
1312 /* TBD: ignore advanced fault log currently */
1313 if (!(fault_status & DMA_FSTS_PPF))
1314 goto clear_rest;
1315
1316 fault_index = dma_fsts_fault_record_index(fault_status);
1317 reg = cap_fault_reg_offset(iommu->cap);
1318 while (1) {
1319 u8 fault_reason;
1320 u16 source_id;
1321 u64 guest_addr;
1322 int type;
1323 u32 data;
1324
1325 /* highest 32 bits */
1326 data = readl(iommu->reg + reg +
1327 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1328 if (!(data & DMA_FRCD_F))
1329 break;
1330
1331 fault_reason = dma_frcd_fault_reason(data);
1332 type = dma_frcd_type(data);
1333
1334 data = readl(iommu->reg + reg +
1335 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1336 source_id = dma_frcd_source_id(data);
1337
1338 guest_addr = dmar_readq(iommu->reg + reg +
1339 fault_index * PRIMARY_FAULT_REG_LEN);
1340 guest_addr = dma_frcd_page_addr(guest_addr);
1341 /* clear the fault */
1342 writel(DMA_FRCD_F, iommu->reg + reg +
1343 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1344
1345 spin_unlock_irqrestore(&iommu->register_lock, flag);
1346
1347 dmar_fault_do_one(iommu, type, fault_reason,
1348 source_id, guest_addr);
1349
1350 fault_index++;
1351 if (fault_index >= cap_num_fault_regs(iommu->cap))
1352 fault_index = 0;
1353 spin_lock_irqsave(&iommu->register_lock, flag);
1354 }
1355clear_rest:
1356 /* clear all the other faults */
1357 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1358 writel(fault_status, iommu->reg + DMAR_FSTS_REG);
1359
1360 spin_unlock_irqrestore(&iommu->register_lock, flag);
1361 return IRQ_HANDLED;
1362}
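Each primary fault record is PRIMARY_FAULT_REG_LEN (16) bytes long and the block of records starts at cap_fault_reg_offset() from the register base, which is why the loop reads each record at offsets +0, +8 and +12. With an illustrative fault register offset of 0x200:

	/*
	 * Fault record i sits at reg_base + 0x200 + i * 16; for i = 3:
	 *   0x230  64-bit faulting address (masked by dma_frcd_page_addr)
	 *   0x238  dword carrying the source id
	 *   0x23c  dword carrying the F bit, fault reason and type;
	 *          writing DMA_FRCD_F back here clears the record.
	 */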
1363
1364int dmar_set_interrupt(struct intel_iommu *iommu)
1365{
1366 int irq, ret;
1367
1368 /*
1369 * Check if the fault interrupt is already initialized.
1370 */
1371 if (iommu->irq)
1372 return 0;
1373
1374 irq = create_irq();
1375 if (!irq) {
1376 printk(KERN_ERR "IOMMU: no free vectors\n");
1377 return -EINVAL;
1378 }
1379
1380 irq_set_handler_data(irq, iommu);
1381 iommu->irq = irq;
1382
1383 ret = arch_setup_dmar_msi(irq);
1384 if (ret) {
1385 irq_set_handler_data(irq, NULL);
1386 iommu->irq = 0;
1387 destroy_irq(irq);
1388 return ret;
1389 }
1390
1391 ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
1392 if (ret)
1393 printk(KERN_ERR "IOMMU: can't request irq\n");
1394 return ret;
1395}
1396
1397int __init enable_drhd_fault_handling(void)
1398{
1399 struct dmar_drhd_unit *drhd;
1400
1401 /*
1402 * Enable fault control interrupt.
1403 */
1404 for_each_drhd_unit(drhd) {
1405 int ret;
1406 struct intel_iommu *iommu = drhd->iommu;
1407 ret = dmar_set_interrupt(iommu);
1408
1409 if (ret) {
 1410 printk(KERN_ERR "DRHD %Lx: failed to enable fault "
 1411 "interrupt, ret %d\n",
1412 (unsigned long long)drhd->reg_base_addr, ret);
1413 return -1;
1414 }
1415
1416 /*
1417 * Clear any previous faults.
1418 */
1419 dmar_fault(iommu->irq, iommu);
1420 }
1421
1422 return 0;
1423}
1424
1425/*
1426 * Re-enable Queued Invalidation interface.
1427 */
1428int dmar_reenable_qi(struct intel_iommu *iommu)
1429{
1430 if (!ecap_qis(iommu->ecap))
1431 return -ENOENT;
1432
1433 if (!iommu->qi)
1434 return -ENOENT;
1435
1436 /*
1437 * First disable queued invalidation.
1438 */
1439 dmar_disable_qi(iommu);
1440 /*
1441 * Then enable queued invalidation again. Since there is no pending
1442 * invalidation requests now, it's safe to re-enable queued
1443 * invalidation.
1444 */
1445 __dmar_enable_qi(iommu);
1446
1447 return 0;
1448}
1449
1450/*
1451 * Check interrupt remapping support in DMAR table description.
1452 */
1453int __init dmar_ir_support(void)
1454{
1455 struct acpi_table_dmar *dmar;
1456 dmar = (struct acpi_table_dmar *)dmar_tbl;
1457 if (!dmar)
1458 return 0;
1459 return dmar->flags & 0x1;
1460}
1461IOMMU_INIT_POST(detect_intel_iommu);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
new file mode 100644
index 000000000000..c621c98c99da
--- /dev/null
+++ b/drivers/iommu/intel-iommu.c
@@ -0,0 +1,4016 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24#include <linux/init.h>
25#include <linux/bitmap.h>
26#include <linux/debugfs.h>
27#include <linux/slab.h>
28#include <linux/irq.h>
29#include <linux/interrupt.h>
30#include <linux/spinlock.h>
31#include <linux/pci.h>
32#include <linux/dmar.h>
33#include <linux/dma-mapping.h>
34#include <linux/mempool.h>
35#include <linux/timer.h>
36#include <linux/iova.h>
37#include <linux/iommu.h>
38#include <linux/intel-iommu.h>
39#include <linux/syscore_ops.h>
40#include <linux/tboot.h>
41#include <linux/dmi.h>
42#include <linux/pci-ats.h>
43#include <asm/cacheflush.h>
44#include <asm/iommu.h>
45
46#define ROOT_SIZE VTD_PAGE_SIZE
47#define CONTEXT_SIZE VTD_PAGE_SIZE
48
49#define IS_BRIDGE_HOST_DEVICE(pdev) \
50 ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
51#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55#define IOAPIC_RANGE_START (0xfee00000)
56#define IOAPIC_RANGE_END (0xfeefffff)
57#define IOVA_START_ADDR (0x1000)
58
59#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61#define MAX_AGAW_WIDTH 64
62
63#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
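For the DEFAULT_DOMAIN_ADDRESS_WIDTH of 48 used below, these macros work out to:

/*
 * __DOMAIN_MAX_PFN(48) = 2^(48 - 12) - 1 = 2^36 - 1
 * DOMAIN_MAX_ADDR(48)  = (2^36 - 1) << 12, just under 256TiB
 * DOMAIN_MAX_PFN(48)   = 2^36 - 1 on 64-bit kernels; on 32-bit it is
 *                        clamped to ULONG_MAX so PFNs fit an unsigned long.
 */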
71
72#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74#define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
75
76/* page table handling */
77#define LEVEL_STRIDE (9)
78#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
79
80static inline int agaw_to_level(int agaw)
81{
82 return agaw + 2;
83}
84
85static inline int agaw_to_width(int agaw)
86{
87 return 30 + agaw * LEVEL_STRIDE;
88}
89
90static inline int width_to_agaw(int width)
91{
92 return (width - 30) / LEVEL_STRIDE;
93}
94
95static inline unsigned int level_to_offset_bits(int level)
96{
97 return (level - 1) * LEVEL_STRIDE;
98}
99
100static inline int pfn_level_offset(unsigned long pfn, int level)
101{
102 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
103}
104
105static inline unsigned long level_mask(int level)
106{
107 return -1UL << level_to_offset_bits(level);
108}
109
110static inline unsigned long level_size(int level)
111{
112 return 1UL << level_to_offset_bits(level);
113}
114
115static inline unsigned long align_to_level(unsigned long pfn, int level)
116{
117 return (pfn + level_size(level) - 1) & level_mask(level);
118}
119
120static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
121{
122 return 1 << ((lvl - 1) * LEVEL_STRIDE);
123}
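Each page-table level resolves LEVEL_STRIDE (9) bits of the DMA PFN on top of the 2-level, 30-bit base, which is exactly what the agaw value counts. A worked walk for the common 4-level case, using only the helpers above:

/*
 * agaw = 2 (the 48-bit DEFAULT_DOMAIN_ADDRESS_WIDTH case):
 *   agaw_to_width(2) = 30 + 2 * 9 = 48      agaw_to_level(2) = 4
 *
 * Splitting DMA pfn 0x12345678 into per-level table indexes:
 *   pfn_level_offset(pfn, 4) = (pfn >> 27) & 0x1ff = 0x002
 *   pfn_level_offset(pfn, 3) = (pfn >> 18) & 0x1ff = 0x08d
 *   pfn_level_offset(pfn, 2) = (pfn >>  9) & 0x1ff = 0x02b
 *   pfn_level_offset(pfn, 1) = (pfn >>  0) & 0x1ff = 0x078
 * pfn_to_dma_pte() below follows these indexes down from domain->pgd.
 */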
124
125/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
126 are never going to work. */
127static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
128{
129 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
130}
131
132static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
133{
134 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
135}
136static inline unsigned long page_to_dma_pfn(struct page *pg)
137{
138 return mm_to_dma_pfn(page_to_pfn(pg));
139}
140static inline unsigned long virt_to_dma_pfn(void *p)
141{
142 return page_to_dma_pfn(virt_to_page(p));
143}
144
145/* global iommu list, set NULL for ignored DMAR units */
146static struct intel_iommu **g_iommus;
147
148static void __init check_tylersburg_isoch(void);
149static int rwbf_quirk;
150
151/*
 152 * set to 1 to panic the kernel if VT-d can't be enabled successfully
153 * (used when kernel is launched w/ TXT)
154 */
155static int force_on = 0;
156
157/*
158 * 0: Present
159 * 1-11: Reserved
160 * 12-63: Context Ptr (12 - (haw-1))
161 * 64-127: Reserved
162 */
163struct root_entry {
164 u64 val;
165 u64 rsvd1;
166};
167#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
168static inline bool root_present(struct root_entry *root)
169{
170 return (root->val & 1);
171}
172static inline void set_root_present(struct root_entry *root)
173{
174 root->val |= 1;
175}
176static inline void set_root_value(struct root_entry *root, unsigned long value)
177{
178 root->val |= value & VTD_PAGE_MASK;
179}
180
181static inline struct context_entry *
182get_context_addr_from_root(struct root_entry *root)
183{
184 return (struct context_entry *)
185 (root_present(root)?phys_to_virt(
186 root->val & VTD_PAGE_MASK) :
187 NULL);
188}
189
190/*
191 * low 64 bits:
192 * 0: present
193 * 1: fault processing disable
194 * 2-3: translation type
195 * 12-63: address space root
196 * high 64 bits:
197 * 0-2: address width
 198 * 3-6: avail
199 * 8-23: domain id
200 */
201struct context_entry {
202 u64 lo;
203 u64 hi;
204};
205
206static inline bool context_present(struct context_entry *context)
207{
208 return (context->lo & 1);
209}
210static inline void context_set_present(struct context_entry *context)
211{
212 context->lo |= 1;
213}
214
215static inline void context_set_fault_enable(struct context_entry *context)
216{
217 context->lo &= (((u64)-1) << 2) | 1;
218}
219
220static inline void context_set_translation_type(struct context_entry *context,
221 unsigned long value)
222{
223 context->lo &= (((u64)-1) << 4) | 3;
224 context->lo |= (value & 3) << 2;
225}
226
227static inline void context_set_address_root(struct context_entry *context,
228 unsigned long value)
229{
230 context->lo |= value & VTD_PAGE_MASK;
231}
232
233static inline void context_set_address_width(struct context_entry *context,
234 unsigned long value)
235{
236 context->hi |= value & 7;
237}
238
239static inline void context_set_domain_id(struct context_entry *context,
240 unsigned long value)
241{
242 context->hi |= (value & ((1 << 16) - 1)) << 8;
243}
244
245static inline void context_clear_entry(struct context_entry *context)
246{
247 context->lo = 0;
248 context->hi = 0;
249}
250
251/*
252 * 0: readable
253 * 1: writable
254 * 2-6: reserved
255 * 7: super page
256 * 8-10: available
257 * 11: snoop behavior
 258 * 12-63: Host physical address
259 */
260struct dma_pte {
261 u64 val;
262};
263
264static inline void dma_clear_pte(struct dma_pte *pte)
265{
266 pte->val = 0;
267}
268
269static inline void dma_set_pte_readable(struct dma_pte *pte)
270{
271 pte->val |= DMA_PTE_READ;
272}
273
274static inline void dma_set_pte_writable(struct dma_pte *pte)
275{
276 pte->val |= DMA_PTE_WRITE;
277}
278
279static inline void dma_set_pte_snp(struct dma_pte *pte)
280{
281 pte->val |= DMA_PTE_SNP;
282}
283
284static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
285{
286 pte->val = (pte->val & ~3) | (prot & 3);
287}
288
289static inline u64 dma_pte_addr(struct dma_pte *pte)
290{
291#ifdef CONFIG_64BIT
292 return pte->val & VTD_PAGE_MASK;
293#else
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296#endif
297}
298
299static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
300{
301 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
302}
303
304static inline bool dma_pte_present(struct dma_pte *pte)
305{
306 return (pte->val & 3) != 0;
307}
308
309static inline int first_pte_in_page(struct dma_pte *pte)
310{
311 return !((unsigned long)pte & ~VTD_PAGE_MASK);
312}
313
314/*
 315 * This domain is a static identity mapping domain.
 316 * 1. This domain creates a static 1:1 mapping to all usable memory.
 317 * 2. It maps to each iommu if successful.
 318 * 3. Each iommu maps to this domain if successful.
319 */
320static struct dmar_domain *si_domain;
321static int hw_pass_through = 1;
322
323/* devices under the same p2p bridge are owned in one domain */
324#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
325
 326/* domain represents a virtual machine; more than one device
 327 * across iommus may be owned by one domain, e.g. a kvm guest.
328 */
329#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
330
 331/* si_domain contains multiple devices */
332#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
333
334struct dmar_domain {
335 int id; /* domain id */
336 int nid; /* node id */
337 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
338
339 struct list_head devices; /* all devices' list */
340 struct iova_domain iovad; /* iova's that belong to this domain */
341
342 struct dma_pte *pgd; /* virtual address */
343 int gaw; /* max guest address width */
344
345 /* adjusted guest address width, 0 is level 2 30-bit */
346 int agaw;
347
348 int flags; /* flags to find out type of domain */
349
350 int iommu_coherency;/* indicate coherency of iommu access */
351 int iommu_snooping; /* indicate snooping control feature*/
352 int iommu_count; /* reference count of iommu */
353 int iommu_superpage;/* Level of superpages supported:
354 0 == 4KiB (no superpages), 1 == 2MiB,
355 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
356 spinlock_t iommu_lock; /* protect iommu set in domain */
357 u64 max_addr; /* maximum mapped address */
358};
359
360/* PCI domain-device relationship */
361struct device_domain_info {
362 struct list_head link; /* link to domain siblings */
363 struct list_head global; /* link to global list */
364 int segment; /* PCI domain */
365 u8 bus; /* PCI bus number */
366 u8 devfn; /* PCI devfn number */
367 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
368 struct intel_iommu *iommu; /* IOMMU used by this device */
369 struct dmar_domain *domain; /* pointer to domain */
370};
371
372static void flush_unmaps_timeout(unsigned long data);
373
374DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
375
376#define HIGH_WATER_MARK 250
377struct deferred_flush_tables {
378 int next;
379 struct iova *iova[HIGH_WATER_MARK];
380 struct dmar_domain *domain[HIGH_WATER_MARK];
381};
382
383static struct deferred_flush_tables *deferred_flush;
384
385/* bitmap for indexing intel_iommus */
386static int g_num_of_iommus;
387
388static DEFINE_SPINLOCK(async_umap_flush_lock);
389static LIST_HEAD(unmaps_to_do);
390
391static int timer_on;
392static long list_size;
393
394static void domain_remove_dev_info(struct dmar_domain *domain);
395
396#ifdef CONFIG_DMAR_DEFAULT_ON
397int dmar_disabled = 0;
398#else
399int dmar_disabled = 1;
400#endif /*CONFIG_DMAR_DEFAULT_ON*/
401
402static int dmar_map_gfx = 1;
403static int dmar_forcedac;
404static int intel_iommu_strict;
405static int intel_iommu_superpage = 1;
406
407#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
408static DEFINE_SPINLOCK(device_domain_lock);
409static LIST_HEAD(device_domain_list);
410
411static struct iommu_ops intel_iommu_ops;
412
413static int __init intel_iommu_setup(char *str)
414{
415 if (!str)
416 return -EINVAL;
417 while (*str) {
418 if (!strncmp(str, "on", 2)) {
419 dmar_disabled = 0;
420 printk(KERN_INFO "Intel-IOMMU: enabled\n");
421 } else if (!strncmp(str, "off", 3)) {
422 dmar_disabled = 1;
423 printk(KERN_INFO "Intel-IOMMU: disabled\n");
424 } else if (!strncmp(str, "igfx_off", 8)) {
425 dmar_map_gfx = 0;
426 printk(KERN_INFO
427 "Intel-IOMMU: disable GFX device mapping\n");
428 } else if (!strncmp(str, "forcedac", 8)) {
429 printk(KERN_INFO
430 "Intel-IOMMU: Forcing DAC for PCI devices\n");
431 dmar_forcedac = 1;
432 } else if (!strncmp(str, "strict", 6)) {
433 printk(KERN_INFO
434 "Intel-IOMMU: disable batched IOTLB flush\n");
435 intel_iommu_strict = 1;
436 } else if (!strncmp(str, "sp_off", 6)) {
437 printk(KERN_INFO
438 "Intel-IOMMU: disable supported super page\n");
439 intel_iommu_superpage = 0;
440 }
441
442 str += strcspn(str, ",");
443 while (*str == ',')
444 str++;
445 }
446 return 0;
447}
448__setup("intel_iommu=", intel_iommu_setup);
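Since intel_iommu_setup() walks a comma-separated list, the switches above can be combined in one boot parameter, for example:

	intel_iommu=on,strict,igfx_off

which force-enables the IOMMU, disables batched IOTLB flushing and leaves the integrated graphics device unmapped, matching the three messages printed by the corresponding branches.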
449
450static struct kmem_cache *iommu_domain_cache;
451static struct kmem_cache *iommu_devinfo_cache;
452static struct kmem_cache *iommu_iova_cache;
453
454static inline void *alloc_pgtable_page(int node)
455{
456 struct page *page;
457 void *vaddr = NULL;
458
459 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
460 if (page)
461 vaddr = page_address(page);
462 return vaddr;
463}
464
465static inline void free_pgtable_page(void *vaddr)
466{
467 free_page((unsigned long)vaddr);
468}
469
470static inline void *alloc_domain_mem(void)
471{
472 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
473}
474
475static void free_domain_mem(void *vaddr)
476{
477 kmem_cache_free(iommu_domain_cache, vaddr);
478}
479
480static inline void * alloc_devinfo_mem(void)
481{
482 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
483}
484
485static inline void free_devinfo_mem(void *vaddr)
486{
487 kmem_cache_free(iommu_devinfo_cache, vaddr);
488}
489
490struct iova *alloc_iova_mem(void)
491{
492 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
493}
494
495void free_iova_mem(struct iova *iova)
496{
497 kmem_cache_free(iommu_iova_cache, iova);
498}
499
500
501static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
502{
503 unsigned long sagaw;
504 int agaw = -1;
505
506 sagaw = cap_sagaw(iommu->cap);
507 for (agaw = width_to_agaw(max_gaw);
508 agaw >= 0; agaw--) {
509 if (test_bit(agaw, &sagaw))
510 break;
511 }
512
513 return agaw;
514}
515
516/*
517 * Calculate max SAGAW for each iommu.
518 */
519int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
520{
521 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
522}
523
524/*
525 * calculate agaw for each iommu.
 526 * "SAGAW" may be different across iommus; use a default agaw, and
 527 * fall back to a smaller supported agaw for iommus that don't support it.
528 */
529int iommu_calculate_agaw(struct intel_iommu *iommu)
530{
531 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
532}
533
 534/* This function only returns a single iommu in a domain */
535static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
536{
537 int iommu_id;
538
539 /* si_domain and vm domain should not get here. */
540 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
541 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
542
543 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
544 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
545 return NULL;
546
547 return g_iommus[iommu_id];
548}
549
550static void domain_update_iommu_coherency(struct dmar_domain *domain)
551{
552 int i;
553
554 domain->iommu_coherency = 1;
555
556 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
557 if (!ecap_coherent(g_iommus[i]->ecap)) {
558 domain->iommu_coherency = 0;
559 break;
560 }
561 }
562}
563
564static void domain_update_iommu_snooping(struct dmar_domain *domain)
565{
566 int i;
567
568 domain->iommu_snooping = 1;
569
570 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571 if (!ecap_sc_support(g_iommus[i]->ecap)) {
572 domain->iommu_snooping = 0;
573 break;
574 }
575 }
576}
577
578static void domain_update_iommu_superpage(struct dmar_domain *domain)
579{
580 int i, mask = 0xf;
581
582 if (!intel_iommu_superpage) {
583 domain->iommu_superpage = 0;
584 return;
585 }
586
587 domain->iommu_superpage = 4; /* 1TiB */
588
589 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 590 mask &= cap_super_page_val(g_iommus[i]->cap);
591 if (!mask) {
592 break;
593 }
594 }
595 domain->iommu_superpage = fls(mask);
596}
597
598/* Some capabilities may be different across iommus */
599static void domain_update_iommu_cap(struct dmar_domain *domain)
600{
601 domain_update_iommu_coherency(domain);
602 domain_update_iommu_snooping(domain);
603 domain_update_iommu_superpage(domain);
604}
605
606static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
607{
608 struct dmar_drhd_unit *drhd = NULL;
609 int i;
610
611 for_each_drhd_unit(drhd) {
612 if (drhd->ignored)
613 continue;
614 if (segment != drhd->segment)
615 continue;
616
617 for (i = 0; i < drhd->devices_cnt; i++) {
618 if (drhd->devices[i] &&
619 drhd->devices[i]->bus->number == bus &&
620 drhd->devices[i]->devfn == devfn)
621 return drhd->iommu;
622 if (drhd->devices[i] &&
623 drhd->devices[i]->subordinate &&
624 drhd->devices[i]->subordinate->number <= bus &&
625 drhd->devices[i]->subordinate->subordinate >= bus)
626 return drhd->iommu;
627 }
628
629 if (drhd->include_all)
630 return drhd->iommu;
631 }
632
633 return NULL;
634}
635
636static void domain_flush_cache(struct dmar_domain *domain,
637 void *addr, int size)
638{
639 if (!domain->iommu_coherency)
640 clflush_cache_range(addr, size);
641}
642
643/* Gets context entry for a given bus and devfn */
644static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
645 u8 bus, u8 devfn)
646{
647 struct root_entry *root;
648 struct context_entry *context;
649 unsigned long phy_addr;
650 unsigned long flags;
651
652 spin_lock_irqsave(&iommu->lock, flags);
653 root = &iommu->root_entry[bus];
654 context = get_context_addr_from_root(root);
655 if (!context) {
656 context = (struct context_entry *)
657 alloc_pgtable_page(iommu->node);
658 if (!context) {
659 spin_unlock_irqrestore(&iommu->lock, flags);
660 return NULL;
661 }
662 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
663 phy_addr = virt_to_phys((void *)context);
664 set_root_value(root, phy_addr);
665 set_root_present(root);
666 __iommu_flush_cache(iommu, root, sizeof(*root));
667 }
668 spin_unlock_irqrestore(&iommu->lock, flags);
669 return &context[devfn];
670}
671
672static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
673{
674 struct root_entry *root;
675 struct context_entry *context;
676 int ret;
677 unsigned long flags;
678
679 spin_lock_irqsave(&iommu->lock, flags);
680 root = &iommu->root_entry[bus];
681 context = get_context_addr_from_root(root);
682 if (!context) {
683 ret = 0;
684 goto out;
685 }
686 ret = context_present(&context[devfn]);
687out:
688 spin_unlock_irqrestore(&iommu->lock, flags);
689 return ret;
690}
691
692static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
693{
694 struct root_entry *root;
695 struct context_entry *context;
696 unsigned long flags;
697
698 spin_lock_irqsave(&iommu->lock, flags);
699 root = &iommu->root_entry[bus];
700 context = get_context_addr_from_root(root);
701 if (context) {
702 context_clear_entry(&context[devfn]);
703 __iommu_flush_cache(iommu, &context[devfn], \
704 sizeof(*context));
705 }
706 spin_unlock_irqrestore(&iommu->lock, flags);
707}
708
709static void free_context_table(struct intel_iommu *iommu)
710{
711 struct root_entry *root;
712 int i;
713 unsigned long flags;
714 struct context_entry *context;
715
716 spin_lock_irqsave(&iommu->lock, flags);
717 if (!iommu->root_entry) {
718 goto out;
719 }
720 for (i = 0; i < ROOT_ENTRY_NR; i++) {
721 root = &iommu->root_entry[i];
722 context = get_context_addr_from_root(root);
723 if (context)
724 free_pgtable_page(context);
725 }
726 free_pgtable_page(iommu->root_entry);
727 iommu->root_entry = NULL;
728out:
729 spin_unlock_irqrestore(&iommu->lock, flags);
730}
731
732static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
733 unsigned long pfn, int large_level)
734{
735 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
736 struct dma_pte *parent, *pte = NULL;
737 int level = agaw_to_level(domain->agaw);
738 int offset, target_level;
739
740 BUG_ON(!domain->pgd);
741 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
742 parent = domain->pgd;
743
744 /* Search pte */
745 if (!large_level)
746 target_level = 1;
747 else
748 target_level = large_level;
749
750 while (level > 0) {
751 void *tmp_page;
752
753 offset = pfn_level_offset(pfn, level);
754 pte = &parent[offset];
755 if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
756 break;
757 if (level == target_level)
758 break;
759
760 if (!dma_pte_present(pte)) {
761 uint64_t pteval;
762
763 tmp_page = alloc_pgtable_page(domain->nid);
764
765 if (!tmp_page)
766 return NULL;
767
768 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
769 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
770 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
771 /* Someone else set it while we were thinking; use theirs. */
772 free_pgtable_page(tmp_page);
773 } else {
774 dma_pte_addr(pte);
775 domain_flush_cache(domain, pte, sizeof(*pte));
776 }
777 }
778 parent = phys_to_virt(dma_pte_addr(pte));
779 level--;
780 }
781
782 return pte;
783}
784
785
786/* return address's pte at specific level */
787static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
788 unsigned long pfn,
789 int level, int *large_page)
790{
791 struct dma_pte *parent, *pte = NULL;
792 int total = agaw_to_level(domain->agaw);
793 int offset;
794
795 parent = domain->pgd;
796 while (level <= total) {
797 offset = pfn_level_offset(pfn, total);
798 pte = &parent[offset];
799 if (level == total)
800 return pte;
801
802 if (!dma_pte_present(pte)) {
803 *large_page = total;
804 break;
805 }
806
807 if (pte->val & DMA_PTE_LARGE_PAGE) {
808 *large_page = total;
809 return pte;
810 }
811
812 parent = phys_to_virt(dma_pte_addr(pte));
813 total--;
814 }
815 return NULL;
816}
817
818/* clear last level pte, a tlb flush should be followed */
819static void dma_pte_clear_range(struct dmar_domain *domain,
820 unsigned long start_pfn,
821 unsigned long last_pfn)
822{
823 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
824 unsigned int large_page = 1;
825 struct dma_pte *first_pte, *pte;
826
827 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
828 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
829 BUG_ON(start_pfn > last_pfn);
830
831 /* we don't need lock here; nobody else touches the iova range */
832 do {
833 large_page = 1;
834 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
835 if (!pte) {
836 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
837 continue;
838 }
839 do {
840 dma_clear_pte(pte);
841 start_pfn += lvl_to_nr_pages(large_page);
842 pte++;
843 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
844
845 domain_flush_cache(domain, first_pte,
846 (void *)pte - (void *)first_pte);
847
848 } while (start_pfn && start_pfn <= last_pfn);
849}
850
851/* free page table pages. last level pte should already be cleared */
852static void dma_pte_free_pagetable(struct dmar_domain *domain,
853 unsigned long start_pfn,
854 unsigned long last_pfn)
855{
856 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
857 struct dma_pte *first_pte, *pte;
858 int total = agaw_to_level(domain->agaw);
859 int level;
860 unsigned long tmp;
861 int large_page = 2;
862
863 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
864 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
865 BUG_ON(start_pfn > last_pfn);
866
867 /* We don't need lock here; nobody else touches the iova range */
868 level = 2;
869 while (level <= total) {
870 tmp = align_to_level(start_pfn, level);
871
872 /* If we can't even clear one PTE at this level, we're done */
873 if (tmp + level_size(level) - 1 > last_pfn)
874 return;
875
876 do {
877 large_page = level;
878 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
879 if (large_page > level)
880 level = large_page + 1;
881 if (!pte) {
882 tmp = align_to_level(tmp + 1, level + 1);
883 continue;
884 }
885 do {
886 if (dma_pte_present(pte)) {
887 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
888 dma_clear_pte(pte);
889 }
890 pte++;
891 tmp += level_size(level);
892 } while (!first_pte_in_page(pte) &&
893 tmp + level_size(level) - 1 <= last_pfn);
894
895 domain_flush_cache(domain, first_pte,
896 (void *)pte - (void *)first_pte);
897
898 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
899 level++;
900 }
901 /* free pgd */
902 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
903 free_pgtable_page(domain->pgd);
904 domain->pgd = NULL;
905 }
906}
907
908/* iommu handling */
909static int iommu_alloc_root_entry(struct intel_iommu *iommu)
910{
911 struct root_entry *root;
912 unsigned long flags;
913
914 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
915 if (!root)
916 return -ENOMEM;
917
918 __iommu_flush_cache(iommu, root, ROOT_SIZE);
919
920 spin_lock_irqsave(&iommu->lock, flags);
921 iommu->root_entry = root;
922 spin_unlock_irqrestore(&iommu->lock, flags);
923
924 return 0;
925}
926
927static void iommu_set_root_entry(struct intel_iommu *iommu)
928{
929 void *addr;
930 u32 sts;
931 unsigned long flag;
932
933 addr = iommu->root_entry;
934
935 spin_lock_irqsave(&iommu->register_lock, flag);
936 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
937
938 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
939
 940 /* Make sure hardware completes it */
941 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
942 readl, (sts & DMA_GSTS_RTPS), sts);
943
944 spin_unlock_irqrestore(&iommu->register_lock, flag);
945}
946
947static void iommu_flush_write_buffer(struct intel_iommu *iommu)
948{
949 u32 val;
950 unsigned long flag;
951
952 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
953 return;
954
955 spin_lock_irqsave(&iommu->register_lock, flag);
956 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
957
 958 /* Make sure hardware completes it */
959 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
960 readl, (!(val & DMA_GSTS_WBFS)), val);
961
962 spin_unlock_irqrestore(&iommu->register_lock, flag);
963}
964
 965/* return value determines if we need a write buffer flush */
966static void __iommu_flush_context(struct intel_iommu *iommu,
967 u16 did, u16 source_id, u8 function_mask,
968 u64 type)
969{
970 u64 val = 0;
971 unsigned long flag;
972
973 switch (type) {
974 case DMA_CCMD_GLOBAL_INVL:
975 val = DMA_CCMD_GLOBAL_INVL;
976 break;
977 case DMA_CCMD_DOMAIN_INVL:
978 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
979 break;
980 case DMA_CCMD_DEVICE_INVL:
981 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
982 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
983 break;
984 default:
985 BUG();
986 }
987 val |= DMA_CCMD_ICC;
988
989 spin_lock_irqsave(&iommu->register_lock, flag);
990 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
991
 992 /* Make sure hardware completes it */
993 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
994 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
995
996 spin_unlock_irqrestore(&iommu->register_lock, flag);
997}
998
 999/* return value determines if we need a write buffer flush */
1000static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1001 u64 addr, unsigned int size_order, u64 type)
1002{
1003 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1004 u64 val = 0, val_iva = 0;
1005 unsigned long flag;
1006
1007 switch (type) {
1008 case DMA_TLB_GLOBAL_FLUSH:
1009 /* global flush doesn't need set IVA_REG */
1010 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1011 break;
1012 case DMA_TLB_DSI_FLUSH:
1013 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1014 break;
1015 case DMA_TLB_PSI_FLUSH:
1016 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1017 /* Note: always flush non-leaf currently */
1018 val_iva = size_order | addr;
1019 break;
1020 default:
1021 BUG();
1022 }
1023 /* Note: set drain read/write */
1024#if 0
1025 /*
 1026 * This is probably only needed to be extra safe; it looks like we
 1027 * can ignore it without any impact.
1028 */
1029 if (cap_read_drain(iommu->cap))
1030 val |= DMA_TLB_READ_DRAIN;
1031#endif
1032 if (cap_write_drain(iommu->cap))
1033 val |= DMA_TLB_WRITE_DRAIN;
1034
1035 spin_lock_irqsave(&iommu->register_lock, flag);
1036 /* Note: Only uses first TLB reg currently */
1037 if (val_iva)
1038 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1039 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1040
 1041 /* Make sure hardware completes it */
1042 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1043 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1044
1045 spin_unlock_irqrestore(&iommu->register_lock, flag);
1046
1047 /* check IOTLB invalidation granularity */
1048 if (DMA_TLB_IAIG(val) == 0)
 1049 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1050 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1051 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1052 (unsigned long long)DMA_TLB_IIRG(type),
1053 (unsigned long long)DMA_TLB_IAIG(val));
1054}
1055
1056static struct device_domain_info *iommu_support_dev_iotlb(
1057 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1058{
1059 int found = 0;
1060 unsigned long flags;
1061 struct device_domain_info *info;
1062 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1063
1064 if (!ecap_dev_iotlb_support(iommu->ecap))
1065 return NULL;
1066
1067 if (!iommu->qi)
1068 return NULL;
1069
1070 spin_lock_irqsave(&device_domain_lock, flags);
1071 list_for_each_entry(info, &domain->devices, link)
1072 if (info->bus == bus && info->devfn == devfn) {
1073 found = 1;
1074 break;
1075 }
1076 spin_unlock_irqrestore(&device_domain_lock, flags);
1077
1078 if (!found || !info->dev)
1079 return NULL;
1080
1081 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1082 return NULL;
1083
1084 if (!dmar_find_matched_atsr_unit(info->dev))
1085 return NULL;
1086
1087 info->iommu = iommu;
1088
1089 return info;
1090}
1091
1092static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1093{
1094 if (!info)
1095 return;
1096
1097 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1098}
1099
1100static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1101{
1102 if (!info->dev || !pci_ats_enabled(info->dev))
1103 return;
1104
1105 pci_disable_ats(info->dev);
1106}
1107
1108static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1109 u64 addr, unsigned mask)
1110{
1111 u16 sid, qdep;
1112 unsigned long flags;
1113 struct device_domain_info *info;
1114
1115 spin_lock_irqsave(&device_domain_lock, flags);
1116 list_for_each_entry(info, &domain->devices, link) {
1117 if (!info->dev || !pci_ats_enabled(info->dev))
1118 continue;
1119
1120 sid = info->bus << 8 | info->devfn;
1121 qdep = pci_ats_queue_depth(info->dev);
1122 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1123 }
1124 spin_unlock_irqrestore(&device_domain_lock, flags);
1125}
1126
1127static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1128 unsigned long pfn, unsigned int pages, int map)
1129{
1130 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1131 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1132
1133 BUG_ON(pages == 0);
1134
1135 /*
 1136 * Fall back to domain-selective flush if there is no PSI support or the
 1137 * size is too big.
 1138 * PSI requires the page count to be a power of two, and the base address
 1139 * to be naturally aligned to the size.
1140 */
1141 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1142 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1143 DMA_TLB_DSI_FLUSH);
1144 else
1145 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1146 DMA_TLB_PSI_FLUSH);
1147
1148 /*
 1149 * In caching mode, mappings going from non-present to present require a
 1150 * flush. However, the device IOTLB doesn't need to be flushed in this case.
1151 */
1152 if (!cap_caching_mode(iommu->cap) || !map)
1153 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1154}
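The mask computed above is the PSI address-mask value: a flush always covers a power-of-two number of pages, so the request is rounded up before being handed to flush_iotlb(). A quick example of the rounding:

	/*
	 * pages = 5:
	 *   __roundup_pow_of_two(5) = 8
	 *   mask = ilog2(8) = 3           -> flush 2^3 = 8 pages
	 * addr = pfn << VTD_PAGE_SHIFT must be naturally aligned to that
	 * 8-page (32KiB) region, and when mask exceeds cap_max_amask_val()
	 * the code above falls back to a domain-selective flush instead.
	 */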
1155
1156static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1157{
1158 u32 pmen;
1159 unsigned long flags;
1160
1161 spin_lock_irqsave(&iommu->register_lock, flags);
1162 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1163 pmen &= ~DMA_PMEN_EPM;
1164 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1165
1166 /* wait for the protected region status bit to clear */
1167 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1168 readl, !(pmen & DMA_PMEN_PRS), pmen);
1169
1170 spin_unlock_irqrestore(&iommu->register_lock, flags);
1171}
1172
1173static int iommu_enable_translation(struct intel_iommu *iommu)
1174{
1175 u32 sts;
1176 unsigned long flags;
1177
1178 spin_lock_irqsave(&iommu->register_lock, flags);
1179 iommu->gcmd |= DMA_GCMD_TE;
1180 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1181
 1182 /* Make sure hardware completes it */
1183 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1184 readl, (sts & DMA_GSTS_TES), sts);
1185
1186 spin_unlock_irqrestore(&iommu->register_lock, flags);
1187 return 0;
1188}
1189
1190static int iommu_disable_translation(struct intel_iommu *iommu)
1191{
1192 u32 sts;
1193 unsigned long flag;
1194
1195 spin_lock_irqsave(&iommu->register_lock, flag);
1196 iommu->gcmd &= ~DMA_GCMD_TE;
1197 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1198
 1199 /* Make sure hardware completes it */
1200 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1201 readl, (!(sts & DMA_GSTS_TES)), sts);
1202
1203 spin_unlock_irqrestore(&iommu->register_lock, flag);
1204 return 0;
1205}
1206
1207
1208static int iommu_init_domains(struct intel_iommu *iommu)
1209{
1210 unsigned long ndomains;
1211 unsigned long nlongs;
1212
1213 ndomains = cap_ndoms(iommu->cap);
 1214 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1215 ndomains);
1216 nlongs = BITS_TO_LONGS(ndomains);
1217
1218 spin_lock_init(&iommu->lock);
1219
1220 /* TBD: there might be 64K domains,
1221 * consider other allocation for future chip
1222 */
1223 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1224 if (!iommu->domain_ids) {
1225 printk(KERN_ERR "Allocating domain id array failed\n");
1226 return -ENOMEM;
1227 }
1228 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1229 GFP_KERNEL);
1230 if (!iommu->domains) {
1231 printk(KERN_ERR "Allocating domain array failed\n");
1232 return -ENOMEM;
1233 }
1234
1235 /*
1236 * if Caching mode is set, then invalid translations are tagged
1237 * with domainid 0. Hence we need to pre-allocate it.
1238 */
1239 if (cap_caching_mode(iommu->cap))
1240 set_bit(0, iommu->domain_ids);
1241 return 0;
1242}
1243
1244
1245static void domain_exit(struct dmar_domain *domain);
1246static void vm_domain_exit(struct dmar_domain *domain);
1247
1248void free_dmar_iommu(struct intel_iommu *iommu)
1249{
1250 struct dmar_domain *domain;
1251 int i;
1252 unsigned long flags;
1253
1254 if ((iommu->domains) && (iommu->domain_ids)) {
1255 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1256 domain = iommu->domains[i];
1257 clear_bit(i, iommu->domain_ids);
1258
1259 spin_lock_irqsave(&domain->iommu_lock, flags);
1260 if (--domain->iommu_count == 0) {
1261 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1262 vm_domain_exit(domain);
1263 else
1264 domain_exit(domain);
1265 }
1266 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1267 }
1268 }
1269
1270 if (iommu->gcmd & DMA_GCMD_TE)
1271 iommu_disable_translation(iommu);
1272
1273 if (iommu->irq) {
1274 irq_set_handler_data(iommu->irq, NULL);
1275 /* This will mask the irq */
1276 free_irq(iommu->irq, iommu);
1277 destroy_irq(iommu->irq);
1278 }
1279
1280 kfree(iommu->domains);
1281 kfree(iommu->domain_ids);
1282
1283 g_iommus[iommu->seq_id] = NULL;
1284
1285 /* if all iommus are freed, free g_iommus */
1286 for (i = 0; i < g_num_of_iommus; i++) {
1287 if (g_iommus[i])
1288 break;
1289 }
1290
1291 if (i == g_num_of_iommus)
1292 kfree(g_iommus);
1293
1294 /* free context mapping */
1295 free_context_table(iommu);
1296}
1297
1298static struct dmar_domain *alloc_domain(void)
1299{
1300 struct dmar_domain *domain;
1301
1302 domain = alloc_domain_mem();
1303 if (!domain)
1304 return NULL;
1305
1306 domain->nid = -1;
1307 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1308 domain->flags = 0;
1309
1310 return domain;
1311}
1312
1313static int iommu_attach_domain(struct dmar_domain *domain,
1314 struct intel_iommu *iommu)
1315{
1316 int num;
1317 unsigned long ndomains;
1318 unsigned long flags;
1319
1320 ndomains = cap_ndoms(iommu->cap);
1321
1322 spin_lock_irqsave(&iommu->lock, flags);
1323
1324 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1325 if (num >= ndomains) {
1326 spin_unlock_irqrestore(&iommu->lock, flags);
1327 printk(KERN_ERR "IOMMU: no free domain ids\n");
1328 return -ENOMEM;
1329 }
1330
1331 domain->id = num;
1332 set_bit(num, iommu->domain_ids);
1333 set_bit(iommu->seq_id, &domain->iommu_bmp);
1334 iommu->domains[num] = domain;
1335 spin_unlock_irqrestore(&iommu->lock, flags);
1336
1337 return 0;
1338}
1339
1340static void iommu_detach_domain(struct dmar_domain *domain,
1341 struct intel_iommu *iommu)
1342{
1343 unsigned long flags;
1344 int num, ndomains;
1345 int found = 0;
1346
1347 spin_lock_irqsave(&iommu->lock, flags);
1348 ndomains = cap_ndoms(iommu->cap);
1349 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1350 if (iommu->domains[num] == domain) {
1351 found = 1;
1352 break;
1353 }
1354 }
1355
1356 if (found) {
1357 clear_bit(num, iommu->domain_ids);
1358 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1359 iommu->domains[num] = NULL;
1360 }
1361 spin_unlock_irqrestore(&iommu->lock, flags);
1362}
1363
1364static struct iova_domain reserved_iova_list;
1365static struct lock_class_key reserved_rbtree_key;
1366
1367static int dmar_init_reserved_ranges(void)
1368{
1369 struct pci_dev *pdev = NULL;
1370 struct iova *iova;
1371 int i;
1372
1373 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1374
1375 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1376 &reserved_rbtree_key);
1377
1378 /* IOAPIC ranges shouldn't be accessed by DMA */
1379 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1380 IOVA_PFN(IOAPIC_RANGE_END));
1381 if (!iova) {
1382 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1383 return -ENODEV;
1384 }
1385
1386 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1387 for_each_pci_dev(pdev) {
1388 struct resource *r;
1389
1390 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1391 r = &pdev->resource[i];
1392 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1393 continue;
1394 iova = reserve_iova(&reserved_iova_list,
1395 IOVA_PFN(r->start),
1396 IOVA_PFN(r->end));
1397 if (!iova) {
1398 printk(KERN_ERR "Reserve iova failed\n");
1399 return -ENODEV;
1400 }
1401 }
1402 }
1403 return 0;
1404}
1405
1406static void domain_reserve_special_ranges(struct dmar_domain *domain)
1407{
1408 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1409}
1410
1411static inline int guestwidth_to_adjustwidth(int gaw)
1412{
1413 int agaw;
1414 int r = (gaw - 12) % 9;
1415
1416 if (r == 0)
1417 agaw = gaw;
1418 else
1419 agaw = gaw + 9 - r;
1420 if (agaw > 64)
1421 agaw = 64;
1422 return agaw;
1423}
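The adjustment rounds the guest address width up to the next width the page tables can express (12 bits of page offset plus whole 9-bit levels), capped at 64 bits. Worked values under that reading:

/*
 * gaw = 48: (48 - 12) % 9 = 0  -> agaw = 48              (4 levels)
 * gaw = 40: (40 - 12) % 9 = 1  -> agaw = 40 + 9 - 1 = 48 (4 levels)
 * gaw = 36: (36 - 12) % 9 = 6  -> agaw = 36 + 9 - 6 = 39 (3 levels)
 * gaw = 66:                       capped to 64
 */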
1424
1425static int domain_init(struct dmar_domain *domain, int guest_width)
1426{
1427 struct intel_iommu *iommu;
1428 int adjust_width, agaw;
1429 unsigned long sagaw;
1430
1431 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1432 spin_lock_init(&domain->iommu_lock);
1433
1434 domain_reserve_special_ranges(domain);
1435
1436 /* calculate AGAW */
1437 iommu = domain_get_iommu(domain);
1438 if (guest_width > cap_mgaw(iommu->cap))
1439 guest_width = cap_mgaw(iommu->cap);
1440 domain->gaw = guest_width;
1441 adjust_width = guestwidth_to_adjustwidth(guest_width);
1442 agaw = width_to_agaw(adjust_width);
1443 sagaw = cap_sagaw(iommu->cap);
1444 if (!test_bit(agaw, &sagaw)) {
1445 /* hardware doesn't support it, choose a bigger one */
1446 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1447 agaw = find_next_bit(&sagaw, 5, agaw);
1448 if (agaw >= 5)
1449 return -ENODEV;
1450 }
1451 domain->agaw = agaw;
1452 INIT_LIST_HEAD(&domain->devices);
1453
1454 if (ecap_coherent(iommu->ecap))
1455 domain->iommu_coherency = 1;
1456 else
1457 domain->iommu_coherency = 0;
1458
1459 if (ecap_sc_support(iommu->ecap))
1460 domain->iommu_snooping = 1;
1461 else
1462 domain->iommu_snooping = 0;
1463
1464 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1465 domain->iommu_count = 1;
1466 domain->nid = iommu->node;
1467
1468 /* always allocate the top pgd */
1469 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1470 if (!domain->pgd)
1471 return -ENOMEM;
1472 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1473 return 0;
1474}
1475
1476static void domain_exit(struct dmar_domain *domain)
1477{
1478 struct dmar_drhd_unit *drhd;
1479 struct intel_iommu *iommu;
1480
 1481 /* Domain 0 is reserved, so don't process it */
1482 if (!domain)
1483 return;
1484
1485 /* Flush any lazy unmaps that may reference this domain */
1486 if (!intel_iommu_strict)
1487 flush_unmaps_timeout(0);
1488
1489 domain_remove_dev_info(domain);
1490 /* destroy iovas */
1491 put_iova_domain(&domain->iovad);
1492
1493 /* clear ptes */
1494 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1495
1496 /* free page tables */
1497 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1498
1499 for_each_active_iommu(iommu, drhd)
1500 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1501 iommu_detach_domain(domain, iommu);
1502
1503 free_domain_mem(domain);
1504}
1505
1506static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1507 u8 bus, u8 devfn, int translation)
1508{
1509 struct context_entry *context;
1510 unsigned long flags;
1511 struct intel_iommu *iommu;
1512 struct dma_pte *pgd;
1513 unsigned long num;
1514 unsigned long ndomains;
1515 int id;
1516 int agaw;
1517 struct device_domain_info *info = NULL;
1518
1519 pr_debug("Set context mapping for %02x:%02x.%d\n",
1520 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1521
1522 BUG_ON(!domain->pgd);
1523 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1524 translation != CONTEXT_TT_MULTI_LEVEL);
1525
1526 iommu = device_to_iommu(segment, bus, devfn);
1527 if (!iommu)
1528 return -ENODEV;
1529
1530 context = device_to_context_entry(iommu, bus, devfn);
1531 if (!context)
1532 return -ENOMEM;
1533 spin_lock_irqsave(&iommu->lock, flags);
1534 if (context_present(context)) {
1535 spin_unlock_irqrestore(&iommu->lock, flags);
1536 return 0;
1537 }
1538
1539 id = domain->id;
1540 pgd = domain->pgd;
1541
1542 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1543 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1544 int found = 0;
1545
1546 /* find an available domain id for this device in iommu */
1547 ndomains = cap_ndoms(iommu->cap);
1548 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1549 if (iommu->domains[num] == domain) {
1550 id = num;
1551 found = 1;
1552 break;
1553 }
1554 }
1555
1556 if (found == 0) {
1557 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1558 if (num >= ndomains) {
1559 spin_unlock_irqrestore(&iommu->lock, flags);
1560 printk(KERN_ERR "IOMMU: no free domain ids\n");
1561 return -EFAULT;
1562 }
1563
1564 set_bit(num, iommu->domain_ids);
1565 iommu->domains[num] = domain;
1566 id = num;
1567 }
1568
1569 /* Skip top levels of page tables for
 1570 * iommus which have a smaller agaw than the default.
1571 * Unnecessary for PT mode.
1572 */
1573 if (translation != CONTEXT_TT_PASS_THROUGH) {
1574 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1575 pgd = phys_to_virt(dma_pte_addr(pgd));
1576 if (!dma_pte_present(pgd)) {
1577 spin_unlock_irqrestore(&iommu->lock, flags);
1578 return -ENOMEM;
1579 }
1580 }
1581 }
1582 }
1583
1584 context_set_domain_id(context, id);
1585
1586 if (translation != CONTEXT_TT_PASS_THROUGH) {
1587 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1588 translation = info ? CONTEXT_TT_DEV_IOTLB :
1589 CONTEXT_TT_MULTI_LEVEL;
1590 }
1591 /*
1592 * In pass through mode, AW must be programmed to indicate the largest
1593 * AGAW value supported by hardware. And ASR is ignored by hardware.
1594 */
1595 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1596 context_set_address_width(context, iommu->msagaw);
1597 else {
1598 context_set_address_root(context, virt_to_phys(pgd));
1599 context_set_address_width(context, iommu->agaw);
1600 }
1601
1602 context_set_translation_type(context, translation);
1603 context_set_fault_enable(context);
1604 context_set_present(context);
1605 domain_flush_cache(domain, context, sizeof(*context));
1606
1607	/*
1608	 * It's a non-present to present mapping. If hardware doesn't cache
1609	 * non-present entries we only need to flush the write-buffer. If it
1610	 * _does_ cache non-present entries, then it does so in the special
1611	 * domain #0, which we have to flush:
1612	 */
1613 if (cap_caching_mode(iommu->cap)) {
1614 iommu->flush.flush_context(iommu, 0,
1615 (((u16)bus) << 8) | devfn,
1616 DMA_CCMD_MASK_NOBIT,
1617 DMA_CCMD_DEVICE_INVL);
1618 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1619 } else {
1620 iommu_flush_write_buffer(iommu);
1621 }
1622 iommu_enable_dev_iotlb(info);
1623 spin_unlock_irqrestore(&iommu->lock, flags);
1624
1625 spin_lock_irqsave(&domain->iommu_lock, flags);
1626 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1627 domain->iommu_count++;
1628 if (domain->iommu_count == 1)
1629 domain->nid = iommu->node;
1630 domain_update_iommu_cap(domain);
1631 }
1632 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1633 return 0;
1634}
1635
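/*
 * Set up context entries for the device itself and, if it sits behind a
 * PCIe-to-PCI bridge, for the bridges on the path as well, since DMA from
 * devices behind such a bridge is tagged with the bridge's source-id.
 */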
1636static int
1637domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1638 int translation)
1639{
1640 int ret;
1641 struct pci_dev *tmp, *parent;
1642
1643 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1644 pdev->bus->number, pdev->devfn,
1645 translation);
1646 if (ret)
1647 return ret;
1648
1649 /* dependent device mapping */
1650 tmp = pci_find_upstream_pcie_bridge(pdev);
1651 if (!tmp)
1652 return 0;
1653 /* Secondary interface's bus number and devfn 0 */
1654 parent = pdev->bus->self;
1655 while (parent != tmp) {
1656 ret = domain_context_mapping_one(domain,
1657 pci_domain_nr(parent->bus),
1658 parent->bus->number,
1659 parent->devfn, translation);
1660 if (ret)
1661 return ret;
1662 parent = parent->bus->self;
1663 }
1664 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1665 return domain_context_mapping_one(domain,
1666 pci_domain_nr(tmp->subordinate),
1667 tmp->subordinate->number, 0,
1668 translation);
1669 else /* this is a legacy PCI bridge */
1670 return domain_context_mapping_one(domain,
1671 pci_domain_nr(tmp->bus),
1672 tmp->bus->number,
1673 tmp->devfn,
1674 translation);
1675}
1676
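/*
 * Check whether context entries already exist for the device and, if it is
 * behind a PCIe-to-PCI bridge, for every bridge on the path to it.
 */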
1677static int domain_context_mapped(struct pci_dev *pdev)
1678{
1679 int ret;
1680 struct pci_dev *tmp, *parent;
1681 struct intel_iommu *iommu;
1682
1683 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1684 pdev->devfn);
1685 if (!iommu)
1686 return -ENODEV;
1687
1688 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1689 if (!ret)
1690 return ret;
1691 /* dependent device mapping */
1692 tmp = pci_find_upstream_pcie_bridge(pdev);
1693 if (!tmp)
1694 return ret;
1695 /* Secondary interface's bus number and devfn 0 */
1696 parent = pdev->bus->self;
1697 while (parent != tmp) {
1698 ret = device_context_mapped(iommu, parent->bus->number,
1699 parent->devfn);
1700 if (!ret)
1701 return ret;
1702 parent = parent->bus->self;
1703 }
1704 if (pci_is_pcie(tmp))
1705 return device_context_mapped(iommu, tmp->subordinate->number,
1706 0);
1707 else
1708 return device_context_mapped(iommu, tmp->bus->number,
1709 tmp->devfn);
1710}
1711
1712/* Returns a number of VTD pages, but aligned to MM page size */
1713static inline unsigned long aligned_nrpages(unsigned long host_addr,
1714 size_t size)
1715{
1716 host_addr &= ~PAGE_MASK;
1717 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1718}
1719
1720/* Return largest possible superpage level for a given mapping */
1721static inline int hardware_largepage_caps(struct dmar_domain *domain,
1722 unsigned long iov_pfn,
1723 unsigned long phy_pfn,
1724 unsigned long pages)
1725{
1726 int support, level = 1;
1727 unsigned long pfnmerge;
1728
1729 support = domain->iommu_superpage;
1730
1731 /* To use a large page, the virtual *and* physical addresses
1732 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1733 of them will mean we have to use smaller pages. So just
1734 merge them and check both at once. */
1735 pfnmerge = iov_pfn | phy_pfn;
1736
1737 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1738 pages >>= VTD_STRIDE_SHIFT;
1739 if (!pages)
1740 break;
1741 pfnmerge >>= VTD_STRIDE_SHIFT;
1742 level++;
1743 support--;
1744 }
1745 return level;
1746}
1747
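/*
 * Install PTEs for nr_pages starting at iov_pfn, taking the physical pages
 * either from a scatterlist or from a contiguous range starting at phys_pfn.
 * Superpages are used when the hardware supports them and the alignment and
 * length allow it; the CPU cache is flushed whenever a page of PTEs has been
 * filled or the mapping is complete.
 */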
1748static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1749 struct scatterlist *sg, unsigned long phys_pfn,
1750 unsigned long nr_pages, int prot)
1751{
1752 struct dma_pte *first_pte = NULL, *pte = NULL;
1753 phys_addr_t uninitialized_var(pteval);
1754 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1755 unsigned long sg_res;
1756 unsigned int largepage_lvl = 0;
1757 unsigned long lvl_pages = 0;
1758
1759 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1760
1761 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1762 return -EINVAL;
1763
1764 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1765
1766 if (sg)
1767 sg_res = 0;
1768 else {
1769 sg_res = nr_pages + 1;
1770 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1771 }
1772
1773 while (nr_pages > 0) {
1774 uint64_t tmp;
1775
1776 if (!sg_res) {
1777 sg_res = aligned_nrpages(sg->offset, sg->length);
1778 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1779 sg->dma_length = sg->length;
1780 pteval = page_to_phys(sg_page(sg)) | prot;
1781 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1782 }
1783
1784 if (!pte) {
1785 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1786
1787 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1788 if (!pte)
1789 return -ENOMEM;
1790			/* It is a large page */
1791 if (largepage_lvl > 1)
1792 pteval |= DMA_PTE_LARGE_PAGE;
1793 else
1794 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1795
1796 }
1797		/* We don't need a lock here; nobody else
1798		 * touches the iova range
1799		 */
1800 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1801 if (tmp) {
1802 static int dumps = 5;
1803 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1804 iov_pfn, tmp, (unsigned long long)pteval);
1805 if (dumps) {
1806 dumps--;
1807 debug_dma_dump_mappings(NULL);
1808 }
1809 WARN_ON(1);
1810 }
1811
1812 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1813
1814 BUG_ON(nr_pages < lvl_pages);
1815 BUG_ON(sg_res < lvl_pages);
1816
1817 nr_pages -= lvl_pages;
1818 iov_pfn += lvl_pages;
1819 phys_pfn += lvl_pages;
1820 pteval += lvl_pages * VTD_PAGE_SIZE;
1821 sg_res -= lvl_pages;
1822
1823 /* If the next PTE would be the first in a new page, then we
1824 need to flush the cache on the entries we've just written.
1825 And then we'll need to recalculate 'pte', so clear it and
1826 let it get set again in the if (!pte) block above.
1827
1828 If we're done (!nr_pages) we need to flush the cache too.
1829
1830 Also if we've been setting superpages, we may need to
1831 recalculate 'pte' and switch back to smaller pages for the
1832 end of the mapping, if the trailing size is not enough to
1833 use another superpage (i.e. sg_res < lvl_pages). */
1834 pte++;
1835 if (!nr_pages || first_pte_in_page(pte) ||
1836 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1837 domain_flush_cache(domain, first_pte,
1838 (void *)pte - (void *)first_pte);
1839 pte = NULL;
1840 }
1841
1842 if (!sg_res && nr_pages)
1843 sg = sg_next(sg);
1844 }
1845 return 0;
1846}
1847
1848static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1849 struct scatterlist *sg, unsigned long nr_pages,
1850 int prot)
1851{
1852 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1853}
1854
1855static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1856 unsigned long phys_pfn, unsigned long nr_pages,
1857 int prot)
1858{
1859 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1860}
1861
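/*
 * Clear the context entry for bus/devfn and invalidate the context and
 * IOTLB caches globally on this iommu.
 */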
1862static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1863{
1864 if (!iommu)
1865 return;
1866
1867 clear_context_table(iommu, bus, devfn);
1868 iommu->flush.flush_context(iommu, 0, 0, 0,
1869 DMA_CCMD_GLOBAL_INVL);
1870 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1871}
1872
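/*
 * Unlink every device_domain_info attached to this domain, detach the
 * corresponding devices from their iommus and free the info structures.
 */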
1873static void domain_remove_dev_info(struct dmar_domain *domain)
1874{
1875 struct device_domain_info *info;
1876 unsigned long flags;
1877 struct intel_iommu *iommu;
1878
1879 spin_lock_irqsave(&device_domain_lock, flags);
1880 while (!list_empty(&domain->devices)) {
1881 info = list_entry(domain->devices.next,
1882 struct device_domain_info, link);
1883 list_del(&info->link);
1884 list_del(&info->global);
1885 if (info->dev)
1886 info->dev->dev.archdata.iommu = NULL;
1887 spin_unlock_irqrestore(&device_domain_lock, flags);
1888
1889 iommu_disable_dev_iotlb(info);
1890 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1891 iommu_detach_dev(iommu, info->bus, info->devfn);
1892 free_devinfo_mem(info);
1893
1894 spin_lock_irqsave(&device_domain_lock, flags);
1895 }
1896 spin_unlock_irqrestore(&device_domain_lock, flags);
1897}
1898
1899/*
1900 * find_domain
1901 * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1902 */
1903static struct dmar_domain *
1904find_domain(struct pci_dev *pdev)
1905{
1906 struct device_domain_info *info;
1907
1908 /* No lock here, assumes no domain exit in normal case */
1909 info = pdev->dev.archdata.iommu;
1910 if (info)
1911 return info->domain;
1912 return NULL;
1913}
1914
1915/* find or allocate an initialized domain for the device */
1916static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1917{
1918 struct dmar_domain *domain, *found = NULL;
1919 struct intel_iommu *iommu;
1920 struct dmar_drhd_unit *drhd;
1921 struct device_domain_info *info, *tmp;
1922 struct pci_dev *dev_tmp;
1923 unsigned long flags;
1924 int bus = 0, devfn = 0;
1925 int segment;
1926 int ret;
1927
1928 domain = find_domain(pdev);
1929 if (domain)
1930 return domain;
1931
1932 segment = pci_domain_nr(pdev->bus);
1933
1934 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1935 if (dev_tmp) {
1936 if (pci_is_pcie(dev_tmp)) {
1937 bus = dev_tmp->subordinate->number;
1938 devfn = 0;
1939 } else {
1940 bus = dev_tmp->bus->number;
1941 devfn = dev_tmp->devfn;
1942 }
1943 spin_lock_irqsave(&device_domain_lock, flags);
1944 list_for_each_entry(info, &device_domain_list, global) {
1945 if (info->segment == segment &&
1946 info->bus == bus && info->devfn == devfn) {
1947 found = info->domain;
1948 break;
1949 }
1950 }
1951 spin_unlock_irqrestore(&device_domain_lock, flags);
1952		/* pcie-pci bridge already has a domain, use it */
1953 if (found) {
1954 domain = found;
1955 goto found_domain;
1956 }
1957 }
1958
1959 domain = alloc_domain();
1960 if (!domain)
1961 goto error;
1962
1963 /* Allocate new domain for the device */
1964 drhd = dmar_find_matched_drhd_unit(pdev);
1965 if (!drhd) {
1966 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1967 pci_name(pdev));
1968 return NULL;
1969 }
1970 iommu = drhd->iommu;
1971
1972 ret = iommu_attach_domain(domain, iommu);
1973 if (ret) {
1974 free_domain_mem(domain);
1975 goto error;
1976 }
1977
1978 if (domain_init(domain, gaw)) {
1979 domain_exit(domain);
1980 goto error;
1981 }
1982
1983 /* register pcie-to-pci device */
1984 if (dev_tmp) {
1985 info = alloc_devinfo_mem();
1986 if (!info) {
1987 domain_exit(domain);
1988 goto error;
1989 }
1990 info->segment = segment;
1991 info->bus = bus;
1992 info->devfn = devfn;
1993 info->dev = NULL;
1994 info->domain = domain;
1995 /* This domain is shared by devices under p2p bridge */
1996 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1997
1998		/* pcie-to-pci bridge already has a domain, use it */
1999 found = NULL;
2000 spin_lock_irqsave(&device_domain_lock, flags);
2001 list_for_each_entry(tmp, &device_domain_list, global) {
2002 if (tmp->segment == segment &&
2003 tmp->bus == bus && tmp->devfn == devfn) {
2004 found = tmp->domain;
2005 break;
2006 }
2007 }
2008 if (found) {
2009 spin_unlock_irqrestore(&device_domain_lock, flags);
2010 free_devinfo_mem(info);
2011 domain_exit(domain);
2012 domain = found;
2013 } else {
2014 list_add(&info->link, &domain->devices);
2015 list_add(&info->global, &device_domain_list);
2016 spin_unlock_irqrestore(&device_domain_lock, flags);
2017 }
2018 }
2019
2020found_domain:
2021 info = alloc_devinfo_mem();
2022 if (!info)
2023 goto error;
2024 info->segment = segment;
2025 info->bus = pdev->bus->number;
2026 info->devfn = pdev->devfn;
2027 info->dev = pdev;
2028 info->domain = domain;
2029 spin_lock_irqsave(&device_domain_lock, flags);
2030	/* somebody else was faster and already set up a domain */
2031 found = find_domain(pdev);
2032 if (found != NULL) {
2033 spin_unlock_irqrestore(&device_domain_lock, flags);
2034 if (found != domain) {
2035 domain_exit(domain);
2036 domain = found;
2037 }
2038 free_devinfo_mem(info);
2039 return domain;
2040 }
2041 list_add(&info->link, &domain->devices);
2042 list_add(&info->global, &device_domain_list);
2043 pdev->dev.archdata.iommu = info;
2044 spin_unlock_irqrestore(&device_domain_lock, flags);
2045 return domain;
2046error:
2047	/* recheck it here; another path may have set it meanwhile */
2048 return find_domain(pdev);
2049}
2050
2051static int iommu_identity_mapping;
2052#define IDENTMAP_ALL 1
2053#define IDENTMAP_GFX 2
2054#define IDENTMAP_AZALIA 4
2055
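/*
 * Reserve the iova range [start, end] in the domain and install a 1:1
 * (identity) mapping for it, clearing any PTEs left over from an RMRR
 * range that overlaps physical memory.
 */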
2056static int iommu_domain_identity_map(struct dmar_domain *domain,
2057 unsigned long long start,
2058 unsigned long long end)
2059{
2060 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2061 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2062
2063 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2064 dma_to_mm_pfn(last_vpfn))) {
2065 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2066 return -ENOMEM;
2067 }
2068
2069 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2070 start, end, domain->id);
2071 /*
2072 * RMRR range might have overlap with physical memory range,
2073 * clear it first
2074 */
2075 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2076
2077 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2078 last_vpfn - first_vpfn + 1,
2079 DMA_PTE_READ|DMA_PTE_WRITE);
2080}
2081
2082static int iommu_prepare_identity_map(struct pci_dev *pdev,
2083 unsigned long long start,
2084 unsigned long long end)
2085{
2086 struct dmar_domain *domain;
2087 int ret;
2088
2089 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2090 if (!domain)
2091 return -ENOMEM;
2092
2093	/* For _hardware_ passthrough, don't bother. But for software
2094	   passthrough, we do it anyway -- it may indicate a memory
2095	   range which is reserved in E820, and so didn't get set
2096	   up to start with in si_domain */
2097 if (domain == si_domain && hw_pass_through) {
2098 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2099 pci_name(pdev), start, end);
2100 return 0;
2101 }
2102
2103 printk(KERN_INFO
2104 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2105 pci_name(pdev), start, end);
2106
2107 if (end < start) {
2108 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2109 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2110 dmi_get_system_info(DMI_BIOS_VENDOR),
2111 dmi_get_system_info(DMI_BIOS_VERSION),
2112 dmi_get_system_info(DMI_PRODUCT_VERSION));
2113 ret = -EIO;
2114 goto error;
2115 }
2116
2117 if (end >> agaw_to_width(domain->agaw)) {
2118 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2119 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2120 agaw_to_width(domain->agaw),
2121 dmi_get_system_info(DMI_BIOS_VENDOR),
2122 dmi_get_system_info(DMI_BIOS_VERSION),
2123 dmi_get_system_info(DMI_PRODUCT_VERSION));
2124 ret = -EIO;
2125 goto error;
2126 }
2127
2128 ret = iommu_domain_identity_map(domain, start, end);
2129 if (ret)
2130 goto error;
2131
2132 /* context entry init */
2133 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2134 if (ret)
2135 goto error;
2136
2137 return 0;
2138
2139 error:
2140 domain_exit(domain);
2141 return ret;
2142}
2143
2144static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2145 struct pci_dev *pdev)
2146{
2147 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2148 return 0;
2149 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2150 rmrr->end_address);
2151}
2152
2153#ifdef CONFIG_DMAR_FLOPPY_WA
2154static inline void iommu_prepare_isa(void)
2155{
2156 struct pci_dev *pdev;
2157 int ret;
2158
2159 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2160 if (!pdev)
2161 return;
2162
2163 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2164 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2165
2166 if (ret)
2167 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2168 "floppy might not work\n");
2169
2170}
2171#else
2172static inline void iommu_prepare_isa(void)
2173{
2174 return;
2175}
2176#endif /* CONFIG_DMAR_FLOPPY_WA */
2177
2178static int md_domain_init(struct dmar_domain *domain, int guest_width);
2179
2180static int __init si_domain_work_fn(unsigned long start_pfn,
2181 unsigned long end_pfn, void *datax)
2182{
2183 int *ret = datax;
2184
2185 *ret = iommu_domain_identity_map(si_domain,
2186 (uint64_t)start_pfn << PAGE_SHIFT,
2187 (uint64_t)end_pfn << PAGE_SHIFT);
2188 return *ret;
2189
2190}
2191
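/*
 * Allocate the static identity (si) domain, attach it to every active iommu
 * and, unless hardware pass-through is used, identity-map all usable memory
 * regions into it.
 */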
2192static int __init si_domain_init(int hw)
2193{
2194 struct dmar_drhd_unit *drhd;
2195 struct intel_iommu *iommu;
2196 int nid, ret = 0;
2197
2198 si_domain = alloc_domain();
2199 if (!si_domain)
2200 return -EFAULT;
2201
2202 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2203
2204 for_each_active_iommu(iommu, drhd) {
2205 ret = iommu_attach_domain(si_domain, iommu);
2206 if (ret) {
2207 domain_exit(si_domain);
2208 return -EFAULT;
2209 }
2210 }
2211
2212 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2213 domain_exit(si_domain);
2214 return -EFAULT;
2215 }
2216
2217 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2218
2219 if (hw)
2220 return 0;
2221
2222 for_each_online_node(nid) {
2223 work_with_active_regions(nid, si_domain_work_fn, &ret);
2224 if (ret)
2225 return ret;
2226 }
2227
2228 return 0;
2229}
2230
2231static void domain_remove_one_dev_info(struct dmar_domain *domain,
2232 struct pci_dev *pdev);
2233static int identity_mapping(struct pci_dev *pdev)
2234{
2235 struct device_domain_info *info;
2236
2237 if (likely(!iommu_identity_mapping))
2238 return 0;
2239
2240 info = pdev->dev.archdata.iommu;
2241 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2242 return (info->domain == si_domain);
2243
2244 return 0;
2245}
2246
2247static int domain_add_dev_info(struct dmar_domain *domain,
2248 struct pci_dev *pdev,
2249 int translation)
2250{
2251 struct device_domain_info *info;
2252 unsigned long flags;
2253 int ret;
2254
2255 info = alloc_devinfo_mem();
2256 if (!info)
2257 return -ENOMEM;
2258
2259 ret = domain_context_mapping(domain, pdev, translation);
2260 if (ret) {
2261 free_devinfo_mem(info);
2262 return ret;
2263 }
2264
2265 info->segment = pci_domain_nr(pdev->bus);
2266 info->bus = pdev->bus->number;
2267 info->devfn = pdev->devfn;
2268 info->dev = pdev;
2269 info->domain = domain;
2270
2271 spin_lock_irqsave(&device_domain_lock, flags);
2272 list_add(&info->link, &domain->devices);
2273 list_add(&info->global, &device_domain_list);
2274 pdev->dev.archdata.iommu = info;
2275 spin_unlock_irqrestore(&device_domain_lock, flags);
2276
2277 return 0;
2278}
2279
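/*
 * Decide whether a device should live in the static identity (1:1) domain,
 * based on the identity-mapping policy, the device's position in the PCI
 * topology and, after boot, its DMA mask.
 */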
2280static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2281{
2282 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2283 return 1;
2284
2285 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2286 return 1;
2287
2288 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2289 return 0;
2290
2291 /*
2292 * We want to start off with all devices in the 1:1 domain, and
2293 * take them out later if we find they can't access all of memory.
2294 *
2295 * However, we can't do this for PCI devices behind bridges,
2296 * because all PCI devices behind the same bridge will end up
2297 * with the same source-id on their transactions.
2298 *
2299 * Practically speaking, we can't change things around for these
2300 * devices at run-time, because we can't be sure there'll be no
2301 * DMA transactions in flight for any of their siblings.
2302 *
2303 * So PCI devices (unless they're on the root bus) as well as
2304 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2305 * the 1:1 domain, just in _case_ one of their siblings turns out
2306 * not to be able to map all of memory.
2307 */
2308 if (!pci_is_pcie(pdev)) {
2309 if (!pci_is_root_bus(pdev->bus))
2310 return 0;
2311 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2312 return 0;
2313 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2314 return 0;
2315
2316 /*
2317 * At boot time, we don't yet know if devices will be 64-bit capable.
2318 * Assume that they will -- if they turn out not to be, then we can
2319 * take them out of the 1:1 domain later.
2320 */
2321 if (!startup) {
2322 /*
2323 * If the device's dma_mask is less than the system's memory
2324 * size then this is not a candidate for identity mapping.
2325 */
2326 u64 dma_mask = pdev->dma_mask;
2327
2328 if (pdev->dev.coherent_dma_mask &&
2329 pdev->dev.coherent_dma_mask < dma_mask)
2330 dma_mask = pdev->dev.coherent_dma_mask;
2331
2332 return dma_mask >= dma_get_required_mask(&pdev->dev);
2333 }
2334
2335 return 1;
2336}
2337
2338static int __init iommu_prepare_static_identity_mapping(int hw)
2339{
2340 struct pci_dev *pdev = NULL;
2341 int ret;
2342
2343 ret = si_domain_init(hw);
2344 if (ret)
2345 return -EFAULT;
2346
2347 for_each_pci_dev(pdev) {
2348 /* Skip Host/PCI Bridge devices */
2349 if (IS_BRIDGE_HOST_DEVICE(pdev))
2350 continue;
2351 if (iommu_should_identity_map(pdev, 1)) {
2352 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2353 hw ? "hardware" : "software", pci_name(pdev));
2354
2355 ret = domain_add_dev_info(si_domain, pdev,
2356 hw ? CONTEXT_TT_PASS_THROUGH :
2357 CONTEXT_TT_MULTI_LEVEL);
2358 if (ret)
2359 return ret;
2360 }
2361 }
2362
2363 return 0;
2364}
2365
2366static int __init init_dmars(void)
2367{
2368 struct dmar_drhd_unit *drhd;
2369 struct dmar_rmrr_unit *rmrr;
2370 struct pci_dev *pdev;
2371 struct intel_iommu *iommu;
2372 int i, ret;
2373
2374 /*
2375 * for each drhd
2376 * allocate root
2377 * initialize and program root entry to not present
2378 * endfor
2379 */
2380 for_each_drhd_unit(drhd) {
2381 g_num_of_iommus++;
2382		/*
2383		 * lock not needed as this is only incremented in the
2384		 * single-threaded kernel __init code path; all other
2385		 * accesses are read-only
2386		 */
2387 }
2388
2389 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2390 GFP_KERNEL);
2391 if (!g_iommus) {
2392 printk(KERN_ERR "Allocating global iommu array failed\n");
2393 ret = -ENOMEM;
2394 goto error;
2395 }
2396
2397 deferred_flush = kzalloc(g_num_of_iommus *
2398 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2399 if (!deferred_flush) {
2400 ret = -ENOMEM;
2401 goto error;
2402 }
2403
2404 for_each_drhd_unit(drhd) {
2405 if (drhd->ignored)
2406 continue;
2407
2408 iommu = drhd->iommu;
2409 g_iommus[iommu->seq_id] = iommu;
2410
2411 ret = iommu_init_domains(iommu);
2412 if (ret)
2413 goto error;
2414
2415		/*
2416		 * TBD:
2417		 * we could share the same root & context tables
2418		 * among all IOMMUs. Need to split this out later.
2419		 */
2420 ret = iommu_alloc_root_entry(iommu);
2421 if (ret) {
2422 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2423 goto error;
2424 }
2425 if (!ecap_pass_through(iommu->ecap))
2426 hw_pass_through = 0;
2427 }
2428
2429	/*
2430	 * Start from a sane iommu hardware state.
2431	 */
2432 for_each_drhd_unit(drhd) {
2433 if (drhd->ignored)
2434 continue;
2435
2436 iommu = drhd->iommu;
2437
2438		/*
2439		 * If queued invalidation was already initialized by us
2440		 * (for example, while enabling interrupt-remapping) then
2441		 * things are already rolling from a sane state.
2442		 */
2443 if (iommu->qi)
2444 continue;
2445
2446 /*
2447 * Clear any previous faults.
2448 */
2449 dmar_fault(-1, iommu);
2450 /*
2451 * Disable queued invalidation if supported and already enabled
2452 * before OS handover.
2453 */
2454 dmar_disable_qi(iommu);
2455 }
2456
2457 for_each_drhd_unit(drhd) {
2458 if (drhd->ignored)
2459 continue;
2460
2461 iommu = drhd->iommu;
2462
2463 if (dmar_enable_qi(iommu)) {
2464 /*
2465 * Queued Invalidate not enabled, use Register Based
2466 * Invalidate
2467 */
2468 iommu->flush.flush_context = __iommu_flush_context;
2469 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2470 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2471 "invalidation\n",
2472 iommu->seq_id,
2473 (unsigned long long)drhd->reg_base_addr);
2474 } else {
2475 iommu->flush.flush_context = qi_flush_context;
2476 iommu->flush.flush_iotlb = qi_flush_iotlb;
2477 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2478 "invalidation\n",
2479 iommu->seq_id,
2480 (unsigned long long)drhd->reg_base_addr);
2481 }
2482 }
2483
2484 if (iommu_pass_through)
2485 iommu_identity_mapping |= IDENTMAP_ALL;
2486
2487#ifdef CONFIG_DMAR_BROKEN_GFX_WA
2488 iommu_identity_mapping |= IDENTMAP_GFX;
2489#endif
2490
2491 check_tylersburg_isoch();
2492
2493	/*
2494	 * If any identity mapping was requested (pass-through, or the gfx /
2495	 * azalia workarounds), set up the static identity (si) domain and add
2496	 * the selected devices to it before handling RMRR and ISA regions.
2497	 */
2498 if (iommu_identity_mapping) {
2499 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2500 if (ret) {
2501 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2502 goto error;
2503 }
2504 }
2505 /*
2506 * For each rmrr
2507 * for each dev attached to rmrr
2508 * do
2509 * locate drhd for dev, alloc domain for dev
2510 * allocate free domain
2511 * allocate page table entries for rmrr
2512 * if context not allocated for bus
2513 * allocate and init context
2514 * set present in root table for this bus
2515 * init context with domain, translation etc
2516 * endfor
2517 * endfor
2518 */
2519 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2520 for_each_rmrr_units(rmrr) {
2521 for (i = 0; i < rmrr->devices_cnt; i++) {
2522 pdev = rmrr->devices[i];
2523			/*
2524			 * some BIOSes list non-existent devices in the
2525			 * DMAR table.
2526			 */
2527 if (!pdev)
2528 continue;
2529 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2530 if (ret)
2531 printk(KERN_ERR
2532 "IOMMU: mapping reserved region failed\n");
2533 }
2534 }
2535
2536 iommu_prepare_isa();
2537
2538 /*
2539 * for each drhd
2540 * enable fault log
2541 * global invalidate context cache
2542 * global invalidate iotlb
2543 * enable translation
2544 */
2545 for_each_drhd_unit(drhd) {
2546 if (drhd->ignored) {
2547 /*
2548 * we always have to disable PMRs or DMA may fail on
2549 * this device
2550 */
2551 if (force_on)
2552 iommu_disable_protect_mem_regions(drhd->iommu);
2553 continue;
2554 }
2555 iommu = drhd->iommu;
2556
2557 iommu_flush_write_buffer(iommu);
2558
2559 ret = dmar_set_interrupt(iommu);
2560 if (ret)
2561 goto error;
2562
2563 iommu_set_root_entry(iommu);
2564
2565 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2566 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2567
2568 ret = iommu_enable_translation(iommu);
2569 if (ret)
2570 goto error;
2571
2572 iommu_disable_protect_mem_regions(iommu);
2573 }
2574
2575 return 0;
2576error:
2577 for_each_drhd_unit(drhd) {
2578 if (drhd->ignored)
2579 continue;
2580 iommu = drhd->iommu;
2581 free_iommu(iommu);
2582 }
2583 kfree(g_iommus);
2584 return ret;
2585}
2586
2587/* This takes a number of _MM_ pages, not VTD pages */
2588static struct iova *intel_alloc_iova(struct device *dev,
2589 struct dmar_domain *domain,
2590 unsigned long nrpages, uint64_t dma_mask)
2591{
2592 struct pci_dev *pdev = to_pci_dev(dev);
2593 struct iova *iova = NULL;
2594
2595 /* Restrict dma_mask to the width that the iommu can handle */
2596 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2597
2598 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2599 /*
2600 * First try to allocate an io virtual address in
2601 * DMA_BIT_MASK(32) and if that fails then try allocating
2602 * from higher range
2603 */
2604 iova = alloc_iova(&domain->iovad, nrpages,
2605 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2606 if (iova)
2607 return iova;
2608 }
2609 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2610 if (unlikely(!iova)) {
2611 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2612 nrpages, pci_name(pdev));
2613 return NULL;
2614 }
2615
2616 return iova;
2617}
2618
2619static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2620{
2621 struct dmar_domain *domain;
2622 int ret;
2623
2624 domain = get_domain_for_dev(pdev,
2625 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2626 if (!domain) {
2627 printk(KERN_ERR
2628 "Allocating domain for %s failed", pci_name(pdev));
2629 return NULL;
2630 }
2631
2632 /* make sure context mapping is ok */
2633 if (unlikely(!domain_context_mapped(pdev))) {
2634 ret = domain_context_mapping(domain, pdev,
2635 CONTEXT_TT_MULTI_LEVEL);
2636 if (ret) {
2637 printk(KERN_ERR
2638 "Domain context map for %s failed",
2639 pci_name(pdev));
2640 return NULL;
2641 }
2642 }
2643
2644 return domain;
2645}
2646
2647static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2648{
2649 struct device_domain_info *info;
2650
2651 /* No lock here, assumes no domain exit in normal case */
2652 info = dev->dev.archdata.iommu;
2653 if (likely(info))
2654 return info->domain;
2655
2656 return __get_valid_domain_for_dev(dev);
2657}
2658
2659static int iommu_dummy(struct pci_dev *pdev)
2660{
2661 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2662}
2663
2664/* Check if the pdev needs to go through the non-identity map and unmap process. */
2665static int iommu_no_mapping(struct device *dev)
2666{
2667 struct pci_dev *pdev;
2668 int found;
2669
2670 if (unlikely(dev->bus != &pci_bus_type))
2671 return 1;
2672
2673 pdev = to_pci_dev(dev);
2674 if (iommu_dummy(pdev))
2675 return 1;
2676
2677 if (!iommu_identity_mapping)
2678 return 0;
2679
2680 found = identity_mapping(pdev);
2681 if (found) {
2682 if (iommu_should_identity_map(pdev, 0))
2683 return 1;
2684 else {
2685			/*
2686			 * The device is only 32 bit DMA capable: remove it
2687			 * from si_domain and fall back to non-identity mapping.
2688			 */
2689 domain_remove_one_dev_info(si_domain, pdev);
2690 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2691 pci_name(pdev));
2692 return 0;
2693 }
2694 } else {
2695		/*
2696		 * A 64 bit DMA capable device that was detached from a
2697		 * vm domain is put back into si_domain for identity mapping.
2698		 */
2699 if (iommu_should_identity_map(pdev, 0)) {
2700 int ret;
2701 ret = domain_add_dev_info(si_domain, pdev,
2702 hw_pass_through ?
2703 CONTEXT_TT_PASS_THROUGH :
2704 CONTEXT_TT_MULTI_LEVEL);
2705 if (!ret) {
2706 printk(KERN_INFO "64bit %s uses identity mapping\n",
2707 pci_name(pdev));
2708 return 1;
2709 }
2710 }
2711 }
2712
2713 return 0;
2714}
2715
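/*
 * Map a physically contiguous buffer for DMA: find (or create) the device's
 * domain, allocate an iova range that fits dma_mask, install the PTEs and
 * flush the IOTLB or write buffer as the caching mode requires. Returns the
 * bus address, or 0 on failure.
 */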
2716static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2717 size_t size, int dir, u64 dma_mask)
2718{
2719 struct pci_dev *pdev = to_pci_dev(hwdev);
2720 struct dmar_domain *domain;
2721 phys_addr_t start_paddr;
2722 struct iova *iova;
2723 int prot = 0;
2724 int ret;
2725 struct intel_iommu *iommu;
2726 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2727
2728 BUG_ON(dir == DMA_NONE);
2729
2730 if (iommu_no_mapping(hwdev))
2731 return paddr;
2732
2733 domain = get_valid_domain_for_dev(pdev);
2734 if (!domain)
2735 return 0;
2736
2737 iommu = domain_get_iommu(domain);
2738 size = aligned_nrpages(paddr, size);
2739
2740 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2741 if (!iova)
2742 goto error;
2743
2744	/*
2745	 * Check if DMAR supports zero-length reads on write-only
2746	 * mappings.
2747	 */
2748 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2749 !cap_zlr(iommu->cap))
2750 prot |= DMA_PTE_READ;
2751 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2752 prot |= DMA_PTE_WRITE;
2753	/*
2754	 * paddr - (paddr + size) might cover only part of a page; map the
2755	 * whole page.  Note: if two parts of one page are mapped separately,
2756	 * we might have two guest addresses mapping to the same host paddr,
2757	 * but this is not a big problem
2758	 */
2759 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2760 mm_to_dma_pfn(paddr_pfn), size, prot);
2761 if (ret)
2762 goto error;
2763
2764 /* it's a non-present to present mapping. Only flush if caching mode */
2765 if (cap_caching_mode(iommu->cap))
2766 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2767 else
2768 iommu_flush_write_buffer(iommu);
2769
2770 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2771 start_paddr += paddr & ~PAGE_MASK;
2772 return start_paddr;
2773
2774error:
2775 if (iova)
2776 __free_iova(&domain->iovad, iova);
2777	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2778 pci_name(pdev), size, (unsigned long long)paddr, dir);
2779 return 0;
2780}
2781
2782static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2783 unsigned long offset, size_t size,
2784 enum dma_data_direction dir,
2785 struct dma_attrs *attrs)
2786{
2787 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2788 dir, to_pci_dev(dev)->dma_mask);
2789}
2790
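/*
 * Process the deferred-unmap queue: flush the IOTLB for each iommu (page-
 * selective in caching mode, global plus dev-IOTLB otherwise) and free the
 * queued iovas. Called with async_umap_flush_lock held.
 */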
2791static void flush_unmaps(void)
2792{
2793 int i, j;
2794
2795 timer_on = 0;
2796
2797 /* just flush them all */
2798 for (i = 0; i < g_num_of_iommus; i++) {
2799 struct intel_iommu *iommu = g_iommus[i];
2800 if (!iommu)
2801 continue;
2802
2803 if (!deferred_flush[i].next)
2804 continue;
2805
2806		/* In caching mode, global flushes make emulation expensive */
2807 if (!cap_caching_mode(iommu->cap))
2808 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2809 DMA_TLB_GLOBAL_FLUSH);
2810 for (j = 0; j < deferred_flush[i].next; j++) {
2811 unsigned long mask;
2812 struct iova *iova = deferred_flush[i].iova[j];
2813 struct dmar_domain *domain = deferred_flush[i].domain[j];
2814
2815 /* On real hardware multiple invalidations are expensive */
2816 if (cap_caching_mode(iommu->cap))
2817 iommu_flush_iotlb_psi(iommu, domain->id,
2818 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2819 else {
2820 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2821 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2822 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2823 }
2824 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2825 }
2826 deferred_flush[i].next = 0;
2827 }
2828
2829 list_size = 0;
2830}
2831
2832static void flush_unmaps_timeout(unsigned long data)
2833{
2834 unsigned long flags;
2835
2836 spin_lock_irqsave(&async_umap_flush_lock, flags);
2837 flush_unmaps();
2838 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2839}
2840
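/*
 * Queue an iova on its iommu's deferred_flush list and arm the unmap timer;
 * if the global list size hits the high-water mark, flush immediately.
 */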
2841static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2842{
2843 unsigned long flags;
2844 int next, iommu_id;
2845 struct intel_iommu *iommu;
2846
2847 spin_lock_irqsave(&async_umap_flush_lock, flags);
2848 if (list_size == HIGH_WATER_MARK)
2849 flush_unmaps();
2850
2851 iommu = domain_get_iommu(dom);
2852 iommu_id = iommu->seq_id;
2853
2854 next = deferred_flush[iommu_id].next;
2855 deferred_flush[iommu_id].domain[next] = dom;
2856 deferred_flush[iommu_id].iova[next] = iova;
2857 deferred_flush[iommu_id].next++;
2858
2859 if (!timer_on) {
2860 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2861 timer_on = 1;
2862 }
2863 list_size++;
2864 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2865}
2866
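/*
 * Tear down the DMA mapping for dev_addr: clear the PTEs and free the page
 * tables covering the iova. In strict mode the IOTLB is flushed and the
 * iova freed immediately; otherwise the iova is queued for a deferred flush.
 */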
2867static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2868 size_t size, enum dma_data_direction dir,
2869 struct dma_attrs *attrs)
2870{
2871 struct pci_dev *pdev = to_pci_dev(dev);
2872 struct dmar_domain *domain;
2873 unsigned long start_pfn, last_pfn;
2874 struct iova *iova;
2875 struct intel_iommu *iommu;
2876
2877 if (iommu_no_mapping(dev))
2878 return;
2879
2880 domain = find_domain(pdev);
2881 BUG_ON(!domain);
2882
2883 iommu = domain_get_iommu(domain);
2884
2885 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2886 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2887 (unsigned long long)dev_addr))
2888 return;
2889
2890 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2891 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2892
2893 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2894 pci_name(pdev), start_pfn, last_pfn);
2895
2896 /* clear the whole page */
2897 dma_pte_clear_range(domain, start_pfn, last_pfn);
2898
2899 /* free page tables */
2900 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2901
2902 if (intel_iommu_strict) {
2903 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2904 last_pfn - start_pfn + 1, 0);
2905 /* free iova */
2906 __free_iova(&domain->iovad, iova);
2907 } else {
2908 add_unmap(domain, iova);
2909		/*
2910		 * queue up the release of the unmap to save roughly 1/6th of
2911		 * the cpu time used up by the iotlb flush operation...
2912		 */
2913 }
2914}
2915
2916static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2917 dma_addr_t *dma_handle, gfp_t flags)
2918{
2919 void *vaddr;
2920 int order;
2921
2922 size = PAGE_ALIGN(size);
2923 order = get_order(size);
2924
2925 if (!iommu_no_mapping(hwdev))
2926 flags &= ~(GFP_DMA | GFP_DMA32);
2927 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2928 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2929 flags |= GFP_DMA;
2930 else
2931 flags |= GFP_DMA32;
2932 }
2933
2934 vaddr = (void *)__get_free_pages(flags, order);
2935 if (!vaddr)
2936 return NULL;
2937 memset(vaddr, 0, size);
2938
2939 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2940 DMA_BIDIRECTIONAL,
2941 hwdev->coherent_dma_mask);
2942 if (*dma_handle)
2943 return vaddr;
2944 free_pages((unsigned long)vaddr, order);
2945 return NULL;
2946}
2947
2948static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2949 dma_addr_t dma_handle)
2950{
2951 int order;
2952
2953 size = PAGE_ALIGN(size);
2954 order = get_order(size);
2955
2956 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2957 free_pages((unsigned long)vaddr, order);
2958}
2959
2960static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2961 int nelems, enum dma_data_direction dir,
2962 struct dma_attrs *attrs)
2963{
2964 struct pci_dev *pdev = to_pci_dev(hwdev);
2965 struct dmar_domain *domain;
2966 unsigned long start_pfn, last_pfn;
2967 struct iova *iova;
2968 struct intel_iommu *iommu;
2969
2970 if (iommu_no_mapping(hwdev))
2971 return;
2972
2973 domain = find_domain(pdev);
2974 BUG_ON(!domain);
2975
2976 iommu = domain_get_iommu(domain);
2977
2978 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2979 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2980 (unsigned long long)sglist[0].dma_address))
2981 return;
2982
2983 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2984 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2985
2986 /* clear the whole page */
2987 dma_pte_clear_range(domain, start_pfn, last_pfn);
2988
2989 /* free page tables */
2990 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2991
2992 if (intel_iommu_strict) {
2993 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2994 last_pfn - start_pfn + 1, 0);
2995 /* free iova */
2996 __free_iova(&domain->iovad, iova);
2997 } else {
2998 add_unmap(domain, iova);
2999		/*
3000		 * queue up the release of the unmap to save roughly 1/6th of
3001		 * the cpu time used up by the iotlb flush operation...
3002		 */
3003 }
3004}
3005
3006static int intel_nontranslate_map_sg(struct device *hddev,
3007 struct scatterlist *sglist, int nelems, int dir)
3008{
3009 int i;
3010 struct scatterlist *sg;
3011
3012 for_each_sg(sglist, sg, nelems, i) {
3013 BUG_ON(!sg_page(sg));
3014 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3015 sg->dma_length = sg->length;
3016 }
3017 return nelems;
3018}
3019
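/*
 * Map a scatterlist: allocate one iova range large enough for all entries,
 * install the PTEs via domain_sg_mapping() and flush the IOTLB or write
 * buffer as required. Falls back to a 1:1 translation for devices that
 * bypass the iommu.
 */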
3020static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3021 enum dma_data_direction dir, struct dma_attrs *attrs)
3022{
3023 int i;
3024 struct pci_dev *pdev = to_pci_dev(hwdev);
3025 struct dmar_domain *domain;
3026 size_t size = 0;
3027 int prot = 0;
3028 struct iova *iova = NULL;
3029 int ret;
3030 struct scatterlist *sg;
3031 unsigned long start_vpfn;
3032 struct intel_iommu *iommu;
3033
3034 BUG_ON(dir == DMA_NONE);
3035 if (iommu_no_mapping(hwdev))
3036 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3037
3038 domain = get_valid_domain_for_dev(pdev);
3039 if (!domain)
3040 return 0;
3041
3042 iommu = domain_get_iommu(domain);
3043
3044 for_each_sg(sglist, sg, nelems, i)
3045 size += aligned_nrpages(sg->offset, sg->length);
3046
3047 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3048 pdev->dma_mask);
3049 if (!iova) {
3050 sglist->dma_length = 0;
3051 return 0;
3052 }
3053
3054	/*
3055	 * Check if DMAR supports zero-length reads on write-only
3056	 * mappings.
3057	 */
3058 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3059 !cap_zlr(iommu->cap))
3060 prot |= DMA_PTE_READ;
3061 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3062 prot |= DMA_PTE_WRITE;
3063
3064 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3065
3066 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3067 if (unlikely(ret)) {
3068 /* clear the page */
3069 dma_pte_clear_range(domain, start_vpfn,
3070 start_vpfn + size - 1);
3071 /* free page tables */
3072 dma_pte_free_pagetable(domain, start_vpfn,
3073 start_vpfn + size - 1);
3074 /* free iova */
3075 __free_iova(&domain->iovad, iova);
3076 return 0;
3077 }
3078
3079 /* it's a non-present to present mapping. Only flush if caching mode */
3080 if (cap_caching_mode(iommu->cap))
3081 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3082 else
3083 iommu_flush_write_buffer(iommu);
3084
3085 return nelems;
3086}
3087
3088static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3089{
3090 return !dma_addr;
3091}
3092
3093struct dma_map_ops intel_dma_ops = {
3094 .alloc_coherent = intel_alloc_coherent,
3095 .free_coherent = intel_free_coherent,
3096 .map_sg = intel_map_sg,
3097 .unmap_sg = intel_unmap_sg,
3098 .map_page = intel_map_page,
3099 .unmap_page = intel_unmap_page,
3100 .mapping_error = intel_mapping_error,
3101};
3102
3103static inline int iommu_domain_cache_init(void)
3104{
3105 int ret = 0;
3106
3107 iommu_domain_cache = kmem_cache_create("iommu_domain",
3108 sizeof(struct dmar_domain),
3109 0,
3110 SLAB_HWCACHE_ALIGN,
3112 NULL);
3113 if (!iommu_domain_cache) {
3114 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3115 ret = -ENOMEM;
3116 }
3117
3118 return ret;
3119}
3120
3121static inline int iommu_devinfo_cache_init(void)
3122{
3123 int ret = 0;
3124
3125 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3126 sizeof(struct device_domain_info),
3127 0,
3128 SLAB_HWCACHE_ALIGN,
3129 NULL);
3130 if (!iommu_devinfo_cache) {
3131 printk(KERN_ERR "Couldn't create devinfo cache\n");
3132 ret = -ENOMEM;
3133 }
3134
3135 return ret;
3136}
3137
3138static inline int iommu_iova_cache_init(void)
3139{
3140 int ret = 0;
3141
3142 iommu_iova_cache = kmem_cache_create("iommu_iova",
3143 sizeof(struct iova),
3144 0,
3145 SLAB_HWCACHE_ALIGN,
3146 NULL);
3147 if (!iommu_iova_cache) {
3148 printk(KERN_ERR "Couldn't create iova cache\n");
3149 ret = -ENOMEM;
3150 }
3151
3152 return ret;
3153}
3154
3155static int __init iommu_init_mempool(void)
3156{
3157 int ret;
3158 ret = iommu_iova_cache_init();
3159 if (ret)
3160 return ret;
3161
3162 ret = iommu_domain_cache_init();
3163 if (ret)
3164 goto domain_error;
3165
3166 ret = iommu_devinfo_cache_init();
3167 if (!ret)
3168 return ret;
3169
3170 kmem_cache_destroy(iommu_domain_cache);
3171domain_error:
3172 kmem_cache_destroy(iommu_iova_cache);
3173
3174 return -ENOMEM;
3175}
3176
3177static void __init iommu_exit_mempool(void)
3178{
3179 kmem_cache_destroy(iommu_devinfo_cache);
3180 kmem_cache_destroy(iommu_domain_cache);
3181 kmem_cache_destroy(iommu_iova_cache);
3182
3183}
3184
3185static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3186{
3187 struct dmar_drhd_unit *drhd;
3188 u32 vtbar;
3189 int rc;
3190
3191 /* We know that this device on this chipset has its own IOMMU.
3192 * If we find it under a different IOMMU, then the BIOS is lying
3193 * to us. Hope that the IOMMU for this device is actually
3194 * disabled, and it needs no translation...
3195 */
3196 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3197 if (rc) {
3198 /* "can't" happen */
3199 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3200 return;
3201 }
3202 vtbar &= 0xffff0000;
3203
3204	/* we know that this iommu should be at offset 0xa000 from vtbar */
3205 drhd = dmar_find_matched_drhd_unit(pdev);
3206 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3207 TAINT_FIRMWARE_WORKAROUND,
3208 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3209 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3210}
3211DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3212
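/*
 * Mark DRHD units that cover no PCI devices as ignored and, when graphics
 * devices are not to be remapped (dmar_map_gfx off), bypass iommus that
 * serve only graphics devices.
 */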
3213static void __init init_no_remapping_devices(void)
3214{
3215 struct dmar_drhd_unit *drhd;
3216
3217 for_each_drhd_unit(drhd) {
3218 if (!drhd->include_all) {
3219 int i;
3220 for (i = 0; i < drhd->devices_cnt; i++)
3221 if (drhd->devices[i] != NULL)
3222 break;
3223 /* ignore DMAR unit if no pci devices exist */
3224 if (i == drhd->devices_cnt)
3225 drhd->ignored = 1;
3226 }
3227 }
3228
3229 if (dmar_map_gfx)
3230 return;
3231
3232 for_each_drhd_unit(drhd) {
3233 int i;
3234 if (drhd->ignored || drhd->include_all)
3235 continue;
3236
3237 for (i = 0; i < drhd->devices_cnt; i++)
3238 if (drhd->devices[i] &&
3239 !IS_GFX_DEVICE(drhd->devices[i]))
3240 break;
3241
3242 if (i < drhd->devices_cnt)
3243 continue;
3244
3245 /* bypass IOMMU if it is just for gfx devices */
3246 drhd->ignored = 1;
3247 for (i = 0; i < drhd->devices_cnt; i++) {
3248 if (!drhd->devices[i])
3249 continue;
3250 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3251 }
3252 }
3253}
3254
3255#ifdef CONFIG_SUSPEND
3256static int init_iommu_hw(void)
3257{
3258 struct dmar_drhd_unit *drhd;
3259 struct intel_iommu *iommu = NULL;
3260
3261 for_each_active_iommu(iommu, drhd)
3262 if (iommu->qi)
3263 dmar_reenable_qi(iommu);
3264
3265 for_each_iommu(iommu, drhd) {
3266 if (drhd->ignored) {
3267 /*
3268 * we always have to disable PMRs or DMA may fail on
3269 * this device
3270 */
3271 if (force_on)
3272 iommu_disable_protect_mem_regions(iommu);
3273 continue;
3274 }
3275
3276 iommu_flush_write_buffer(iommu);
3277
3278 iommu_set_root_entry(iommu);
3279
3280 iommu->flush.flush_context(iommu, 0, 0, 0,
3281 DMA_CCMD_GLOBAL_INVL);
3282 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3283 DMA_TLB_GLOBAL_FLUSH);
3284 if (iommu_enable_translation(iommu))
3285 return 1;
3286 iommu_disable_protect_mem_regions(iommu);
3287 }
3288
3289 return 0;
3290}
3291
3292static void iommu_flush_all(void)
3293{
3294 struct dmar_drhd_unit *drhd;
3295 struct intel_iommu *iommu;
3296
3297 for_each_active_iommu(iommu, drhd) {
3298 iommu->flush.flush_context(iommu, 0, 0, 0,
3299 DMA_CCMD_GLOBAL_INVL);
3300 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3301 DMA_TLB_GLOBAL_FLUSH);
3302 }
3303}
3304
3305static int iommu_suspend(void)
3306{
3307 struct dmar_drhd_unit *drhd;
3308 struct intel_iommu *iommu = NULL;
3309 unsigned long flag;
3310
3311 for_each_active_iommu(iommu, drhd) {
3312 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3313 GFP_ATOMIC);
3314 if (!iommu->iommu_state)
3315 goto nomem;
3316 }
3317
3318 iommu_flush_all();
3319
3320 for_each_active_iommu(iommu, drhd) {
3321 iommu_disable_translation(iommu);
3322
3323 spin_lock_irqsave(&iommu->register_lock, flag);
3324
3325 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3326 readl(iommu->reg + DMAR_FECTL_REG);
3327 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3328 readl(iommu->reg + DMAR_FEDATA_REG);
3329 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3330 readl(iommu->reg + DMAR_FEADDR_REG);
3331 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3332 readl(iommu->reg + DMAR_FEUADDR_REG);
3333
3334 spin_unlock_irqrestore(&iommu->register_lock, flag);
3335 }
3336 return 0;
3337
3338nomem:
3339 for_each_active_iommu(iommu, drhd)
3340 kfree(iommu->iommu_state);
3341
3342 return -ENOMEM;
3343}
3344
3345static void iommu_resume(void)
3346{
3347 struct dmar_drhd_unit *drhd;
3348 struct intel_iommu *iommu = NULL;
3349 unsigned long flag;
3350
3351 if (init_iommu_hw()) {
3352 if (force_on)
3353 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3354 else
3355 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3356 return;
3357 }
3358
3359 for_each_active_iommu(iommu, drhd) {
3360
3361 spin_lock_irqsave(&iommu->register_lock, flag);
3362
3363 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3364 iommu->reg + DMAR_FECTL_REG);
3365 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3366 iommu->reg + DMAR_FEDATA_REG);
3367 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3368 iommu->reg + DMAR_FEADDR_REG);
3369 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3370 iommu->reg + DMAR_FEUADDR_REG);
3371
3372 spin_unlock_irqrestore(&iommu->register_lock, flag);
3373 }
3374
3375 for_each_active_iommu(iommu, drhd)
3376 kfree(iommu->iommu_state);
3377}
3378
3379static struct syscore_ops iommu_syscore_ops = {
3380 .resume = iommu_resume,
3381 .suspend = iommu_suspend,
3382};
3383
3384static void __init init_iommu_pm_ops(void)
3385{
3386 register_syscore_ops(&iommu_syscore_ops);
3387}
3388
3389#else
3390static inline void init_iommu_pm_ops(void) {}
3391#endif /* CONFIG_SUSPEND */
3392
3393/*
3394 * Here we only respond to the unbinding of a device from its driver.
3395 *
3396 * A newly added device is not attached to its DMAR domain here yet. That
3397 * happens when the device is first mapped to an iova.
3398 */
3399static int device_notifier(struct notifier_block *nb,
3400 unsigned long action, void *data)
3401{
3402 struct device *dev = data;
3403 struct pci_dev *pdev = to_pci_dev(dev);
3404 struct dmar_domain *domain;
3405
3406 if (iommu_no_mapping(dev))
3407 return 0;
3408
3409 domain = find_domain(pdev);
3410 if (!domain)
3411 return 0;
3412
3413 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3414 domain_remove_one_dev_info(domain, pdev);
3415
3416 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3417 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3418 list_empty(&domain->devices))
3419 domain_exit(domain);
3420 }
3421
3422 return 0;
3423}
3424
3425static struct notifier_block device_nb = {
3426 .notifier_call = device_notifier,
3427};
3428
3429int __init intel_iommu_init(void)
3430{
3431 int ret = 0;
3432
3433 /* VT-d is required for a TXT/tboot launch, so enforce that */
3434 force_on = tboot_force_iommu();
3435
3436 if (dmar_table_init()) {
3437 if (force_on)
3438 panic("tboot: Failed to initialize DMAR table\n");
3439 return -ENODEV;
3440 }
3441
3442 if (dmar_dev_scope_init()) {
3443 if (force_on)
3444 panic("tboot: Failed to initialize DMAR device scope\n");
3445 return -ENODEV;
3446 }
3447
3448	/*
3449	 * Check the need for DMA-remapping initialization now.
3450	 * The initialization above will also be used by interrupt remapping.
3451	 */
3452 if (no_iommu || dmar_disabled)
3453 return -ENODEV;
3454
3455 if (iommu_init_mempool()) {
3456 if (force_on)
3457 panic("tboot: Failed to initialize iommu memory\n");
3458 return -ENODEV;
3459 }
3460
3461 if (dmar_init_reserved_ranges()) {
3462 if (force_on)
3463 panic("tboot: Failed to reserve iommu ranges\n");
3464 return -ENODEV;
3465 }
3466
3467 init_no_remapping_devices();
3468
3469 ret = init_dmars();
3470 if (ret) {
3471 if (force_on)
3472 panic("tboot: Failed to initialize DMARs\n");
3473 printk(KERN_ERR "IOMMU: dmar init failed\n");
3474 put_iova_domain(&reserved_iova_list);
3475 iommu_exit_mempool();
3476 return ret;
3477 }
3478 printk(KERN_INFO
3479 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3480
3481 init_timer(&unmap_timer);
3482#ifdef CONFIG_SWIOTLB
3483 swiotlb = 0;
3484#endif
3485 dma_ops = &intel_dma_ops;
3486
3487 init_iommu_pm_ops();
3488
3489 register_iommu(&intel_iommu_ops);
3490
3491 bus_register_notifier(&pci_bus_type, &device_nb);
3492
3493 return 0;
3494}
3495
3496static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3497 struct pci_dev *pdev)
3498{
3499 struct pci_dev *tmp, *parent;
3500
3501 if (!iommu || !pdev)
3502 return;
3503
3504 /* dependent device detach */
3505 tmp = pci_find_upstream_pcie_bridge(pdev);
3506 /* Secondary interface's bus number and devfn 0 */
3507 if (tmp) {
3508 parent = pdev->bus->self;
3509 while (parent != tmp) {
3510 iommu_detach_dev(iommu, parent->bus->number,
3511 parent->devfn);
3512 parent = parent->bus->self;
3513 }
3514 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3515 iommu_detach_dev(iommu,
3516 tmp->subordinate->number, 0);
3517 else /* this is a legacy PCI bridge */
3518 iommu_detach_dev(iommu, tmp->bus->number,
3519 tmp->devfn);
3520 }
3521}
3522
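/*
 * Remove a single device from its domain: detach it (and any dependent
 * bridges) from the iommu and, if it was the last device on that iommu,
 * drop the iommu from the domain and, for ordinary domains, release the
 * domain id on that iommu.
 */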
3523static void domain_remove_one_dev_info(struct dmar_domain *domain,
3524 struct pci_dev *pdev)
3525{
3526 struct device_domain_info *info;
3527 struct intel_iommu *iommu;
3528 unsigned long flags;
3529 int found = 0;
3530 struct list_head *entry, *tmp;
3531
3532 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3533 pdev->devfn);
3534 if (!iommu)
3535 return;
3536
3537 spin_lock_irqsave(&device_domain_lock, flags);
3538 list_for_each_safe(entry, tmp, &domain->devices) {
3539 info = list_entry(entry, struct device_domain_info, link);
3540 if (info->segment == pci_domain_nr(pdev->bus) &&
3541 info->bus == pdev->bus->number &&
3542 info->devfn == pdev->devfn) {
3543 list_del(&info->link);
3544 list_del(&info->global);
3545 if (info->dev)
3546 info->dev->dev.archdata.iommu = NULL;
3547 spin_unlock_irqrestore(&device_domain_lock, flags);
3548
3549 iommu_disable_dev_iotlb(info);
3550 iommu_detach_dev(iommu, info->bus, info->devfn);
3551 iommu_detach_dependent_devices(iommu, pdev);
3552 free_devinfo_mem(info);
3553
3554 spin_lock_irqsave(&device_domain_lock, flags);
3555
3556 if (found)
3557 break;
3558 else
3559 continue;
3560 }
3561
3562		/* if there are no other devices under the same iommu
3563		 * owned by this domain, clear this iommu in iommu_bmp,
3564		 * update the iommu count and coherency
3565		 */
3566 if (iommu == device_to_iommu(info->segment, info->bus,
3567 info->devfn))
3568 found = 1;
3569 }
3570
3571 if (found == 0) {
3572 unsigned long tmp_flags;
3573 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3574 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3575 domain->iommu_count--;
3576 domain_update_iommu_cap(domain);
3577 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3578
3579 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3580 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3581 spin_lock_irqsave(&iommu->lock, tmp_flags);
3582 clear_bit(domain->id, iommu->domain_ids);
3583 iommu->domains[domain->id] = NULL;
3584 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3585 }
3586 }
3587
3588 spin_unlock_irqrestore(&device_domain_lock, flags);
3589}
3590
3591static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3592{
3593 struct device_domain_info *info;
3594 struct intel_iommu *iommu;
3595 unsigned long flags1, flags2;
3596
3597 spin_lock_irqsave(&device_domain_lock, flags1);
3598 while (!list_empty(&domain->devices)) {
3599 info = list_entry(domain->devices.next,
3600 struct device_domain_info, link);
3601 list_del(&info->link);
3602 list_del(&info->global);
3603 if (info->dev)
3604 info->dev->dev.archdata.iommu = NULL;
3605
3606 spin_unlock_irqrestore(&device_domain_lock, flags1);
3607
3608 iommu_disable_dev_iotlb(info);
3609 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3610 iommu_detach_dev(iommu, info->bus, info->devfn);
3611 iommu_detach_dependent_devices(iommu, info->dev);
3612
3613 /* clear this iommu in iommu_bmp, update iommu count
3614 * and capabilities
3615 */
3616 spin_lock_irqsave(&domain->iommu_lock, flags2);
3617 if (test_and_clear_bit(iommu->seq_id,
3618 &domain->iommu_bmp)) {
3619 domain->iommu_count--;
3620 domain_update_iommu_cap(domain);
3621 }
3622 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3623
3624 free_devinfo_mem(info);
3625 spin_lock_irqsave(&device_domain_lock, flags1);
3626 }
3627 spin_unlock_irqrestore(&device_domain_lock, flags1);
3628}
3629
3630/* domain id for virtual machine; it won't be set in context */
3631static unsigned long vm_domid;
3632
3633static struct dmar_domain *iommu_alloc_vm_domain(void)
3634{
3635 struct dmar_domain *domain;
3636
3637 domain = alloc_domain_mem();
3638 if (!domain)
3639 return NULL;
3640
3641 domain->id = vm_domid++;
3642 domain->nid = -1;
3643 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3644 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3645
3646 return domain;
3647}
3648
3649static int md_domain_init(struct dmar_domain *domain, int guest_width)
3650{
3651 int adjust_width;
3652
3653 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3654 spin_lock_init(&domain->iommu_lock);
3655
3656 domain_reserve_special_ranges(domain);
3657
3658 /* calculate AGAW */
3659 domain->gaw = guest_width;
3660 adjust_width = guestwidth_to_adjustwidth(guest_width);
3661 domain->agaw = width_to_agaw(adjust_width);
3662
3663 INIT_LIST_HEAD(&domain->devices);
3664
3665 domain->iommu_count = 0;
3666 domain->iommu_coherency = 0;
3667 domain->iommu_snooping = 0;
3668 domain->iommu_superpage = 0;
3669 domain->max_addr = 0;
3670 domain->nid = -1;
3671
3672 /* always allocate the top pgd */
3673 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3674 if (!domain->pgd)
3675 return -ENOMEM;
3676 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3677 return 0;
3678}
3679
3680static void iommu_free_vm_domain(struct dmar_domain *domain)
3681{
3682 unsigned long flags;
3683 struct dmar_drhd_unit *drhd;
3684 struct intel_iommu *iommu;
3685 unsigned long i;
3686 unsigned long ndomains;
3687
3688 for_each_drhd_unit(drhd) {
3689 if (drhd->ignored)
3690 continue;
3691 iommu = drhd->iommu;
3692
3693 ndomains = cap_ndoms(iommu->cap);
3694 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3695 if (iommu->domains[i] == domain) {
3696 spin_lock_irqsave(&iommu->lock, flags);
3697 clear_bit(i, iommu->domain_ids);
3698 iommu->domains[i] = NULL;
3699 spin_unlock_irqrestore(&iommu->lock, flags);
3700 break;
3701 }
3702 }
3703 }
3704}
3705
3706static void vm_domain_exit(struct dmar_domain *domain)
3707{
3708	 /* Domain 0 is reserved, so don't process it */
3709 if (!domain)
3710 return;
3711
3712 vm_domain_remove_all_dev_info(domain);
3713 /* destroy iovas */
3714 put_iova_domain(&domain->iovad);
3715
3716 /* clear ptes */
3717 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3718
3719 /* free page tables */
3720 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3721
3722 iommu_free_vm_domain(domain);
3723 free_domain_mem(domain);
3724}
3725
3726static int intel_iommu_domain_init(struct iommu_domain *domain)
3727{
3728 struct dmar_domain *dmar_domain;
3729
3730 dmar_domain = iommu_alloc_vm_domain();
3731 if (!dmar_domain) {
3732 printk(KERN_ERR
3733 "intel_iommu_domain_init: dmar_domain == NULL\n");
3734 return -ENOMEM;
3735 }
3736 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3737 printk(KERN_ERR
3738 "intel_iommu_domain_init() failed\n");
3739 vm_domain_exit(dmar_domain);
3740 return -ENOMEM;
3741 }
3742 domain->priv = dmar_domain;
3743
3744 return 0;
3745}
3746
3747static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3748{
3749 struct dmar_domain *dmar_domain = domain->priv;
3750
3751 domain->priv = NULL;
3752 vm_domain_exit(dmar_domain);
3753}
3754
3755static int intel_iommu_attach_device(struct iommu_domain *domain,
3756 struct device *dev)
3757{
3758 struct dmar_domain *dmar_domain = domain->priv;
3759 struct pci_dev *pdev = to_pci_dev(dev);
3760 struct intel_iommu *iommu;
3761 int addr_width;
3762
3763 /* normally pdev is not mapped */
3764 if (unlikely(domain_context_mapped(pdev))) {
3765 struct dmar_domain *old_domain;
3766
3767 old_domain = find_domain(pdev);
3768 if (old_domain) {
3769 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3770 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3771 domain_remove_one_dev_info(old_domain, pdev);
3772 else
3773 domain_remove_dev_info(old_domain);
3774 }
3775 }
3776
3777 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3778 pdev->devfn);
3779 if (!iommu)
3780 return -ENODEV;
3781
3782 /* check if this iommu agaw is sufficient for max mapped address */
3783 addr_width = agaw_to_width(iommu->agaw);
3784 if (addr_width > cap_mgaw(iommu->cap))
3785 addr_width = cap_mgaw(iommu->cap);
3786
3787 if (dmar_domain->max_addr > (1LL << addr_width)) {
3788 printk(KERN_ERR "%s: iommu width (%d) is not "
3789 "sufficient for the mapped address (%llx)\n",
3790 __func__, addr_width, dmar_domain->max_addr);
3791 return -EFAULT;
3792 }
3793 dmar_domain->gaw = addr_width;
3794
3795 /*
3796 * Knock out extra levels of page tables if necessary
3797 */
3798 while (iommu->agaw < dmar_domain->agaw) {
3799 struct dma_pte *pte;
3800
3801 pte = dmar_domain->pgd;
3802 if (dma_pte_present(pte)) {
3803 dmar_domain->pgd = (struct dma_pte *)
3804 phys_to_virt(dma_pte_addr(pte));
3805 free_pgtable_page(pte);
3806 }
3807 dmar_domain->agaw--;
3808 }
3809
3810 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3811}
3812
3813static void intel_iommu_detach_device(struct iommu_domain *domain,
3814 struct device *dev)
3815{
3816 struct dmar_domain *dmar_domain = domain->priv;
3817 struct pci_dev *pdev = to_pci_dev(dev);
3818
3819 domain_remove_one_dev_info(dmar_domain, pdev);
3820}
3821
3822static int intel_iommu_map(struct iommu_domain *domain,
3823 unsigned long iova, phys_addr_t hpa,
3824 int gfp_order, int iommu_prot)
3825{
3826 struct dmar_domain *dmar_domain = domain->priv;
3827 u64 max_addr;
3828 int prot = 0;
3829 size_t size;
3830 int ret;
3831
3832 if (iommu_prot & IOMMU_READ)
3833 prot |= DMA_PTE_READ;
3834 if (iommu_prot & IOMMU_WRITE)
3835 prot |= DMA_PTE_WRITE;
3836 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3837 prot |= DMA_PTE_SNP;
3838
3839 size = PAGE_SIZE << gfp_order;
3840 max_addr = iova + size;
3841 if (dmar_domain->max_addr < max_addr) {
3842 u64 end;
3843
3844 /* check if minimum agaw is sufficient for mapped address */
3845 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3846 if (end < max_addr) {
3847 printk(KERN_ERR "%s: iommu width (%d) is not "
3848 "sufficient for the mapped address (%llx)\n",
3849 __func__, dmar_domain->gaw, max_addr);
3850 return -EFAULT;
3851 }
3852 dmar_domain->max_addr = max_addr;
3853 }
3854 /* Round up size to next multiple of PAGE_SIZE, if it and
3855 the low bits of hpa would take us onto the next page */
3856 size = aligned_nrpages(hpa, size);
3857 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3858 hpa >> VTD_PAGE_SHIFT, size, prot);
3859 return ret;
3860}
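/*
 * Worked example (illustrative, hypothetical values): with gfp_order = 0 the
 * request covers size = PAGE_SIZE.  If hpa = 0x1800, the low bits of hpa push
 * the range onto a second page, so the rounding above ends up mapping two
 * page frames starting at pfn 0x1, exactly as the rounding comment describes.
 */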
3861
3862static int intel_iommu_unmap(struct iommu_domain *domain,
3863 unsigned long iova, int gfp_order)
3864{
3865 struct dmar_domain *dmar_domain = domain->priv;
3866 size_t size = PAGE_SIZE << gfp_order;
3867
3868 dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3869 (iova + size - 1) >> VTD_PAGE_SHIFT);
3870
3871 if (dmar_domain->max_addr == iova + size)
3872 dmar_domain->max_addr = iova;
3873
3874 return gfp_order;
3875}
3876
3877static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3878 unsigned long iova)
3879{
3880 struct dmar_domain *dmar_domain = domain->priv;
3881 struct dma_pte *pte;
3882 u64 phys = 0;
3883
3884 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
3885 if (pte)
3886 phys = dma_pte_addr(pte);
3887
3888 return phys;
3889}
3890
3891static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3892 unsigned long cap)
3893{
3894 struct dmar_domain *dmar_domain = domain->priv;
3895
3896 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3897 return dmar_domain->iommu_snooping;
3898 if (cap == IOMMU_CAP_INTR_REMAP)
3899 return intr_remapping_enabled;
3900
3901 return 0;
3902}
3903
3904static struct iommu_ops intel_iommu_ops = {
3905 .domain_init = intel_iommu_domain_init,
3906 .domain_destroy = intel_iommu_domain_destroy,
3907 .attach_dev = intel_iommu_attach_device,
3908 .detach_dev = intel_iommu_detach_device,
3909 .map = intel_iommu_map,
3910 .unmap = intel_iommu_unmap,
3911 .iova_to_phys = intel_iommu_iova_to_phys,
3912 .domain_has_cap = intel_iommu_domain_has_cap,
3913};
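/*
 * Usage sketch (illustrative): these callbacks are reached through the
 * generic wrappers in drivers/iommu/iommu.c once the driver registers the
 * ops table, e.g. from a hypothetical caller holding a pci_dev *pdev:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, hpa, 0, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, 0);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 *
 * which lands in intel_iommu_domain_init(), intel_iommu_attach_device(),
 * intel_iommu_map() and the other handlers above.
 */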
3914
3915static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3916{
3917 /*
3918 * Mobile 4 Series Chipset neglects to set RWBF capability,
3919 * but needs it:
3920 */
3921 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3922 rwbf_quirk = 1;
3923
3924 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3925 if (dev->revision == 0x07) {
3926 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3927 dmar_map_gfx = 0;
3928 }
3929}
3930
3931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3932
3933#define GGC 0x52
3934#define GGC_MEMORY_SIZE_MASK (0xf << 8)
3935#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
3936#define GGC_MEMORY_SIZE_1M (0x1 << 8)
3937#define GGC_MEMORY_SIZE_2M (0x3 << 8)
3938#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
3939#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
3940#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
3941#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
3942
3943static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3944{
3945 unsigned short ggc;
3946
3947 if (pci_read_config_word(dev, GGC, &ggc))
3948 return;
3949
3950 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3951 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3952 dmar_map_gfx = 0;
3953 }
3954}
3955DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3956DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3957DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3958DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3959
3960/* On Tylersburg chipsets, some BIOSes have been known to enable the
3961 ISOCH DMAR unit for the Azalia sound device, but not give it any
3962 TLB entries, which causes it to deadlock. Check for that. We do
3963 this in a function called from init_dmars(), instead of in a PCI
3964 quirk, because we don't want to print the obnoxious "BIOS broken"
3965 message if VT-d is actually disabled.
3966*/
3967static void __init check_tylersburg_isoch(void)
3968{
3969 struct pci_dev *pdev;
3970 uint32_t vtisochctrl;
3971
3972 /* If there's no Azalia in the system anyway, forget it. */
3973 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3974 if (!pdev)
3975 return;
3976 pci_dev_put(pdev);
3977
3978 /* System Management Registers. Might be hidden, in which case
3979 we can't do the sanity check. But that's OK, because the
3980 known-broken BIOSes _don't_ actually hide it, so far. */
3981 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3982 if (!pdev)
3983 return;
3984
3985 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3986 pci_dev_put(pdev);
3987 return;
3988 }
3989
3990 pci_dev_put(pdev);
3991
3992 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3993 if (vtisochctrl & 1)
3994 return;
3995
3996 /* Drop all bits other than the number of TLB entries */
3997 vtisochctrl &= 0x1c;
3998
3999 /* If we have the recommended number of TLB entries (16), fine. */
4000 if (vtisochctrl == 0x10)
4001 return;
4002
4003 /* Zero TLB entries? You get to ride the short bus to school. */
4004 if (!vtisochctrl) {
4005 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4006 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4007 dmi_get_system_info(DMI_BIOS_VENDOR),
4008 dmi_get_system_info(DMI_BIOS_VERSION),
4009 dmi_get_system_info(DMI_PRODUCT_VERSION));
4010 iommu_identity_mapping |= IDENTMAP_AZALIA;
4011 return;
4012 }
4013
4014 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4015 vtisochctrl);
4016}
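/*
 * Example (illustrative): after masking with 0x1c the remaining bits give
 * the number of TLB entries granted to the isoch DMAR unit; 0x10 (decimal
 * 16) is the recommended allocation.  A value of zero triggers the WARN
 * above and forces an identity map for Azalia via IDENTMAP_AZALIA.
 */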
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
new file mode 100644
index 000000000000..1a89d4a2cadf
--- /dev/null
+++ b/drivers/iommu/intr_remapping.c
@@ -0,0 +1,797 @@
1#include <linux/interrupt.h>
2#include <linux/dmar.h>
3#include <linux/spinlock.h>
4#include <linux/slab.h>
5#include <linux/jiffies.h>
6#include <linux/hpet.h>
7#include <linux/pci.h>
8#include <linux/irq.h>
9#include <asm/io_apic.h>
10#include <asm/smp.h>
11#include <asm/cpu.h>
12#include <linux/intel-iommu.h>
13#include "intr_remapping.h"
14#include <acpi/acpi.h>
15#include <asm/pci-direct.h>
16
17static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
18static struct hpet_scope ir_hpet[MAX_HPET_TBS];
19static int ir_ioapic_num, ir_hpet_num;
20int intr_remapping_enabled;
21
22static int disable_intremap;
23static int disable_sourceid_checking;
24
25static __init int setup_nointremap(char *str)
26{
27 disable_intremap = 1;
28 return 0;
29}
30early_param("nointremap", setup_nointremap);
31
32static __init int setup_intremap(char *str)
33{
34 if (!str)
35 return -EINVAL;
36
37 if (!strncmp(str, "on", 2))
38 disable_intremap = 0;
39 else if (!strncmp(str, "off", 3))
40 disable_intremap = 1;
41 else if (!strncmp(str, "nosid", 5))
42 disable_sourceid_checking = 1;
43
44 return 0;
45}
46early_param("intremap", setup_intremap);
47
48static DEFINE_SPINLOCK(irq_2_ir_lock);
49
50static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
51{
52 struct irq_cfg *cfg = irq_get_chip_data(irq);
53 return cfg ? &cfg->irq_2_iommu : NULL;
54}
55
56int get_irte(int irq, struct irte *entry)
57{
58 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
59 unsigned long flags;
60 int index;
61
62 if (!entry || !irq_iommu)
63 return -1;
64
65 spin_lock_irqsave(&irq_2_ir_lock, flags);
66
67 index = irq_iommu->irte_index + irq_iommu->sub_handle;
68 *entry = *(irq_iommu->iommu->ir_table->base + index);
69
70 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
71 return 0;
72}
73
74int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
75{
76 struct ir_table *table = iommu->ir_table;
77 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
78 u16 index, start_index;
79 unsigned int mask = 0;
80 unsigned long flags;
81 int i;
82
83 if (!count || !irq_iommu)
84 return -1;
85
86 /*
87 * start the IRTE search from index 0.
88 */
89 index = start_index = 0;
90
91 if (count > 1) {
92 count = __roundup_pow_of_two(count);
93 mask = ilog2(count);
94 }
95
96 if (mask > ecap_max_handle_mask(iommu->ecap)) {
97 printk(KERN_ERR
98 "Requested mask %x exceeds the max invalidation handle"
99 " mask value %Lx\n", mask,
100 ecap_max_handle_mask(iommu->ecap));
101 return -1;
102 }
103
104 spin_lock_irqsave(&irq_2_ir_lock, flags);
105 do {
106 for (i = index; i < index + count; i++)
107 if (table->base[i].present)
108 break;
109 /* empty index found */
110 if (i == index + count)
111 break;
112
113 index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
114
115 if (index == start_index) {
116 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
117 printk(KERN_ERR "can't allocate an IRTE\n");
118 return -1;
119 }
120 } while (1);
121
122 for (i = index; i < index + count; i++)
123 table->base[i].present = 1;
124
125 irq_iommu->iommu = iommu;
126 irq_iommu->irte_index = index;
127 irq_iommu->sub_handle = 0;
128 irq_iommu->irte_mask = mask;
129
130 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
131
132 return index;
133}
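/*
 * Example (illustrative): a request for count = 3 IRTEs is rounded up to 4
 * by __roundup_pow_of_two(), giving mask = ilog2(4) = 2.  The search loop
 * then advances in blocks of 4 until it finds 4 consecutive free entries,
 * marks them present and returns the index of the first one.
 */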
134
135static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
136{
137 struct qi_desc desc;
138
139 desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
140 | QI_IEC_SELECTIVE;
141 desc.high = 0;
142
143 return qi_submit_sync(&desc, iommu);
144}
145
146int map_irq_to_irte_handle(int irq, u16 *sub_handle)
147{
148 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
149 unsigned long flags;
150 int index;
151
152 if (!irq_iommu)
153 return -1;
154
155 spin_lock_irqsave(&irq_2_ir_lock, flags);
156 *sub_handle = irq_iommu->sub_handle;
157 index = irq_iommu->irte_index;
158 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
159 return index;
160}
161
162int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
163{
164 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
165 unsigned long flags;
166
167 if (!irq_iommu)
168 return -1;
169
170 spin_lock_irqsave(&irq_2_ir_lock, flags);
171
172 irq_iommu->iommu = iommu;
173 irq_iommu->irte_index = index;
174 irq_iommu->sub_handle = subhandle;
175 irq_iommu->irte_mask = 0;
176
177 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
178
179 return 0;
180}
181
182int modify_irte(int irq, struct irte *irte_modified)
183{
184 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
185 struct intel_iommu *iommu;
186 unsigned long flags;
187 struct irte *irte;
188 int rc, index;
189
190 if (!irq_iommu)
191 return -1;
192
193 spin_lock_irqsave(&irq_2_ir_lock, flags);
194
195 iommu = irq_iommu->iommu;
196
197 index = irq_iommu->irte_index + irq_iommu->sub_handle;
198 irte = &iommu->ir_table->base[index];
199
200 set_64bit(&irte->low, irte_modified->low);
201 set_64bit(&irte->high, irte_modified->high);
202 __iommu_flush_cache(iommu, irte, sizeof(*irte));
203
204 rc = qi_flush_iec(iommu, index, 0);
205 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
206
207 return rc;
208}
209
210struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
211{
212 int i;
213
214 for (i = 0; i < MAX_HPET_TBS; i++)
215 if (ir_hpet[i].id == hpet_id)
216 return ir_hpet[i].iommu;
217 return NULL;
218}
219
220struct intel_iommu *map_ioapic_to_ir(int apic)
221{
222 int i;
223
224 for (i = 0; i < MAX_IO_APICS; i++)
225 if (ir_ioapic[i].id == apic)
226 return ir_ioapic[i].iommu;
227 return NULL;
228}
229
230struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
231{
232 struct dmar_drhd_unit *drhd;
233
234 drhd = dmar_find_matched_drhd_unit(dev);
235 if (!drhd)
236 return NULL;
237
238 return drhd->iommu;
239}
240
241static int clear_entries(struct irq_2_iommu *irq_iommu)
242{
243 struct irte *start, *entry, *end;
244 struct intel_iommu *iommu;
245 int index;
246
247 if (irq_iommu->sub_handle)
248 return 0;
249
250 iommu = irq_iommu->iommu;
251 index = irq_iommu->irte_index + irq_iommu->sub_handle;
252
253 start = iommu->ir_table->base + index;
254 end = start + (1 << irq_iommu->irte_mask);
255
256 for (entry = start; entry < end; entry++) {
257 set_64bit(&entry->low, 0);
258 set_64bit(&entry->high, 0);
259 }
260
261 return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
262}
263
264int free_irte(int irq)
265{
266 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
267 unsigned long flags;
268 int rc;
269
270 if (!irq_iommu)
271 return -1;
272
273 spin_lock_irqsave(&irq_2_ir_lock, flags);
274
275 rc = clear_entries(irq_iommu);
276
277 irq_iommu->iommu = NULL;
278 irq_iommu->irte_index = 0;
279 irq_iommu->sub_handle = 0;
280 irq_iommu->irte_mask = 0;
281
282 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
283
284 return rc;
285}
286
287/*
288 * source validation type
289 */
290#define SVT_NO_VERIFY 0x0 /* no verification is required */
291#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */
292#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */
293
294/*
295 * source-id qualifier
296 */
297#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */
298#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore
299 * the third least significant bit
300 */
301#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore
302 * the second and third least significant bits
303 */
304#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore
305							  * the three least significant bits
306 */
307
308/*
309 * set SVT, SQ and SID fields of irte to verify
310 * source ids of interrupt requests
311 */
312static void set_irte_sid(struct irte *irte, unsigned int svt,
313 unsigned int sq, unsigned int sid)
314{
315 if (disable_sourceid_checking)
316 svt = SVT_NO_VERIFY;
317 irte->svt = svt;
318 irte->sq = sq;
319 irte->sid = sid;
320}
321
322int set_ioapic_sid(struct irte *irte, int apic)
323{
324 int i;
325 u16 sid = 0;
326
327 if (!irte)
328 return -1;
329
330 for (i = 0; i < MAX_IO_APICS; i++) {
331 if (ir_ioapic[i].id == apic) {
332 sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
333 break;
334 }
335 }
336
337 if (sid == 0) {
338 pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic);
339 return -1;
340 }
341
342 set_irte_sid(irte, 1, 0, sid);
343
344 return 0;
345}
346
347int set_hpet_sid(struct irte *irte, u8 id)
348{
349 int i;
350 u16 sid = 0;
351
352 if (!irte)
353 return -1;
354
355 for (i = 0; i < MAX_HPET_TBS; i++) {
356 if (ir_hpet[i].id == id) {
357 sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
358 break;
359 }
360 }
361
362 if (sid == 0) {
363 pr_warning("Failed to set source-id of HPET block (%d)\n", id);
364 return -1;
365 }
366
367 /*
368 * Should really use SQ_ALL_16. Some platforms are broken.
369 * While we figure out the right quirks for these broken platforms, use
370 * SQ_13_IGNORE_3 for now.
371 */
372 set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
373
374 return 0;
375}
376
377int set_msi_sid(struct irte *irte, struct pci_dev *dev)
378{
379 struct pci_dev *bridge;
380
381 if (!irte || !dev)
382 return -1;
383
384 /* PCIe device or Root Complex integrated PCI device */
385 if (pci_is_pcie(dev) || !dev->bus->parent) {
386 set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
387 (dev->bus->number << 8) | dev->devfn);
388 return 0;
389 }
390
391 bridge = pci_find_upstream_pcie_bridge(dev);
392 if (bridge) {
393 if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */
394 set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
395 (bridge->bus->number << 8) | dev->bus->number);
396 else /* this is a legacy PCI bridge */
397 set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
398 (bridge->bus->number << 8) | bridge->devfn);
399 }
400
401 return 0;
402}
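/*
 * Example (illustrative, hypothetical device): a PCIe function at bus 0x05,
 * devfn 0x10 gets sid = (0x05 << 8) | 0x10 = 0x0510 with SQ_ALL_16, so the
 * IRTE only matches that exact requester-id.  A device behind a legacy PCI
 * bridge is instead identified by the bridge's own bus/devfn.
 */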
403
404static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
405{
406 u64 addr;
407 u32 sts;
408 unsigned long flags;
409
410 addr = virt_to_phys((void *)iommu->ir_table->base);
411
412 spin_lock_irqsave(&iommu->register_lock, flags);
413
414 dmar_writeq(iommu->reg + DMAR_IRTA_REG,
415 (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
416
417 /* Set interrupt-remapping table pointer */
418 iommu->gcmd |= DMA_GCMD_SIRTP;
419 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
420
421 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
422 readl, (sts & DMA_GSTS_IRTPS), sts);
423 spin_unlock_irqrestore(&iommu->register_lock, flags);
424
425 /*
426 * global invalidation of interrupt entry cache before enabling
427 * interrupt-remapping.
428 */
429 qi_global_iec(iommu);
430
431 spin_lock_irqsave(&iommu->register_lock, flags);
432
433 /* Enable interrupt-remapping */
434 iommu->gcmd |= DMA_GCMD_IRE;
435 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
436
437 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
438 readl, (sts & DMA_GSTS_IRES), sts);
439
440 spin_unlock_irqrestore(&iommu->register_lock, flags);
441}
442
443
444static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
445{
446 struct ir_table *ir_table;
447 struct page *pages;
448
449 ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
450 GFP_ATOMIC);
451
452 if (!iommu->ir_table)
453 return -ENOMEM;
454
455 pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
456 INTR_REMAP_PAGE_ORDER);
457
458 if (!pages) {
459 printk(KERN_ERR "failed to allocate pages of order %d\n",
460 INTR_REMAP_PAGE_ORDER);
461 kfree(iommu->ir_table);
462 return -ENOMEM;
463 }
464
465 ir_table->base = page_address(pages);
466
467 iommu_set_intr_remapping(iommu, mode);
468 return 0;
469}
470
471/*
472 * Disable Interrupt Remapping.
473 */
474static void iommu_disable_intr_remapping(struct intel_iommu *iommu)
475{
476 unsigned long flags;
477 u32 sts;
478
479 if (!ecap_ir_support(iommu->ecap))
480 return;
481
482 /*
483 * global invalidation of interrupt entry cache before disabling
484 * interrupt-remapping.
485 */
486 qi_global_iec(iommu);
487
488 spin_lock_irqsave(&iommu->register_lock, flags);
489
490 sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
491 if (!(sts & DMA_GSTS_IRES))
492 goto end;
493
494 iommu->gcmd &= ~DMA_GCMD_IRE;
495 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
496
497 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
498 readl, !(sts & DMA_GSTS_IRES), sts);
499
500end:
501 spin_unlock_irqrestore(&iommu->register_lock, flags);
502}
503
504int __init intr_remapping_supported(void)
505{
506 struct dmar_drhd_unit *drhd;
507
508 if (disable_intremap)
509 return 0;
510
511 if (!dmar_ir_support())
512 return 0;
513
514 for_each_drhd_unit(drhd) {
515 struct intel_iommu *iommu = drhd->iommu;
516
517 if (!ecap_ir_support(iommu->ecap))
518 return 0;
519 }
520
521 return 1;
522}
523
524int __init enable_intr_remapping(int eim)
525{
526 struct dmar_drhd_unit *drhd;
527 int setup = 0;
528
529 if (parse_ioapics_under_ir() != 1) {
530		printk(KERN_INFO "Not enabling interrupt remapping\n");
531 return -1;
532 }
533
534 for_each_drhd_unit(drhd) {
535 struct intel_iommu *iommu = drhd->iommu;
536
537 /*
538 * If the queued invalidation is already initialized,
539		 * we shouldn't disable it.
540 */
541 if (iommu->qi)
542 continue;
543
544 /*
545 * Clear previous faults.
546 */
547 dmar_fault(-1, iommu);
548
549 /*
550 * Disable intr remapping and queued invalidation, if already
551 * enabled prior to OS handover.
552 */
553 iommu_disable_intr_remapping(iommu);
554
555 dmar_disable_qi(iommu);
556 }
557
558 /*
559 * check for the Interrupt-remapping support
560 */
561 for_each_drhd_unit(drhd) {
562 struct intel_iommu *iommu = drhd->iommu;
563
564 if (!ecap_ir_support(iommu->ecap))
565 continue;
566
567 if (eim && !ecap_eim_support(iommu->ecap)) {
568			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD,"
569 " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
570 return -1;
571 }
572 }
573
574 /*
575 * Enable queued invalidation for all the DRHD's.
576 */
577 for_each_drhd_unit(drhd) {
578 int ret;
579 struct intel_iommu *iommu = drhd->iommu;
580 ret = dmar_enable_qi(iommu);
581
582 if (ret) {
583			printk(KERN_ERR "DRHD %Lx: failed to enable queued"
584 " invalidation, ecap %Lx, ret %d\n",
585 drhd->reg_base_addr, iommu->ecap, ret);
586 return -1;
587 }
588 }
589
590 /*
591 * Setup Interrupt-remapping for all the DRHD's now.
592 */
593 for_each_drhd_unit(drhd) {
594 struct intel_iommu *iommu = drhd->iommu;
595
596 if (!ecap_ir_support(iommu->ecap))
597 continue;
598
599 if (setup_intr_remapping(iommu, eim))
600 goto error;
601
602 setup = 1;
603 }
604
605 if (!setup)
606 goto error;
607
608 intr_remapping_enabled = 1;
609
610 return 0;
611
612error:
613 /*
614 * handle error condition gracefully here!
615 */
616 return -1;
617}
618
619static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
620 struct intel_iommu *iommu)
621{
622 struct acpi_dmar_pci_path *path;
623 u8 bus;
624 int count;
625
626 bus = scope->bus;
627 path = (struct acpi_dmar_pci_path *)(scope + 1);
628 count = (scope->length - sizeof(struct acpi_dmar_device_scope))
629 / sizeof(struct acpi_dmar_pci_path);
630
631 while (--count > 0) {
632 /*
633		 * Access PCI directly because the PCI
634 * subsystem isn't initialized yet.
635 */
636 bus = read_pci_config_byte(bus, path->dev, path->fn,
637 PCI_SECONDARY_BUS);
638 path++;
639 }
640 ir_hpet[ir_hpet_num].bus = bus;
641 ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn);
642 ir_hpet[ir_hpet_num].iommu = iommu;
643 ir_hpet[ir_hpet_num].id = scope->enumeration_id;
644 ir_hpet_num++;
645}
646
647static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
648 struct intel_iommu *iommu)
649{
650 struct acpi_dmar_pci_path *path;
651 u8 bus;
652 int count;
653
654 bus = scope->bus;
655 path = (struct acpi_dmar_pci_path *)(scope + 1);
656 count = (scope->length - sizeof(struct acpi_dmar_device_scope))
657 / sizeof(struct acpi_dmar_pci_path);
658
659 while (--count > 0) {
660 /*
661		 * Access PCI directly because the PCI
662 * subsystem isn't initialized yet.
663 */
664 bus = read_pci_config_byte(bus, path->dev, path->fn,
665 PCI_SECONDARY_BUS);
666 path++;
667 }
668
669 ir_ioapic[ir_ioapic_num].bus = bus;
670 ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
671 ir_ioapic[ir_ioapic_num].iommu = iommu;
672 ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
673 ir_ioapic_num++;
674}
675
676static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
677 struct intel_iommu *iommu)
678{
679 struct acpi_dmar_hardware_unit *drhd;
680 struct acpi_dmar_device_scope *scope;
681 void *start, *end;
682
683 drhd = (struct acpi_dmar_hardware_unit *)header;
684
685 start = (void *)(drhd + 1);
686 end = ((void *)drhd) + header->length;
687
688 while (start < end) {
689 scope = start;
690 if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
691 if (ir_ioapic_num == MAX_IO_APICS) {
692 printk(KERN_WARNING "Exceeded Max IO APICS\n");
693 return -1;
694 }
695
696 printk(KERN_INFO "IOAPIC id %d under DRHD base "
697 " 0x%Lx IOMMU %d\n", scope->enumeration_id,
698 drhd->address, iommu->seq_id);
699
700 ir_parse_one_ioapic_scope(scope, iommu);
701 } else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
702 if (ir_hpet_num == MAX_HPET_TBS) {
703 printk(KERN_WARNING "Exceeded Max HPET blocks\n");
704 return -1;
705 }
706
707 printk(KERN_INFO "HPET id %d under DRHD base"
708 " 0x%Lx\n", scope->enumeration_id,
709 drhd->address);
710
711 ir_parse_one_hpet_scope(scope, iommu);
712 }
713 start += scope->length;
714 }
715
716 return 0;
717}
718
719/*
720 * Finds the association between IOAPICs and their Interrupt-remapping
721 * hardware units.
722 */
723int __init parse_ioapics_under_ir(void)
724{
725 struct dmar_drhd_unit *drhd;
726 int ir_supported = 0;
727
728 for_each_drhd_unit(drhd) {
729 struct intel_iommu *iommu = drhd->iommu;
730
731 if (ecap_ir_support(iommu->ecap)) {
732 if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
733 return -1;
734
735 ir_supported = 1;
736 }
737 }
738
739 if (ir_supported && ir_ioapic_num != nr_ioapics) {
740 printk(KERN_WARNING
741 "Not all IO-APIC's listed under remapping hardware\n");
742 return -1;
743 }
744
745 return ir_supported;
746}
747
748void disable_intr_remapping(void)
749{
750 struct dmar_drhd_unit *drhd;
751 struct intel_iommu *iommu = NULL;
752
753 /*
754 * Disable Interrupt-remapping for all the DRHD's now.
755 */
756 for_each_iommu(iommu, drhd) {
757 if (!ecap_ir_support(iommu->ecap))
758 continue;
759
760 iommu_disable_intr_remapping(iommu);
761 }
762}
763
764int reenable_intr_remapping(int eim)
765{
766 struct dmar_drhd_unit *drhd;
767 int setup = 0;
768 struct intel_iommu *iommu = NULL;
769
770 for_each_iommu(iommu, drhd)
771 if (iommu->qi)
772 dmar_reenable_qi(iommu);
773
774 /*
775 * Setup Interrupt-remapping for all the DRHD's now.
776 */
777 for_each_iommu(iommu, drhd) {
778 if (!ecap_ir_support(iommu->ecap))
779 continue;
780
781 /* Set up interrupt remapping for iommu.*/
782 iommu_set_intr_remapping(iommu, eim);
783 setup = 1;
784 }
785
786 if (!setup)
787 goto error;
788
789 return 0;
790
791error:
792 /*
793 * handle error condition gracefully here!
794 */
795 return -1;
796}
797
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
new file mode 100644
index 000000000000..5662fecfee60
--- /dev/null
+++ b/drivers/iommu/intr_remapping.h
@@ -0,0 +1,17 @@
1#include <linux/intel-iommu.h>
2
3struct ioapic_scope {
4 struct intel_iommu *iommu;
5 unsigned int id;
6 unsigned int bus; /* PCI bus number */
7 unsigned int devfn; /* PCI devfn number */
8};
9
10struct hpet_scope {
11 struct intel_iommu *iommu;
12 u8 id;
13 unsigned int bus;
14 unsigned int devfn;
15};
16
17#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
new file mode 100644
index 000000000000..6e6b6a11b3ce
--- /dev/null
+++ b/drivers/iommu/iommu.c
@@ -0,0 +1,124 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/bug.h>
20#include <linux/types.h>
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/errno.h>
24#include <linux/iommu.h>
25
26static struct iommu_ops *iommu_ops;
27
28void register_iommu(struct iommu_ops *ops)
29{
30 if (iommu_ops)
31 BUG();
32
33 iommu_ops = ops;
34}
35
36bool iommu_found(void)
37{
38 return iommu_ops != NULL;
39}
40EXPORT_SYMBOL_GPL(iommu_found);
41
42struct iommu_domain *iommu_domain_alloc(void)
43{
44 struct iommu_domain *domain;
45 int ret;
46
47 domain = kmalloc(sizeof(*domain), GFP_KERNEL);
48 if (!domain)
49 return NULL;
50
51 ret = iommu_ops->domain_init(domain);
52 if (ret)
53 goto out_free;
54
55 return domain;
56
57out_free:
58 kfree(domain);
59
60 return NULL;
61}
62EXPORT_SYMBOL_GPL(iommu_domain_alloc);
63
64void iommu_domain_free(struct iommu_domain *domain)
65{
66 iommu_ops->domain_destroy(domain);
67 kfree(domain);
68}
69EXPORT_SYMBOL_GPL(iommu_domain_free);
70
71int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
72{
73 return iommu_ops->attach_dev(domain, dev);
74}
75EXPORT_SYMBOL_GPL(iommu_attach_device);
76
77void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
78{
79 iommu_ops->detach_dev(domain, dev);
80}
81EXPORT_SYMBOL_GPL(iommu_detach_device);
82
83phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
84 unsigned long iova)
85{
86 return iommu_ops->iova_to_phys(domain, iova);
87}
88EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
89
90int iommu_domain_has_cap(struct iommu_domain *domain,
91 unsigned long cap)
92{
93 return iommu_ops->domain_has_cap(domain, cap);
94}
95EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
96
97int iommu_map(struct iommu_domain *domain, unsigned long iova,
98 phys_addr_t paddr, int gfp_order, int prot)
99{
100 unsigned long invalid_mask;
101 size_t size;
102
103 size = 0x1000UL << gfp_order;
104 invalid_mask = size - 1;
105
106 BUG_ON((iova | paddr) & invalid_mask);
107
108 return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
109}
110EXPORT_SYMBOL_GPL(iommu_map);
111
112int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
113{
114 unsigned long invalid_mask;
115 size_t size;
116
117 size = 0x1000UL << gfp_order;
118 invalid_mask = size - 1;
119
120 BUG_ON(iova & invalid_mask);
121
122 return iommu_ops->unmap(domain, iova, gfp_order);
123}
124EXPORT_SYMBOL_GPL(iommu_unmap);
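/*
 * Example (illustrative): with gfp_order = 4 the wrappers above compute
 * size = 0x1000UL << 4 = 64KiB, so both iova and paddr passed to iommu_map()
 * must be 64KiB aligned or the BUG_ON() on the alignment mask fires.
 */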
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
new file mode 100644
index 000000000000..c5c274ab5c5a
--- /dev/null
+++ b/drivers/iommu/iova.c
@@ -0,0 +1,435 @@
1/*
2 * Copyright © 2006-2009, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
18 */
19
20#include <linux/iova.h>
21
22void
23init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
24{
25 spin_lock_init(&iovad->iova_rbtree_lock);
26 iovad->rbroot = RB_ROOT;
27 iovad->cached32_node = NULL;
28 iovad->dma_32bit_pfn = pfn_32bit;
29}
30
31static struct rb_node *
32__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
33{
34 if ((*limit_pfn != iovad->dma_32bit_pfn) ||
35 (iovad->cached32_node == NULL))
36 return rb_last(&iovad->rbroot);
37 else {
38 struct rb_node *prev_node = rb_prev(iovad->cached32_node);
39 struct iova *curr_iova =
40 container_of(iovad->cached32_node, struct iova, node);
41 *limit_pfn = curr_iova->pfn_lo - 1;
42 return prev_node;
43 }
44}
45
46static void
47__cached_rbnode_insert_update(struct iova_domain *iovad,
48 unsigned long limit_pfn, struct iova *new)
49{
50 if (limit_pfn != iovad->dma_32bit_pfn)
51 return;
52 iovad->cached32_node = &new->node;
53}
54
55static void
56__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
57{
58 struct iova *cached_iova;
59 struct rb_node *curr;
60
61 if (!iovad->cached32_node)
62 return;
63 curr = iovad->cached32_node;
64 cached_iova = container_of(curr, struct iova, node);
65
66 if (free->pfn_lo >= cached_iova->pfn_lo) {
67 struct rb_node *node = rb_next(&free->node);
68 struct iova *iova = container_of(node, struct iova, node);
69
70 /* only cache if it's below 32bit pfn */
71 if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
72 iovad->cached32_node = node;
73 else
74 iovad->cached32_node = NULL;
75 }
76}
77
78/* Computes the padding size required to make the
79 * start address naturally aligned on its size
80 */
81static int
82iova_get_pad_size(int size, unsigned int limit_pfn)
83{
84 unsigned int pad_size = 0;
85 unsigned int order = ilog2(size);
86
87 if (order)
88 pad_size = (limit_pfn + 1) % (1 << order);
89
90 return pad_size;
91}
92
93static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
94 unsigned long size, unsigned long limit_pfn,
95 struct iova *new, bool size_aligned)
96{
97 struct rb_node *prev, *curr = NULL;
98 unsigned long flags;
99 unsigned long saved_pfn;
100 unsigned int pad_size = 0;
101
102 /* Walk the tree backwards */
103 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
104 saved_pfn = limit_pfn;
105 curr = __get_cached_rbnode(iovad, &limit_pfn);
106 prev = curr;
107 while (curr) {
108 struct iova *curr_iova = container_of(curr, struct iova, node);
109
110 if (limit_pfn < curr_iova->pfn_lo)
111 goto move_left;
112 else if (limit_pfn < curr_iova->pfn_hi)
113 goto adjust_limit_pfn;
114 else {
115 if (size_aligned)
116 pad_size = iova_get_pad_size(size, limit_pfn);
117 if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
118 break; /* found a free slot */
119 }
120adjust_limit_pfn:
121 limit_pfn = curr_iova->pfn_lo - 1;
122move_left:
123 prev = curr;
124 curr = rb_prev(curr);
125 }
126
127 if (!curr) {
128 if (size_aligned)
129 pad_size = iova_get_pad_size(size, limit_pfn);
130 if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
131 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
132 return -ENOMEM;
133 }
134 }
135
136 /* pfn_lo will point to size aligned address if size_aligned is set */
137 new->pfn_lo = limit_pfn - (size + pad_size) + 1;
138 new->pfn_hi = new->pfn_lo + size - 1;
139
140 /* Insert the new_iova into domain rbtree by holding writer lock */
141 /* Add new node and rebalance tree. */
142 {
143 struct rb_node **entry, *parent = NULL;
144
145 /* If we have 'prev', it's a valid place to start the
146 insertion. Otherwise, start from the root. */
147 if (prev)
148 entry = &prev;
149 else
150 entry = &iovad->rbroot.rb_node;
151
152 /* Figure out where to put new node */
153 while (*entry) {
154 struct iova *this = container_of(*entry,
155 struct iova, node);
156 parent = *entry;
157
158 if (new->pfn_lo < this->pfn_lo)
159 entry = &((*entry)->rb_left);
160 else if (new->pfn_lo > this->pfn_lo)
161 entry = &((*entry)->rb_right);
162 else
163 BUG(); /* this should not happen */
164 }
165
166 /* Add new node and rebalance tree. */
167 rb_link_node(&new->node, parent, entry);
168 rb_insert_color(&new->node, &iovad->rbroot);
169 }
170 __cached_rbnode_insert_update(iovad, saved_pfn, new);
171
172 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
173
174
175 return 0;
176}
177
178static void
179iova_insert_rbtree(struct rb_root *root, struct iova *iova)
180{
181 struct rb_node **new = &(root->rb_node), *parent = NULL;
182 /* Figure out where to put new node */
183 while (*new) {
184 struct iova *this = container_of(*new, struct iova, node);
185 parent = *new;
186
187 if (iova->pfn_lo < this->pfn_lo)
188 new = &((*new)->rb_left);
189 else if (iova->pfn_lo > this->pfn_lo)
190 new = &((*new)->rb_right);
191 else
192 BUG(); /* this should not happen */
193 }
194 /* Add new node and rebalance tree. */
195 rb_link_node(&iova->node, parent, new);
196 rb_insert_color(&iova->node, root);
197}
198
199/**
200 * alloc_iova - allocates an iova
201 * @iovad - iova domain in question
202 * @size - size, in page frames, to allocate
203 * @limit_pfn - max limit address
204 * @size_aligned - set if a size-aligned address range is required
205 * This function allocates an iova in the range IOVA_START_PFN to limit_pfn,
206 * searching downwards from limit_pfn rather than upwards from IOVA_START_PFN.
207 * If the size_aligned flag is set then the allocated address iova->pfn_lo
208 * will be naturally aligned on roundup_power_of_two(size).
209 */
210struct iova *
211alloc_iova(struct iova_domain *iovad, unsigned long size,
212 unsigned long limit_pfn,
213 bool size_aligned)
214{
215 struct iova *new_iova;
216 int ret;
217
218 new_iova = alloc_iova_mem();
219 if (!new_iova)
220 return NULL;
221
222	/* If size aligned is set then round the size
223	 * to the next power of two.
224 */
225 if (size_aligned)
226 size = __roundup_pow_of_two(size);
227
228 ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
229 new_iova, size_aligned);
230
231 if (ret) {
232 free_iova_mem(new_iova);
233 return NULL;
234 }
235
236 return new_iova;
237}
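/*
 * Example (illustrative): alloc_iova(iovad, 5, limit_pfn, true) rounds the
 * request up to 8 page frames; __alloc_and_insert_iova_range() then pads the
 * allocation below limit_pfn so that the returned iova->pfn_lo comes out
 * naturally aligned on that rounded-up size (a multiple of 8 here).
 */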
238
239/**
240 * find_iova - finds an iova for a given pfn
241 * @iovad - iova domain in question.
242 * @pfn - page frame number
243 * This function finds and returns an iova belonging to the
244 * given domain which matches the given pfn.
245 */
246struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
247{
248 unsigned long flags;
249 struct rb_node *node;
250
251 /* Take the lock so that no other thread is manipulating the rbtree */
252 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
253 node = iovad->rbroot.rb_node;
254 while (node) {
255 struct iova *iova = container_of(node, struct iova, node);
256
257 /* If pfn falls within iova's range, return iova */
258 if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
259 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
260 /* We are not holding the lock while this iova
261 * is referenced by the caller as the same thread
262 * which called this function also calls __free_iova()
263			 * and it is by design that only one thread can possibly
264 * reference a particular iova and hence no conflict.
265 */
266 return iova;
267 }
268
269 if (pfn < iova->pfn_lo)
270 node = node->rb_left;
271 else if (pfn > iova->pfn_lo)
272 node = node->rb_right;
273 }
274
275 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
276 return NULL;
277}
278
279/**
280 * __free_iova - frees the given iova
281 * @iovad: iova domain in question.
282 * @iova: iova in question.
283 * Frees the given iova belonging to the giving domain
284 */
285void
286__free_iova(struct iova_domain *iovad, struct iova *iova)
287{
288 unsigned long flags;
289
290 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
291 __cached_rbnode_delete_update(iovad, iova);
292 rb_erase(&iova->node, &iovad->rbroot);
293 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
294 free_iova_mem(iova);
295}
296
297/**
298 * free_iova - finds and frees the iova for a given pfn
299 * @iovad: - iova domain in question.
300 * @pfn: - pfn that is allocated previously
301 * This functions finds an iova for a given pfn and then
302 * frees the iova from that domain.
303 */
304void
305free_iova(struct iova_domain *iovad, unsigned long pfn)
306{
307 struct iova *iova = find_iova(iovad, pfn);
308 if (iova)
309 __free_iova(iovad, iova);
310
311}
312
313/**
314 * put_iova_domain - destroys the iova domain
315 * @iovad: - iova domain in question.
316 * All the iovas in that domain are destroyed.
317 */
318void put_iova_domain(struct iova_domain *iovad)
319{
320 struct rb_node *node;
321 unsigned long flags;
322
323 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
324 node = rb_first(&iovad->rbroot);
325 while (node) {
326 struct iova *iova = container_of(node, struct iova, node);
327 rb_erase(node, &iovad->rbroot);
328 free_iova_mem(iova);
329 node = rb_first(&iovad->rbroot);
330 }
331 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
332}
333
334static int
335__is_range_overlap(struct rb_node *node,
336 unsigned long pfn_lo, unsigned long pfn_hi)
337{
338 struct iova *iova = container_of(node, struct iova, node);
339
340 if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
341 return 1;
342 return 0;
343}
344
345static struct iova *
346__insert_new_range(struct iova_domain *iovad,
347 unsigned long pfn_lo, unsigned long pfn_hi)
348{
349 struct iova *iova;
350
351 iova = alloc_iova_mem();
352 if (!iova)
353 return iova;
354
355 iova->pfn_hi = pfn_hi;
356 iova->pfn_lo = pfn_lo;
357 iova_insert_rbtree(&iovad->rbroot, iova);
358 return iova;
359}
360
361static void
362__adjust_overlap_range(struct iova *iova,
363 unsigned long *pfn_lo, unsigned long *pfn_hi)
364{
365 if (*pfn_lo < iova->pfn_lo)
366 iova->pfn_lo = *pfn_lo;
367 if (*pfn_hi > iova->pfn_hi)
368 *pfn_lo = iova->pfn_hi + 1;
369}
370
371/**
372 * reserve_iova - reserves an iova in the given range
373 * @iovad: - iova domain pointer
374 * @pfn_lo: - lower page frame address
375 * @pfn_hi: - higher pfn address
376 * This function reserves the address range from pfn_lo to pfn_hi so
377 * that this range is not dished out as part of alloc_iova.
378 */
379struct iova *
380reserve_iova(struct iova_domain *iovad,
381 unsigned long pfn_lo, unsigned long pfn_hi)
382{
383 struct rb_node *node;
384 unsigned long flags;
385 struct iova *iova;
386 unsigned int overlap = 0;
387
388 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
389 for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
390 if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
391 iova = container_of(node, struct iova, node);
392 __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
393 if ((pfn_lo >= iova->pfn_lo) &&
394 (pfn_hi <= iova->pfn_hi))
395 goto finish;
396 overlap = 1;
397
398 } else if (overlap)
399 break;
400 }
401
402	/* We are here either because this is the first reserved node
403	 * or we need to insert the remaining non-overlapping addr range
404 */
405 iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
406finish:
407
408 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
409 return iova;
410}
411
412/**
413 * copy_reserved_iova - copies the reserved iovas between domains
414 * @from: - source domain from where to copy
415 * @to: - destination domain where to copy
416 * This function copies reserved iovas from one domain to
417 * another.
418 */
419void
420copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
421{
422 unsigned long flags;
423 struct rb_node *node;
424
425 spin_lock_irqsave(&from->iova_rbtree_lock, flags);
426 for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
427 struct iova *iova = container_of(node, struct iova, node);
428 struct iova *new_iova;
429 new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
430 if (!new_iova)
431 printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
432 iova->pfn_lo, iova->pfn_lo);
433 }
434 spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
435}
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
new file mode 100644
index 000000000000..1a584e077c61
--- /dev/null
+++ b/drivers/iommu/msm_iommu.c
@@ -0,0 +1,731 @@
1/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 and
5 * only version 2 as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
15 * 02110-1301, USA.
16 */
17
18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/platform_device.h>
22#include <linux/errno.h>
23#include <linux/io.h>
24#include <linux/interrupt.h>
25#include <linux/list.h>
26#include <linux/spinlock.h>
27#include <linux/slab.h>
28#include <linux/iommu.h>
29#include <linux/clk.h>
30
31#include <asm/cacheflush.h>
32#include <asm/sizes.h>
33
34#include <mach/iommu_hw-8xxx.h>
35#include <mach/iommu.h>
36
37#define MRC(reg, processor, op1, crn, crm, op2) \
38__asm__ __volatile__ ( \
39" mrc " #processor "," #op1 ", %0," #crn "," #crm "," #op2 "\n" \
40: "=r" (reg))
41
42#define RCP15_PRRR(reg) MRC(reg, p15, 0, c10, c2, 0)
43#define RCP15_NMRR(reg) MRC(reg, p15, 0, c10, c2, 1)
44
45static int msm_iommu_tex_class[4];
46
47DEFINE_SPINLOCK(msm_iommu_lock);
48
49struct msm_priv {
50 unsigned long *pgtable;
51 struct list_head list_attached;
52};
53
54static int __enable_clocks(struct msm_iommu_drvdata *drvdata)
55{
56 int ret;
57
58 ret = clk_enable(drvdata->pclk);
59 if (ret)
60 goto fail;
61
62 if (drvdata->clk) {
63 ret = clk_enable(drvdata->clk);
64 if (ret)
65 clk_disable(drvdata->pclk);
66 }
67fail:
68 return ret;
69}
70
71static void __disable_clocks(struct msm_iommu_drvdata *drvdata)
72{
73 if (drvdata->clk)
74 clk_disable(drvdata->clk);
75 clk_disable(drvdata->pclk);
76}
77
78static int __flush_iotlb(struct iommu_domain *domain)
79{
80 struct msm_priv *priv = domain->priv;
81 struct msm_iommu_drvdata *iommu_drvdata;
82 struct msm_iommu_ctx_drvdata *ctx_drvdata;
83 int ret = 0;
84#ifndef CONFIG_IOMMU_PGTABLES_L2
85 unsigned long *fl_table = priv->pgtable;
86 int i;
87
88 if (!list_empty(&priv->list_attached)) {
89 dmac_flush_range(fl_table, fl_table + SZ_16K);
90
91 for (i = 0; i < NUM_FL_PTE; i++)
92 if ((fl_table[i] & 0x03) == FL_TYPE_TABLE) {
93 void *sl_table = __va(fl_table[i] &
94 FL_BASE_MASK);
95 dmac_flush_range(sl_table, sl_table + SZ_4K);
96 }
97 }
98#endif
99
100 list_for_each_entry(ctx_drvdata, &priv->list_attached, attached_elm) {
101 if (!ctx_drvdata->pdev || !ctx_drvdata->pdev->dev.parent)
102 BUG();
103
104 iommu_drvdata = dev_get_drvdata(ctx_drvdata->pdev->dev.parent);
105 BUG_ON(!iommu_drvdata);
106
107 ret = __enable_clocks(iommu_drvdata);
108 if (ret)
109 goto fail;
110
111 SET_CTX_TLBIALL(iommu_drvdata->base, ctx_drvdata->num, 0);
112 __disable_clocks(iommu_drvdata);
113 }
114fail:
115 return ret;
116}
117
118static void __reset_context(void __iomem *base, int ctx)
119{
120 SET_BPRCOSH(base, ctx, 0);
121 SET_BPRCISH(base, ctx, 0);
122 SET_BPRCNSH(base, ctx, 0);
123 SET_BPSHCFG(base, ctx, 0);
124 SET_BPMTCFG(base, ctx, 0);
125 SET_ACTLR(base, ctx, 0);
126 SET_SCTLR(base, ctx, 0);
127 SET_FSRRESTORE(base, ctx, 0);
128 SET_TTBR0(base, ctx, 0);
129 SET_TTBR1(base, ctx, 0);
130 SET_TTBCR(base, ctx, 0);
131 SET_BFBCR(base, ctx, 0);
132 SET_PAR(base, ctx, 0);
133 SET_FAR(base, ctx, 0);
134 SET_CTX_TLBIALL(base, ctx, 0);
135 SET_TLBFLPTER(base, ctx, 0);
136 SET_TLBSLPTER(base, ctx, 0);
137 SET_TLBLKCR(base, ctx, 0);
138 SET_PRRR(base, ctx, 0);
139 SET_NMRR(base, ctx, 0);
140}
141
142static void __program_context(void __iomem *base, int ctx, phys_addr_t pgtable)
143{
144 unsigned int prrr, nmrr;
145 __reset_context(base, ctx);
146
147 /* Set up HTW mode */
148 /* TLB miss configuration: perform HTW on miss */
149 SET_TLBMCFG(base, ctx, 0x3);
150
151 /* V2P configuration: HTW for access */
152 SET_V2PCFG(base, ctx, 0x3);
153
154 SET_TTBCR(base, ctx, 0);
155 SET_TTBR0_PA(base, ctx, (pgtable >> 14));
156
157 /* Invalidate the TLB for this context */
158 SET_CTX_TLBIALL(base, ctx, 0);
159
160 /* Set interrupt number to "secure" interrupt */
161 SET_IRPTNDX(base, ctx, 0);
162
163 /* Enable context fault interrupt */
164 SET_CFEIE(base, ctx, 1);
165
166 /* Stall access on a context fault and let the handler deal with it */
167 SET_CFCFG(base, ctx, 1);
168
169 /* Redirect all cacheable requests to L2 slave port. */
170 SET_RCISH(base, ctx, 1);
171 SET_RCOSH(base, ctx, 1);
172 SET_RCNSH(base, ctx, 1);
173
174 /* Turn on TEX Remap */
175 SET_TRE(base, ctx, 1);
176
177 /* Set TEX remap attributes */
178 RCP15_PRRR(prrr);
179 RCP15_NMRR(nmrr);
180 SET_PRRR(base, ctx, prrr);
181 SET_NMRR(base, ctx, nmrr);
182
183 /* Turn on BFB prefetch */
184 SET_BFBDFE(base, ctx, 1);
185
186#ifdef CONFIG_IOMMU_PGTABLES_L2
187 /* Configure page tables as inner-cacheable and shareable to reduce
188 * the TLB miss penalty.
189 */
190 SET_TTBR0_SH(base, ctx, 1);
191 SET_TTBR1_SH(base, ctx, 1);
192
193 SET_TTBR0_NOS(base, ctx, 1);
194 SET_TTBR1_NOS(base, ctx, 1);
195
196 SET_TTBR0_IRGNH(base, ctx, 0); /* WB, WA */
197 SET_TTBR0_IRGNL(base, ctx, 1);
198
199 SET_TTBR1_IRGNH(base, ctx, 0); /* WB, WA */
200 SET_TTBR1_IRGNL(base, ctx, 1);
201
202 SET_TTBR0_ORGN(base, ctx, 1); /* WB, WA */
203 SET_TTBR1_ORGN(base, ctx, 1); /* WB, WA */
204#endif
205
206 /* Enable the MMU */
207 SET_M(base, ctx, 1);
208}
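/*
 * Note (illustrative): the 16KiB first-level table allocated in
 * msm_iommu_domain_init() below is naturally 16KiB aligned, which is why
 * only pgtable >> 14 needs to be programmed into TTBR0 above.
 */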
209
210static int msm_iommu_domain_init(struct iommu_domain *domain)
211{
212 struct msm_priv *priv = kzalloc(sizeof(*priv), GFP_KERNEL);
213
214 if (!priv)
215 goto fail_nomem;
216
217 INIT_LIST_HEAD(&priv->list_attached);
218 priv->pgtable = (unsigned long *)__get_free_pages(GFP_KERNEL,
219 get_order(SZ_16K));
220
221 if (!priv->pgtable)
222 goto fail_nomem;
223
224 memset(priv->pgtable, 0, SZ_16K);
225 domain->priv = priv;
226 return 0;
227
228fail_nomem:
229 kfree(priv);
230 return -ENOMEM;
231}
232
233static void msm_iommu_domain_destroy(struct iommu_domain *domain)
234{
235 struct msm_priv *priv;
236 unsigned long flags;
237 unsigned long *fl_table;
238 int i;
239
240 spin_lock_irqsave(&msm_iommu_lock, flags);
241 priv = domain->priv;
242 domain->priv = NULL;
243
244 if (priv) {
245 fl_table = priv->pgtable;
246
247 for (i = 0; i < NUM_FL_PTE; i++)
248 if ((fl_table[i] & 0x03) == FL_TYPE_TABLE)
249 free_page((unsigned long) __va(((fl_table[i]) &
250 FL_BASE_MASK)));
251
252 free_pages((unsigned long)priv->pgtable, get_order(SZ_16K));
253 priv->pgtable = NULL;
254 }
255
256 kfree(priv);
257 spin_unlock_irqrestore(&msm_iommu_lock, flags);
258}
259
260static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
261{
262 struct msm_priv *priv;
263 struct msm_iommu_ctx_dev *ctx_dev;
264 struct msm_iommu_drvdata *iommu_drvdata;
265 struct msm_iommu_ctx_drvdata *ctx_drvdata;
266 struct msm_iommu_ctx_drvdata *tmp_drvdata;
267 int ret = 0;
268 unsigned long flags;
269
270 spin_lock_irqsave(&msm_iommu_lock, flags);
271
272 priv = domain->priv;
273
274 if (!priv || !dev) {
275 ret = -EINVAL;
276 goto fail;
277 }
278
279 iommu_drvdata = dev_get_drvdata(dev->parent);
280 ctx_drvdata = dev_get_drvdata(dev);
281 ctx_dev = dev->platform_data;
282
283 if (!iommu_drvdata || !ctx_drvdata || !ctx_dev) {
284 ret = -EINVAL;
285 goto fail;
286 }
287
288 if (!list_empty(&ctx_drvdata->attached_elm)) {
289 ret = -EBUSY;
290 goto fail;
291 }
292
293 list_for_each_entry(tmp_drvdata, &priv->list_attached, attached_elm)
294 if (tmp_drvdata == ctx_drvdata) {
295 ret = -EBUSY;
296 goto fail;
297 }
298
299 ret = __enable_clocks(iommu_drvdata);
300 if (ret)
301 goto fail;
302
303 __program_context(iommu_drvdata->base, ctx_dev->num,
304 __pa(priv->pgtable));
305
306 __disable_clocks(iommu_drvdata);
307 list_add(&(ctx_drvdata->attached_elm), &priv->list_attached);
308 ret = __flush_iotlb(domain);
309
310fail:
311 spin_unlock_irqrestore(&msm_iommu_lock, flags);
312 return ret;
313}
314
315static void msm_iommu_detach_dev(struct iommu_domain *domain,
316 struct device *dev)
317{
318 struct msm_priv *priv;
319 struct msm_iommu_ctx_dev *ctx_dev;
320 struct msm_iommu_drvdata *iommu_drvdata;
321 struct msm_iommu_ctx_drvdata *ctx_drvdata;
322 unsigned long flags;
323 int ret;
324
325 spin_lock_irqsave(&msm_iommu_lock, flags);
326 priv = domain->priv;
327
328 if (!priv || !dev)
329 goto fail;
330
331 iommu_drvdata = dev_get_drvdata(dev->parent);
332 ctx_drvdata = dev_get_drvdata(dev);
333 ctx_dev = dev->platform_data;
334
335 if (!iommu_drvdata || !ctx_drvdata || !ctx_dev)
336 goto fail;
337
338 ret = __flush_iotlb(domain);
339 if (ret)
340 goto fail;
341
342 ret = __enable_clocks(iommu_drvdata);
343 if (ret)
344 goto fail;
345
346 __reset_context(iommu_drvdata->base, ctx_dev->num);
347 __disable_clocks(iommu_drvdata);
348 list_del_init(&ctx_drvdata->attached_elm);
349
350fail:
351 spin_unlock_irqrestore(&msm_iommu_lock, flags);
352}
353
354static int msm_iommu_map(struct iommu_domain *domain, unsigned long va,
355 phys_addr_t pa, int order, int prot)
356{
357 struct msm_priv *priv;
358 unsigned long flags;
359 unsigned long *fl_table;
360 unsigned long *fl_pte;
361 unsigned long fl_offset;
362 unsigned long *sl_table;
363 unsigned long *sl_pte;
364 unsigned long sl_offset;
365 unsigned int pgprot;
366 size_t len = 0x1000UL << order;
367 int ret = 0, tex, sh;
368
369 spin_lock_irqsave(&msm_iommu_lock, flags);
370
371 sh = (prot & MSM_IOMMU_ATTR_SH) ? 1 : 0;
372 tex = msm_iommu_tex_class[prot & MSM_IOMMU_CP_MASK];
373
374 if (tex < 0 || tex > NUM_TEX_CLASS - 1) {
375 ret = -EINVAL;
376 goto fail;
377 }
378
379 priv = domain->priv;
380 if (!priv) {
381 ret = -EINVAL;
382 goto fail;
383 }
384
385 fl_table = priv->pgtable;
386
387 if (len != SZ_16M && len != SZ_1M &&
388 len != SZ_64K && len != SZ_4K) {
389		pr_debug("Bad size: %zu\n", len);
390 ret = -EINVAL;
391 goto fail;
392 }
393
394 if (!fl_table) {
395 pr_debug("Null page table\n");
396 ret = -EINVAL;
397 goto fail;
398 }
399
400 if (len == SZ_16M || len == SZ_1M) {
401 pgprot = sh ? FL_SHARED : 0;
402 pgprot |= tex & 0x01 ? FL_BUFFERABLE : 0;
403 pgprot |= tex & 0x02 ? FL_CACHEABLE : 0;
404 pgprot |= tex & 0x04 ? FL_TEX0 : 0;
405 } else {
406 pgprot = sh ? SL_SHARED : 0;
407 pgprot |= tex & 0x01 ? SL_BUFFERABLE : 0;
408 pgprot |= tex & 0x02 ? SL_CACHEABLE : 0;
409 pgprot |= tex & 0x04 ? SL_TEX0 : 0;
410 }
411
412 fl_offset = FL_OFFSET(va); /* Upper 12 bits */
413 fl_pte = fl_table + fl_offset; /* int pointers, 4 bytes */
414
415 if (len == SZ_16M) {
416		int i;
417 for (i = 0; i < 16; i++)
418 *(fl_pte+i) = (pa & 0xFF000000) | FL_SUPERSECTION |
419 FL_AP_READ | FL_AP_WRITE | FL_TYPE_SECT |
420 FL_SHARED | FL_NG | pgprot;
421 }
422
423 if (len == SZ_1M)
424 *fl_pte = (pa & 0xFFF00000) | FL_AP_READ | FL_AP_WRITE | FL_NG |
425 FL_TYPE_SECT | FL_SHARED | pgprot;
426
427 /* Need a 2nd level table */
428 if ((len == SZ_4K || len == SZ_64K) && (*fl_pte) == 0) {
429 unsigned long *sl;
430 sl = (unsigned long *) __get_free_pages(GFP_ATOMIC,
431 get_order(SZ_4K));
432
433 if (!sl) {
434 pr_debug("Could not allocate second level table\n");
435 ret = -ENOMEM;
436 goto fail;
437 }
438
439 memset(sl, 0, SZ_4K);
440 *fl_pte = ((((int)__pa(sl)) & FL_BASE_MASK) | FL_TYPE_TABLE);
441 }
442
443 sl_table = (unsigned long *) __va(((*fl_pte) & FL_BASE_MASK));
444 sl_offset = SL_OFFSET(va);
445 sl_pte = sl_table + sl_offset;
446
447
448 if (len == SZ_4K)
449 *sl_pte = (pa & SL_BASE_MASK_SMALL) | SL_AP0 | SL_AP1 | SL_NG |
450 SL_SHARED | SL_TYPE_SMALL | pgprot;
451
452 if (len == SZ_64K) {
453 int i;
454
455 for (i = 0; i < 16; i++)
456 *(sl_pte+i) = (pa & SL_BASE_MASK_LARGE) | SL_AP0 |
457 SL_NG | SL_AP1 | SL_SHARED | SL_TYPE_LARGE | pgprot;
458 }
459
460 ret = __flush_iotlb(domain);
461fail:
462 spin_unlock_irqrestore(&msm_iommu_lock, flags);
463 return ret;
464}
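
Because len is derived as 0x1000UL << order, only four orders reach the size check above; a worked reference for the mapping between order and page size (4 KB base granule, as in the code):

/*
 *	order 0  -> 0x1000 << 0  = SZ_4K   (second-level small page)
 *	order 4  -> 0x1000 << 4  = SZ_64K  (second-level large page, 16 PTEs)
 *	order 8  -> 0x1000 << 8  = SZ_1M   (first-level section)
 *	order 12 -> 0x1000 << 12 = SZ_16M  (supersection, 16 first-level PTEs)
 * Any other order is rejected with -EINVAL, and va/pa are expected to be
 * naturally aligned to the chosen size.
 */
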
465
466static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va,
467 int order)
468{
469 struct msm_priv *priv;
470 unsigned long flags;
471 unsigned long *fl_table;
472 unsigned long *fl_pte;
473 unsigned long fl_offset;
474 unsigned long *sl_table;
475 unsigned long *sl_pte;
476 unsigned long sl_offset;
477 size_t len = 0x1000UL << order;
478 int i, ret = 0;
479
480 spin_lock_irqsave(&msm_iommu_lock, flags);
481
482 priv = domain->priv;
483
484 if (!priv) {
485 ret = -ENODEV;
486 goto fail;
487 }
488
489 fl_table = priv->pgtable;
490
491 if (len != SZ_16M && len != SZ_1M &&
492 len != SZ_64K && len != SZ_4K) {
493		pr_debug("Bad length: %zu\n", len);
494 ret = -EINVAL;
495 goto fail;
496 }
497
498 if (!fl_table) {
499 pr_debug("Null page table\n");
500 ret = -EINVAL;
501 goto fail;
502 }
503
504 fl_offset = FL_OFFSET(va); /* Upper 12 bits */
505 fl_pte = fl_table + fl_offset; /* int pointers, 4 bytes */
506
507 if (*fl_pte == 0) {
508 pr_debug("First level PTE is 0\n");
509 ret = -ENODEV;
510 goto fail;
511 }
512
513 /* Unmap supersection */
514 if (len == SZ_16M)
515 for (i = 0; i < 16; i++)
516 *(fl_pte+i) = 0;
517
518 if (len == SZ_1M)
519 *fl_pte = 0;
520
521 sl_table = (unsigned long *) __va(((*fl_pte) & FL_BASE_MASK));
522 sl_offset = SL_OFFSET(va);
523 sl_pte = sl_table + sl_offset;
524
525 if (len == SZ_64K) {
526 for (i = 0; i < 16; i++)
527 *(sl_pte+i) = 0;
528 }
529
530 if (len == SZ_4K)
531 *sl_pte = 0;
532
533 if (len == SZ_4K || len == SZ_64K) {
534 int used = 0;
535
536 for (i = 0; i < NUM_SL_PTE; i++)
537 if (sl_table[i])
538 used = 1;
539 if (!used) {
540 free_page((unsigned long)sl_table);
541 *fl_pte = 0;
542 }
543 }
544
545 ret = __flush_iotlb(domain);
546fail:
547 spin_unlock_irqrestore(&msm_iommu_lock, flags);
548 return ret;
549}
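
Both map and unmap split the virtual address with FL_OFFSET()/SL_OFFSET(); a worked decomposition under the standard ARM short-descriptor layout these macros implement (the example address is made up):

/*
 * For va = 0x40123456:
 *	first-level index  = va >> 20          = 0x401  (upper 12 bits)
 *	second-level index = (va >> 12) & 0xFF = 0x23   (next 8 bits)
 *	page offset        = va & 0xFFF        = 0x456  (low 12 bits)
 */
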
550
551static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain,
552 unsigned long va)
553{
554 struct msm_priv *priv;
555 struct msm_iommu_drvdata *iommu_drvdata;
556 struct msm_iommu_ctx_drvdata *ctx_drvdata;
557 unsigned int par;
558 unsigned long flags;
559 void __iomem *base;
560 phys_addr_t ret = 0;
561 int ctx;
562
563 spin_lock_irqsave(&msm_iommu_lock, flags);
564
565 priv = domain->priv;
566 if (list_empty(&priv->list_attached))
567 goto fail;
568
569 ctx_drvdata = list_entry(priv->list_attached.next,
570 struct msm_iommu_ctx_drvdata, attached_elm);
571 iommu_drvdata = dev_get_drvdata(ctx_drvdata->pdev->dev.parent);
572
573 base = iommu_drvdata->base;
574 ctx = ctx_drvdata->num;
575
576 ret = __enable_clocks(iommu_drvdata);
577 if (ret)
578 goto fail;
579
580 /* Invalidate context TLB */
581 SET_CTX_TLBIALL(base, ctx, 0);
582 SET_V2PPR(base, ctx, va & V2Pxx_VA);
583
584 par = GET_PAR(base, ctx);
585
586 /* We are dealing with a supersection */
587 if (GET_NOFAULT_SS(base, ctx))
588 ret = (par & 0xFF000000) | (va & 0x00FFFFFF);
589 else /* Upper 20 bits from PAR, lower 12 from VA */
590 ret = (par & 0xFFFFF000) | (va & 0x00000FFF);
591
592 if (GET_FAULT(base, ctx))
593 ret = 0;
594
595 __disable_clocks(iommu_drvdata);
596fail:
597 spin_unlock_irqrestore(&msm_iommu_lock, flags);
598 return ret;
599}
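
A hedged usage note: the lookup above is performed by the hardware V2P engine of the first attached context (with clocks temporarily enabled), so it is only meaningful while a context is attached, and a return of 0 signals a translation fault. A minimal consumer sketch, assuming <linux/iommu.h> is included; the helper name is made up.

static bool example_iova_is_mapped(struct iommu_domain *domain,
				   unsigned long iova)
{
	/* iommu_iova_to_phys() ends up in msm_iommu_iova_to_phys() above */
	return iommu_iova_to_phys(domain, iova) != 0;
}
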
600
601static int msm_iommu_domain_has_cap(struct iommu_domain *domain,
602 unsigned long cap)
603{
604 return 0;
605}
606
607static void print_ctx_regs(void __iomem *base, int ctx)
608{
609 unsigned int fsr = GET_FSR(base, ctx);
610 pr_err("FAR = %08x PAR = %08x\n",
611 GET_FAR(base, ctx), GET_PAR(base, ctx));
612 pr_err("FSR = %08x [%s%s%s%s%s%s%s%s%s%s]\n", fsr,
613 (fsr & 0x02) ? "TF " : "",
614 (fsr & 0x04) ? "AFF " : "",
615 (fsr & 0x08) ? "APF " : "",
616 (fsr & 0x10) ? "TLBMF " : "",
617 (fsr & 0x20) ? "HTWDEEF " : "",
618 (fsr & 0x40) ? "HTWSEEF " : "",
619 (fsr & 0x80) ? "MHF " : "",
620 (fsr & 0x10000) ? "SL " : "",
621 (fsr & 0x40000000) ? "SS " : "",
622 (fsr & 0x80000000) ? "MULTI " : "");
623
624 pr_err("FSYNR0 = %08x FSYNR1 = %08x\n",
625 GET_FSYNR0(base, ctx), GET_FSYNR1(base, ctx));
626 pr_err("TTBR0 = %08x TTBR1 = %08x\n",
627 GET_TTBR0(base, ctx), GET_TTBR1(base, ctx));
628 pr_err("SCTLR = %08x ACTLR = %08x\n",
629 GET_SCTLR(base, ctx), GET_ACTLR(base, ctx));
630 pr_err("PRRR = %08x NMRR = %08x\n",
631 GET_PRRR(base, ctx), GET_NMRR(base, ctx));
632}
633
634irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
635{
636 struct msm_iommu_drvdata *drvdata = dev_id;
637 void __iomem *base;
638 unsigned int fsr;
639 int i, ret;
640
641 spin_lock(&msm_iommu_lock);
642
643 if (!drvdata) {
644 pr_err("Invalid device ID in context interrupt handler\n");
645 goto fail;
646 }
647
648 base = drvdata->base;
649
650 pr_err("Unexpected IOMMU page fault!\n");
651	pr_err("base = %p\n", base);
652
653 ret = __enable_clocks(drvdata);
654 if (ret)
655 goto fail;
656
657 for (i = 0; i < drvdata->ncb; i++) {
658 fsr = GET_FSR(base, i);
659 if (fsr) {
660 pr_err("Fault occurred in context %d.\n", i);
661 pr_err("Interesting registers:\n");
662 print_ctx_regs(base, i);
663 SET_FSR(base, i, 0x4000000F);
664 }
665 }
666 __disable_clocks(drvdata);
667fail:
668 spin_unlock(&msm_iommu_lock);
669 return 0;
670}
671
672static struct iommu_ops msm_iommu_ops = {
673 .domain_init = msm_iommu_domain_init,
674 .domain_destroy = msm_iommu_domain_destroy,
675 .attach_dev = msm_iommu_attach_dev,
676 .detach_dev = msm_iommu_detach_dev,
677 .map = msm_iommu_map,
678 .unmap = msm_iommu_unmap,
679 .iova_to_phys = msm_iommu_iova_to_phys,
680 .domain_has_cap = msm_iommu_domain_has_cap
681};
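
For context, a minimal end-to-end consumer sketch of these callbacks through the generic API exported by drivers/iommu/iommu.c as it stands in this tree (iommu_domain_alloc() still takes no bus argument here). The context name, addresses and order are made up; MSM_IOMMU_ATTR_* are assumed to come from mach/iommu.h, as used by msm_iommu_map() above.

#include <linux/iommu.h>
#include <mach/iommu.h>

static int example_msm_iommu_use(void)
{
	struct iommu_domain *domain;
	struct device *ctx;
	int ret;

	domain = iommu_domain_alloc();			/* -> msm_iommu_domain_init() */
	if (!domain)
		return -ENOMEM;

	ctx = msm_iommu_get_ctx("example_ctx");		/* hypothetical context name */
	if (!ctx) {
		ret = -ENODEV;
		goto out_free;
	}

	ret = iommu_attach_device(domain, ctx);		/* -> msm_iommu_attach_dev() */
	if (ret)
		goto out_free;

	/* Map 1 MB (order 8) at IOVA 0x40000000 to PA 0x80000000 */
	ret = iommu_map(domain, 0x40000000, 0x80000000, 8,
			MSM_IOMMU_ATTR_CACHED_WB_WA | MSM_IOMMU_ATTR_SH);
	if (ret)
		goto out_detach;

	if (!iommu_iova_to_phys(domain, 0x40000000))	/* 0 means not mapped */
		ret = -EFAULT;

	iommu_unmap(domain, 0x40000000, 8);		/* -> msm_iommu_unmap() */
out_detach:
	iommu_detach_device(domain, ctx);		/* -> msm_iommu_detach_dev() */
out_free:
	iommu_domain_free(domain);			/* -> msm_iommu_domain_destroy() */
	return ret;
}
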
682
683static int __init get_tex_class(int icp, int ocp, int mt, int nos)
684{
685 int i = 0;
686 unsigned int prrr = 0;
687 unsigned int nmrr = 0;
688 int c_icp, c_ocp, c_mt, c_nos;
689
690 RCP15_PRRR(prrr);
691 RCP15_NMRR(nmrr);
692
693 for (i = 0; i < NUM_TEX_CLASS; i++) {
694 c_nos = PRRR_NOS(prrr, i);
695 c_mt = PRRR_MT(prrr, i);
696 c_icp = NMRR_ICP(nmrr, i);
697 c_ocp = NMRR_OCP(nmrr, i);
698
699 if (icp == c_icp && ocp == c_ocp && c_mt == mt && c_nos == nos)
700 return i;
701 }
702
703 return -ENODEV;
704}
705
706static void __init setup_iommu_tex_classes(void)
707{
708 msm_iommu_tex_class[MSM_IOMMU_ATTR_NONCACHED] =
709 get_tex_class(CP_NONCACHED, CP_NONCACHED, MT_NORMAL, 1);
710
711 msm_iommu_tex_class[MSM_IOMMU_ATTR_CACHED_WB_WA] =
712 get_tex_class(CP_WB_WA, CP_WB_WA, MT_NORMAL, 1);
713
714 msm_iommu_tex_class[MSM_IOMMU_ATTR_CACHED_WB_NWA] =
715 get_tex_class(CP_WB_NWA, CP_WB_NWA, MT_NORMAL, 1);
716
717 msm_iommu_tex_class[MSM_IOMMU_ATTR_CACHED_WT] =
718 get_tex_class(CP_WT, CP_WT, MT_NORMAL, 1);
719}
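
A note worth keeping in mind when reading the table above:

/*
 * get_tex_class() returns -ENODEV when no TEX-remap class matches, so a
 * msm_iommu_tex_class[] slot can legitimately hold a negative value.
 * msm_iommu_map() guards against this with its
 * "tex < 0 || tex > NUM_TEX_CLASS - 1" check, so an unconfigured
 * attribute fails the mapping with -EINVAL instead of producing a bogus
 * descriptor.
 */
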
720
721static int __init msm_iommu_init(void)
722{
723 setup_iommu_tex_classes();
724 register_iommu(&msm_iommu_ops);
725 return 0;
726}
727
728subsys_initcall(msm_iommu_init);
729
730MODULE_LICENSE("GPL v2");
731MODULE_AUTHOR("Stepan Moskovchenko <stepanm@codeaurora.org>");
diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c
new file mode 100644
index 000000000000..8e8fb079852d
--- /dev/null
+++ b/drivers/iommu/msm_iommu_dev.c
@@ -0,0 +1,422 @@
1/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 and
5 * only version 2 as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
15 * 02110-1301, USA.
16 */
17
18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/platform_device.h>
23#include <linux/io.h>
24#include <linux/clk.h>
25#include <linux/iommu.h>
26#include <linux/interrupt.h>
27#include <linux/err.h>
28#include <linux/slab.h>
29
30#include <mach/iommu_hw-8xxx.h>
31#include <mach/iommu.h>
32#include <mach/clk.h>
33
34struct iommu_ctx_iter_data {
35 /* input */
36 const char *name;
37
38 /* output */
39 struct device *dev;
40};
41
42static struct platform_device *msm_iommu_root_dev;
43
44static int each_iommu_ctx(struct device *dev, void *data)
45{
46 struct iommu_ctx_iter_data *res = data;
47 struct msm_iommu_ctx_dev *c = dev->platform_data;
48
49 if (!res || !c || !c->name || !res->name)
50 return -EINVAL;
51
52 if (!strcmp(res->name, c->name)) {
53 res->dev = dev;
54 return 1;
55 }
56 return 0;
57}
58
59static int each_iommu(struct device *dev, void *data)
60{
61 return device_for_each_child(dev, data, each_iommu_ctx);
62}
63
64struct device *msm_iommu_get_ctx(const char *ctx_name)
65{
66 struct iommu_ctx_iter_data r;
67 int found;
68
69 if (!msm_iommu_root_dev) {
70 pr_err("No root IOMMU device.\n");
71 goto fail;
72 }
73
74 r.name = ctx_name;
75 found = device_for_each_child(&msm_iommu_root_dev->dev, &r, each_iommu);
76
77 if (!found) {
78 pr_err("Could not find context <%s>\n", ctx_name);
79 goto fail;
80 }
81
82 return r.dev;
83fail:
84 return NULL;
85}
86EXPORT_SYMBOL(msm_iommu_get_ctx);
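
A hedged usage sketch: the name passed in must match the platform data of one of the "msm_iommu_ctx" child devices registered by the board (see msm_iommu_ctx_probe() below). "example_ctx" is a made-up name and the helper exists only for illustration; <linux/iommu.h> is already included by this file.

static struct iommu_domain *example_attach_by_name(const char *name)
{
	struct iommu_domain *domain = iommu_domain_alloc();
	struct device *ctx = msm_iommu_get_ctx(name);

	if (!domain || !ctx || iommu_attach_device(domain, ctx)) {
		if (domain)
			iommu_domain_free(domain);
		return NULL;
	}
	return domain;
}
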
87
88static void msm_iommu_reset(void __iomem *base, int ncb)
89{
90 int ctx;
91
92 SET_RPUE(base, 0);
93 SET_RPUEIE(base, 0);
94 SET_ESRRESTORE(base, 0);
95 SET_TBE(base, 0);
96 SET_CR(base, 0);
97 SET_SPDMBE(base, 0);
98 SET_TESTBUSCR(base, 0);
99 SET_TLBRSW(base, 0);
100 SET_GLOBAL_TLBIALL(base, 0);
101 SET_RPU_ACR(base, 0);
102 SET_TLBLKCRWE(base, 1);
103
104 for (ctx = 0; ctx < ncb; ctx++) {
105 SET_BPRCOSH(base, ctx, 0);
106 SET_BPRCISH(base, ctx, 0);
107 SET_BPRCNSH(base, ctx, 0);
108 SET_BPSHCFG(base, ctx, 0);
109 SET_BPMTCFG(base, ctx, 0);
110 SET_ACTLR(base, ctx, 0);
111 SET_SCTLR(base, ctx, 0);
112 SET_FSRRESTORE(base, ctx, 0);
113 SET_TTBR0(base, ctx, 0);
114 SET_TTBR1(base, ctx, 0);
115 SET_TTBCR(base, ctx, 0);
116 SET_BFBCR(base, ctx, 0);
117 SET_PAR(base, ctx, 0);
118 SET_FAR(base, ctx, 0);
119 SET_CTX_TLBIALL(base, ctx, 0);
120 SET_TLBFLPTER(base, ctx, 0);
121 SET_TLBSLPTER(base, ctx, 0);
122 SET_TLBLKCR(base, ctx, 0);
123 SET_PRRR(base, ctx, 0);
124 SET_NMRR(base, ctx, 0);
125 SET_CONTEXTIDR(base, ctx, 0);
126 }
127}
128
129static int msm_iommu_probe(struct platform_device *pdev)
130{
131 struct resource *r, *r2;
132 struct clk *iommu_clk;
133 struct clk *iommu_pclk;
134 struct msm_iommu_drvdata *drvdata;
135 struct msm_iommu_dev *iommu_dev = pdev->dev.platform_data;
136 void __iomem *regs_base;
137 resource_size_t len;
138 int ret, irq, par;
139
140 if (pdev->id == -1) {
141 msm_iommu_root_dev = pdev;
142 return 0;
143 }
144
145 drvdata = kzalloc(sizeof(*drvdata), GFP_KERNEL);
146
147 if (!drvdata) {
148 ret = -ENOMEM;
149 goto fail;
150 }
151
152 if (!iommu_dev) {
153 ret = -ENODEV;
154 goto fail;
155 }
156
157 iommu_pclk = clk_get(NULL, "smmu_pclk");
158 if (IS_ERR(iommu_pclk)) {
159 ret = -ENODEV;
160 goto fail;
161 }
162
163 ret = clk_enable(iommu_pclk);
164 if (ret)
165 goto fail_enable;
166
167 iommu_clk = clk_get(&pdev->dev, "iommu_clk");
168
169 if (!IS_ERR(iommu_clk)) {
170 if (clk_get_rate(iommu_clk) == 0)
171 clk_set_min_rate(iommu_clk, 1);
172
173 ret = clk_enable(iommu_clk);
174 if (ret) {
175 clk_put(iommu_clk);
176 goto fail_pclk;
177 }
178 } else
179 iommu_clk = NULL;
180
181 r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "physbase");
182
183 if (!r) {
184 ret = -ENODEV;
185 goto fail_clk;
186 }
187
188 len = resource_size(r);
189
190 r2 = request_mem_region(r->start, len, r->name);
191 if (!r2) {
192		pr_err("Could not request memory region: start=0x%08llx, len=%llu\n",
193			(unsigned long long) r->start, (unsigned long long) len);
194 ret = -EBUSY;
195 goto fail_clk;
196 }
197
198 regs_base = ioremap(r2->start, len);
199
200 if (!regs_base) {
201		pr_err("Could not ioremap: start=0x%08llx, len=%llu\n",
202			(unsigned long long) r2->start, (unsigned long long) len);
203 ret = -EBUSY;
204 goto fail_mem;
205 }
206
207 irq = platform_get_irq_byname(pdev, "secure_irq");
208 if (irq < 0) {
209 ret = -ENODEV;
210 goto fail_io;
211 }
212
213 msm_iommu_reset(regs_base, iommu_dev->ncb);
214
215 SET_M(regs_base, 0, 1);
216 SET_PAR(regs_base, 0, 0);
217 SET_V2PCFG(regs_base, 0, 1);
218 SET_V2PPR(regs_base, 0, 0);
219 par = GET_PAR(regs_base, 0);
220 SET_V2PCFG(regs_base, 0, 0);
221 SET_M(regs_base, 0, 0);
222
223 if (!par) {
224 pr_err("%s: Invalid PAR value detected\n", iommu_dev->name);
225 ret = -ENODEV;
226 goto fail_io;
227 }
228
229 ret = request_irq(irq, msm_iommu_fault_handler, 0,
230 "msm_iommu_secure_irpt_handler", drvdata);
231 if (ret) {
232 pr_err("Request IRQ %d failed with ret=%d\n", irq, ret);
233 goto fail_io;
234 }
235
236
237 drvdata->pclk = iommu_pclk;
238 drvdata->clk = iommu_clk;
239 drvdata->base = regs_base;
240 drvdata->irq = irq;
241 drvdata->ncb = iommu_dev->ncb;
242
243 pr_info("device %s mapped at %p, irq %d with %d ctx banks\n",
244 iommu_dev->name, regs_base, irq, iommu_dev->ncb);
245
246 platform_set_drvdata(pdev, drvdata);
247
248 if (iommu_clk)
249 clk_disable(iommu_clk);
250
251 clk_disable(iommu_pclk);
252
253 return 0;
254fail_io:
255 iounmap(regs_base);
256fail_mem:
257 release_mem_region(r->start, len);
258fail_clk:
259 if (iommu_clk) {
260 clk_disable(iommu_clk);
261 clk_put(iommu_clk);
262 }
263fail_pclk:
264 clk_disable(iommu_pclk);
265fail_enable:
266 clk_put(iommu_pclk);
267fail:
268 kfree(drvdata);
269 return ret;
270}
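
A hedged board-file sketch of what this probe expects: a root "msm_iommu" platform device with id -1 (captured as msm_iommu_root_dev above), plus one "msm_iommu" device per IOMMU instance carrying a "physbase" MEM resource, a "secure_irq" IRQ resource and struct msm_iommu_dev platform data providing the name and ncb fields used here. The register base, IRQ number, window size and instance name below are made up.

static struct resource example_iommu_resources[] = {
	{
		.name	= "physbase",
		.start	= 0x07C00000,		/* made-up register base */
		.end	= 0x07C00000 + SZ_1M - 1,
		.flags	= IORESOURCE_MEM,
	},
	{
		.name	= "secure_irq",
		.start	= 100,			/* made-up IRQ */
		.end	= 100,
		.flags	= IORESOURCE_IRQ,
	},
};

static struct msm_iommu_dev example_iommu_pdata = {
	.name	= "example_iommu",
	.ncb	= 2,				/* number of context banks */
};

static struct platform_device example_iommu_device = {
	.name		= "msm_iommu",
	.id		= 0,
	.num_resources	= ARRAY_SIZE(example_iommu_resources),
	.resource	= example_iommu_resources,
	.dev = {
		.platform_data = &example_iommu_pdata,
	},
};
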
271
272static int msm_iommu_remove(struct platform_device *pdev)
273{
274 struct msm_iommu_drvdata *drv = NULL;
275
276 drv = platform_get_drvdata(pdev);
277 if (drv) {
278 if (drv->clk)
279 clk_put(drv->clk);
280 clk_put(drv->pclk);
281 memset(drv, 0, sizeof(*drv));
282 kfree(drv);
283 platform_set_drvdata(pdev, NULL);
284 }
285 return 0;
286}
287
288static int msm_iommu_ctx_probe(struct platform_device *pdev)
289{
290 struct msm_iommu_ctx_dev *c = pdev->dev.platform_data;
291 struct msm_iommu_drvdata *drvdata;
292 struct msm_iommu_ctx_drvdata *ctx_drvdata = NULL;
293 int i, ret;
294 if (!c || !pdev->dev.parent) {
295 ret = -EINVAL;
296 goto fail;
297 }
298
299 drvdata = dev_get_drvdata(pdev->dev.parent);
300
301 if (!drvdata) {
302 ret = -ENODEV;
303 goto fail;
304 }
305
306 ctx_drvdata = kzalloc(sizeof(*ctx_drvdata), GFP_KERNEL);
307 if (!ctx_drvdata) {
308 ret = -ENOMEM;
309 goto fail;
310 }
311 ctx_drvdata->num = c->num;
312 ctx_drvdata->pdev = pdev;
313
314 INIT_LIST_HEAD(&ctx_drvdata->attached_elm);
315 platform_set_drvdata(pdev, ctx_drvdata);
316
317 ret = clk_enable(drvdata->pclk);
318 if (ret)
319 goto fail;
320
321 if (drvdata->clk) {
322 ret = clk_enable(drvdata->clk);
323 if (ret) {
324 clk_disable(drvdata->pclk);
325 goto fail;
326 }
327 }
328
329 /* Program the M2V tables for this context */
330 for (i = 0; i < MAX_NUM_MIDS; i++) {
331 int mid = c->mids[i];
332 if (mid == -1)
333 break;
334
335 SET_M2VCBR_N(drvdata->base, mid, 0);
336 SET_CBACR_N(drvdata->base, c->num, 0);
337
338 /* Set VMID = 0 */
339 SET_VMID(drvdata->base, mid, 0);
340
341 /* Set the context number for that MID to this context */
342 SET_CBNDX(drvdata->base, mid, c->num);
343
344		/* Set the VMID associated with this context bank to 0 */
345 SET_CBVMID(drvdata->base, c->num, 0);
346
347 /* Set the ASID for TLB tagging for this context */
348 SET_CONTEXTIDR_ASID(drvdata->base, c->num, c->num);
349
350 /* Set security bit override to be Non-secure */
351 SET_NSCFG(drvdata->base, mid, 3);
352 }
353
354 if (drvdata->clk)
355 clk_disable(drvdata->clk);
356 clk_disable(drvdata->pclk);
357
358 dev_info(&pdev->dev, "context %s using bank %d\n", c->name, c->num);
359 return 0;
360fail:
361 kfree(ctx_drvdata);
362 return ret;
363}
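
A hedged board-file sketch of the context description consumed above: a "msm_iommu_ctx" platform device whose parent is the owning "msm_iommu" device, with struct msm_iommu_ctx_dev platform data giving the context name, the context bank number and the master IDs routed to it (mids[] holds at most MAX_NUM_MIDS entries and is terminated with -1). The name, bank number and MIDs below are made up.

static struct msm_iommu_ctx_dev example_ctx_pdata = {
	.name	= "example_ctx",
	.num	= 0,			/* context bank 0 */
	.mids	= { 0, 1, -1 },		/* made-up MIDs, -1 terminated */
};

static struct platform_device example_ctx_device = {
	.name	= "msm_iommu_ctx",
	.id	= 0,
	.dev = {
		.platform_data = &example_ctx_pdata,
		/* .parent must point at the owning "msm_iommu" device */
	},
};
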
364
365static int msm_iommu_ctx_remove(struct platform_device *pdev)
366{
367 struct msm_iommu_ctx_drvdata *drv = NULL;
368 drv = platform_get_drvdata(pdev);
369 if (drv) {
370 memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata));
371 kfree(drv);
372 platform_set_drvdata(pdev, NULL);
373 }
374 return 0;
375}
376
377static struct platform_driver msm_iommu_driver = {
378 .driver = {
379 .name = "msm_iommu",
380 },
381 .probe = msm_iommu_probe,
382 .remove = msm_iommu_remove,
383};
384
385static struct platform_driver msm_iommu_ctx_driver = {
386 .driver = {
387 .name = "msm_iommu_ctx",
388 },
389 .probe = msm_iommu_ctx_probe,
390 .remove = msm_iommu_ctx_remove,
391};
392
393static int __init msm_iommu_driver_init(void)
394{
395 int ret;
396 ret = platform_driver_register(&msm_iommu_driver);
397 if (ret != 0) {
398 pr_err("Failed to register IOMMU driver\n");
399 goto error;
400 }
401
402 ret = platform_driver_register(&msm_iommu_ctx_driver);
403	if (ret != 0) {
404		pr_err("Failed to register IOMMU context driver\n");
405		platform_driver_unregister(&msm_iommu_driver);
406	}
407
408error:
409 return ret;
410}
411
412static void __exit msm_iommu_driver_exit(void)
413{
414 platform_driver_unregister(&msm_iommu_ctx_driver);
415 platform_driver_unregister(&msm_iommu_driver);
416}
417
418subsys_initcall(msm_iommu_driver_init);
419module_exit(msm_iommu_driver_exit);
420
421MODULE_LICENSE("GPL v2");
422MODULE_AUTHOR("Stepan Moskovchenko <stepanm@codeaurora.org>");