 MAINTAINERS                              |    3
 drivers/gpu/drm/panfrost/panfrost_mmu.c  |   24
 drivers/iommu/Makefile                   |    2
 drivers/iommu/amd_iommu.c                |   11
 drivers/iommu/arm-smmu-impl.c            |  174
 drivers/iommu/arm-smmu-regs.h            |  210
 drivers/iommu/arm-smmu-v3.c              |  973
 drivers/iommu/arm-smmu.c                 |  662
 drivers/iommu/arm-smmu.h                 |  402
 drivers/iommu/dma-iommu.c                |    9
 drivers/iommu/exynos-iommu.c             |    3
 drivers/iommu/intel-iommu.c              |    3
 drivers/iommu/io-pgtable-arm-v7s.c       |   57
 drivers/iommu/io-pgtable-arm.c           |   48
 drivers/iommu/iommu.c                    |   24
 drivers/iommu/ipmmu-vmsa.c               |   28
 drivers/iommu/msm_iommu.c                |   42
 drivers/iommu/mtk_iommu.c                |   45
 drivers/iommu/mtk_iommu_v1.c             |    3
 drivers/iommu/omap-iommu.c               |    2
 drivers/iommu/qcom_iommu.c               |   61
 drivers/iommu/rockchip-iommu.c           |    2
 drivers/iommu/s390-iommu.c               |    3
 drivers/iommu/tegra-gart.c               |   12
 drivers/iommu/tegra-smmu.c               |    2
 drivers/iommu/virtio-iommu.c             |    5
 drivers/vfio/vfio_iommu_type1.c          |   27
 include/linux/io-pgtable.h               |   57
 include/linux/iommu.h                    |   92
 29 files changed, 1943 insertions(+), 1043 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 43604d6ab96c..cf04f72ca79f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1350,8 +1350,7 @@ M: Will Deacon <will@kernel.org>
1350R: Robin Murphy <robin.murphy@arm.com> 1350R: Robin Murphy <robin.murphy@arm.com>
1351L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) 1351L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
1352S: Maintained 1352S: Maintained
1353F: drivers/iommu/arm-smmu.c 1353F: drivers/iommu/arm-smmu*
1354F: drivers/iommu/arm-smmu-v3.c
1355F: drivers/iommu/io-pgtable-arm.c 1354F: drivers/iommu/io-pgtable-arm.c
1356F: drivers/iommu/io-pgtable-arm-v7s.c 1355F: drivers/iommu/io-pgtable-arm-v7s.c
1357 1356
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index 92ac995dd9c6..6e8145c36e93 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -222,7 +222,7 @@ void panfrost_mmu_unmap(struct panfrost_gem_object *bo)
222 size_t unmapped_page; 222 size_t unmapped_page;
223 size_t pgsize = get_pgsize(iova, len - unmapped_len); 223 size_t pgsize = get_pgsize(iova, len - unmapped_len);
224 224
225 unmapped_page = ops->unmap(ops, iova, pgsize); 225 unmapped_page = ops->unmap(ops, iova, pgsize, NULL);
226 if (!unmapped_page) 226 if (!unmapped_page)
227 break; 227 break;
228 228
@@ -247,20 +247,28 @@ static void mmu_tlb_inv_context_s1(void *cookie)
247 mmu_hw_do_operation(pfdev, 0, 0, ~0UL, AS_COMMAND_FLUSH_MEM); 247 mmu_hw_do_operation(pfdev, 0, 0, ~0UL, AS_COMMAND_FLUSH_MEM);
248} 248}
249 249
250static void mmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
251 size_t granule, bool leaf, void *cookie)
252{}
253
254static void mmu_tlb_sync_context(void *cookie) 250static void mmu_tlb_sync_context(void *cookie)
255{ 251{
256 //struct panfrost_device *pfdev = cookie; 252 //struct panfrost_device *pfdev = cookie;
257 // TODO: Wait 1000 GPU cycles for HW_ISSUE_6367/T60X 253 // TODO: Wait 1000 GPU cycles for HW_ISSUE_6367/T60X
258} 254}
259 255
260static const struct iommu_gather_ops mmu_tlb_ops = { 256static void mmu_tlb_flush_walk(unsigned long iova, size_t size, size_t granule,
257 void *cookie)
258{
259 mmu_tlb_sync_context(cookie);
260}
261
262static void mmu_tlb_flush_leaf(unsigned long iova, size_t size, size_t granule,
263 void *cookie)
264{
265 mmu_tlb_sync_context(cookie);
266}
267
268static const struct iommu_flush_ops mmu_tlb_ops = {
261 .tlb_flush_all = mmu_tlb_inv_context_s1, 269 .tlb_flush_all = mmu_tlb_inv_context_s1,
262 .tlb_add_flush = mmu_tlb_inv_range_nosync, 270 .tlb_flush_walk = mmu_tlb_flush_walk,
263 .tlb_sync = mmu_tlb_sync_context, 271 .tlb_flush_leaf = mmu_tlb_flush_leaf,
264}; 272};
265 273
266static const char *access_type_name(struct panfrost_device *pfdev, 274static const char *access_type_name(struct panfrost_device *pfdev,
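The panfrost hunks above convert the driver from the old iommu_gather_ops callback table to the new iommu_flush_ops table. For orientation, a rough sketch of the two tables, reconstructed from the signatures visible in this diff; treat field order and exact prototypes as approximate rather than authoritative.

/* Old table (pre-series), reconstructed for comparison. */
struct iommu_iotlb_gather;

struct iommu_gather_ops {
	void (*tlb_flush_all)(void *cookie);
	void (*tlb_add_flush)(unsigned long iova, size_t size,
			      size_t granule, bool leaf, void *cookie);
	void (*tlb_sync)(void *cookie);
};

/* New table introduced by this series. */
struct iommu_flush_ops {
	void (*tlb_flush_all)(void *cookie);
	void (*tlb_flush_walk)(unsigned long iova, size_t size,
			       size_t granule, void *cookie);
	void (*tlb_flush_leaf)(unsigned long iova, size_t size,
			       size_t granule, void *cookie);
	void (*tlb_add_page)(struct iommu_iotlb_gather *gather,
			     unsigned long iova, size_t granule,
			     void *cookie);
};

Since the panfrost MMU has no deferred-sync requirement, its tlb_flush_walk and tlb_flush_leaf callbacks simply reuse the existing sync helper, as the hunk shows.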
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index f13f36ae1af6..a2729aadd300 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
13obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o 13obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
14obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o 14obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o
15obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o 15obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
16obj-$(CONFIG_ARM_SMMU) += arm-smmu.o 16obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o
17obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o 17obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
18obj-$(CONFIG_DMAR_TABLE) += dmar.o 18obj-$(CONFIG_DMAR_TABLE) += dmar.o
19obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o 19obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index b607a92791d3..29eeea914660 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3055,7 +3055,8 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
3055} 3055}
3056 3056
3057static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, 3057static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
3058 size_t page_size) 3058 size_t page_size,
3059 struct iommu_iotlb_gather *gather)
3059{ 3060{
3060 struct protection_domain *domain = to_pdomain(dom); 3061 struct protection_domain *domain = to_pdomain(dom);
3061 size_t unmap_size; 3062 size_t unmap_size;
@@ -3196,9 +3197,10 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
3196 domain_flush_complete(dom); 3197 domain_flush_complete(dom);
3197} 3198}
3198 3199
3199static void amd_iommu_iotlb_range_add(struct iommu_domain *domain, 3200static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
3200 unsigned long iova, size_t size) 3201 struct iommu_iotlb_gather *gather)
3201{ 3202{
3203 amd_iommu_flush_iotlb_all(domain);
3202} 3204}
3203 3205
3204const struct iommu_ops amd_iommu_ops = { 3206const struct iommu_ops amd_iommu_ops = {
@@ -3219,8 +3221,7 @@ const struct iommu_ops amd_iommu_ops = {
3219 .is_attach_deferred = amd_iommu_is_attach_deferred, 3221 .is_attach_deferred = amd_iommu_is_attach_deferred,
3220 .pgsize_bitmap = AMD_IOMMU_PGSIZES, 3222 .pgsize_bitmap = AMD_IOMMU_PGSIZES,
3221 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 3223 .flush_iotlb_all = amd_iommu_flush_iotlb_all,
3222 .iotlb_range_add = amd_iommu_iotlb_range_add, 3224 .iotlb_sync = amd_iommu_iotlb_sync,
3223 .iotlb_sync = amd_iommu_flush_iotlb_all,
3224}; 3225};
3225 3226
3226/***************************************************************************** 3227/*****************************************************************************
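The AMD hunks above are the minimal conversion to the gather-based unmap path: ->unmap() now receives a struct iommu_iotlb_gather, and ->iotlb_sync() just flushes everything. Below is a minimal sketch of the gather structure and its page-accumulation helper as this series adds them to include/linux/iommu.h; it is illustrative, not a verbatim copy.

struct iommu_iotlb_gather {
	unsigned long	start;
	unsigned long	end;
	size_t		pgsize;
};

/*
 * Accumulate one unmapped page into the gather window. If the new page is
 * disjoint from the current range, or mapped at a different granule, sync
 * first so the gathered range stays contiguous and uniformly sized.
 */
static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
					       struct iommu_iotlb_gather *gather,
					       unsigned long iova, size_t size)
{
	unsigned long start = iova, end = start + size;

	if (gather->pgsize != size ||
	    end < gather->start || start > gather->end) {
		if (gather->pgsize)
			iommu_tlb_sync(domain, gather);
		gather->pgsize = size;
	}

	if (gather->start > start)
		gather->start = start;
	if (gather->end < end)
		gather->end = end;
}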
diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c
new file mode 100644
index 000000000000..5c87a38620c4
--- /dev/null
+++ b/drivers/iommu/arm-smmu-impl.c
@@ -0,0 +1,174 @@
1// SPDX-License-Identifier: GPL-2.0-only
2// Miscellaneous Arm SMMU implementation and integration quirks
3// Copyright (C) 2019 Arm Limited
4
5#define pr_fmt(fmt) "arm-smmu: " fmt
6
7#include <linux/bitfield.h>
8#include <linux/of.h>
9
10#include "arm-smmu.h"
11
12
13static int arm_smmu_gr0_ns(int offset)
14{
15 switch(offset) {
16 case ARM_SMMU_GR0_sCR0:
17 case ARM_SMMU_GR0_sACR:
18 case ARM_SMMU_GR0_sGFSR:
19 case ARM_SMMU_GR0_sGFSYNR0:
20 case ARM_SMMU_GR0_sGFSYNR1:
21 case ARM_SMMU_GR0_sGFSYNR2:
22 return offset + 0x400;
23 default:
24 return offset;
25 }
26}
27
28static u32 arm_smmu_read_ns(struct arm_smmu_device *smmu, int page,
29 int offset)
30{
31 if (page == ARM_SMMU_GR0)
32 offset = arm_smmu_gr0_ns(offset);
33 return readl_relaxed(arm_smmu_page(smmu, page) + offset);
34}
35
36static void arm_smmu_write_ns(struct arm_smmu_device *smmu, int page,
37 int offset, u32 val)
38{
39 if (page == ARM_SMMU_GR0)
40 offset = arm_smmu_gr0_ns(offset);
41 writel_relaxed(val, arm_smmu_page(smmu, page) + offset);
42}
43
44/* Since we don't care for sGFAR, we can do without 64-bit accessors */
45static const struct arm_smmu_impl calxeda_impl = {
46 .read_reg = arm_smmu_read_ns,
47 .write_reg = arm_smmu_write_ns,
48};
49
50
51struct cavium_smmu {
52 struct arm_smmu_device smmu;
53 u32 id_base;
54};
55
56static int cavium_cfg_probe(struct arm_smmu_device *smmu)
57{
58 static atomic_t context_count = ATOMIC_INIT(0);
59 struct cavium_smmu *cs = container_of(smmu, struct cavium_smmu, smmu);
60 /*
61 * Cavium CN88xx erratum #27704.
62 * Ensure ASID and VMID allocation is unique across all SMMUs in
63 * the system.
64 */
65 cs->id_base = atomic_fetch_add(smmu->num_context_banks, &context_count);
66 dev_notice(smmu->dev, "\tenabling workaround for Cavium erratum 27704\n");
67
68 return 0;
69}
70
71static int cavium_init_context(struct arm_smmu_domain *smmu_domain)
72{
73 struct cavium_smmu *cs = container_of(smmu_domain->smmu,
74 struct cavium_smmu, smmu);
75
76 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
77 smmu_domain->cfg.vmid += cs->id_base;
78 else
79 smmu_domain->cfg.asid += cs->id_base;
80
81 return 0;
82}
83
84static const struct arm_smmu_impl cavium_impl = {
85 .cfg_probe = cavium_cfg_probe,
86 .init_context = cavium_init_context,
87};
88
89static struct arm_smmu_device *cavium_smmu_impl_init(struct arm_smmu_device *smmu)
90{
91 struct cavium_smmu *cs;
92
93 cs = devm_kzalloc(smmu->dev, sizeof(*cs), GFP_KERNEL);
94 if (!cs)
95 return ERR_PTR(-ENOMEM);
96
97 cs->smmu = *smmu;
98 cs->smmu.impl = &cavium_impl;
99
100 devm_kfree(smmu->dev, smmu);
101
102 return &cs->smmu;
103}
104
105
106#define ARM_MMU500_ACTLR_CPRE (1 << 1)
107
108#define ARM_MMU500_ACR_CACHE_LOCK (1 << 26)
109#define ARM_MMU500_ACR_S2CRB_TLBEN (1 << 10)
110#define ARM_MMU500_ACR_SMTNMB_TLBEN (1 << 8)
111
112static int arm_mmu500_reset(struct arm_smmu_device *smmu)
113{
114 u32 reg, major;
115 int i;
116 /*
117 * On MMU-500 r2p0 onwards we need to clear ACR.CACHE_LOCK before
118 * writes to the context bank ACTLRs will stick. And we just hope that
119 * Secure has also cleared SACR.CACHE_LOCK for this to take effect...
120 */
121 reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID7);
122 major = FIELD_GET(ID7_MAJOR, reg);
123 reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sACR);
124 if (major >= 2)
125 reg &= ~ARM_MMU500_ACR_CACHE_LOCK;
126 /*
127 * Allow unmatched Stream IDs to allocate bypass
128 * TLB entries for reduced latency.
129 */
130 reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
131 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sACR, reg);
132
133 /*
134 * Disable MMU-500's not-particularly-beneficial next-page
135 * prefetcher for the sake of errata #841119 and #826419.
136 */
137 for (i = 0; i < smmu->num_context_banks; ++i) {
138 reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
139 reg &= ~ARM_MMU500_ACTLR_CPRE;
140 arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, reg);
141 }
142
143 return 0;
144}
145
146static const struct arm_smmu_impl arm_mmu500_impl = {
147 .reset = arm_mmu500_reset,
148};
149
150
151struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
152{
153 /*
154 * We will inevitably have to combine model-specific implementation
155 * quirks with platform-specific integration quirks, but everything
156 * we currently support happens to work out as straightforward
157 * mutually-exclusive assignments.
158 */
159 switch (smmu->model) {
160 case ARM_MMU500:
161 smmu->impl = &arm_mmu500_impl;
162 break;
163 case CAVIUM_SMMUV2:
164 return cavium_smmu_impl_init(smmu);
165 default:
166 break;
167 }
168
169 if (of_property_read_bool(smmu->dev->of_node,
170 "calxeda,smmu-secure-config-access"))
171 smmu->impl = &calxeda_impl;
172
173 return smmu;
174}
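The hooks installed above (read_reg/write_reg, cfg_probe, init_context, reset) only take effect because the core arm-smmu driver now funnels register accesses and several probe/reset steps through smmu->impl. A plausible sketch of the accessor indirection, assuming the arm_smmu_readl()/arm_smmu_writel() helper names used by the new arm-smmu.h in this series:

/* Sketch of the expected dispatch in arm-smmu.h; not a verbatim copy. */
static inline u32 arm_smmu_readl(struct arm_smmu_device *smmu, int page,
				 int offset)
{
	if (smmu->impl && unlikely(smmu->impl->read_reg))
		return smmu->impl->read_reg(smmu, page, offset);
	return readl_relaxed(arm_smmu_page(smmu, page) + offset);
}

static inline void arm_smmu_writel(struct arm_smmu_device *smmu, int page,
				   int offset, u32 val)
{
	if (smmu->impl && unlikely(smmu->impl->write_reg))
		smmu->impl->write_reg(smmu, page, offset, val);
	else
		writel_relaxed(val, arm_smmu_page(smmu, page) + offset);
}

#define arm_smmu_gr0_read(s, o)		arm_smmu_readl((s), ARM_SMMU_GR0, (o))
#define arm_smmu_gr0_write(s, o, v)	arm_smmu_writel((s), ARM_SMMU_GR0, (o), (v))

With that indirection in place, the Calxeda secure-register offset fixup above needs nothing more than alternative read_reg/write_reg callbacks.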
diff --git a/drivers/iommu/arm-smmu-regs.h b/drivers/iommu/arm-smmu-regs.h
deleted file mode 100644
index 1c278f7ae888..000000000000
--- a/drivers/iommu/arm-smmu-regs.h
+++ /dev/null
@@ -1,210 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * IOMMU API for ARM architected SMMU implementations.
4 *
5 * Copyright (C) 2013 ARM Limited
6 *
7 * Author: Will Deacon <will.deacon@arm.com>
8 */
9
10#ifndef _ARM_SMMU_REGS_H
11#define _ARM_SMMU_REGS_H
12
13/* Configuration registers */
14#define ARM_SMMU_GR0_sCR0 0x0
15#define sCR0_CLIENTPD (1 << 0)
16#define sCR0_GFRE (1 << 1)
17#define sCR0_GFIE (1 << 2)
18#define sCR0_EXIDENABLE (1 << 3)
19#define sCR0_GCFGFRE (1 << 4)
20#define sCR0_GCFGFIE (1 << 5)
21#define sCR0_USFCFG (1 << 10)
22#define sCR0_VMIDPNE (1 << 11)
23#define sCR0_PTM (1 << 12)
24#define sCR0_FB (1 << 13)
25#define sCR0_VMID16EN (1 << 31)
26#define sCR0_BSU_SHIFT 14
27#define sCR0_BSU_MASK 0x3
28
29/* Auxiliary Configuration register */
30#define ARM_SMMU_GR0_sACR 0x10
31
32/* Identification registers */
33#define ARM_SMMU_GR0_ID0 0x20
34#define ARM_SMMU_GR0_ID1 0x24
35#define ARM_SMMU_GR0_ID2 0x28
36#define ARM_SMMU_GR0_ID3 0x2c
37#define ARM_SMMU_GR0_ID4 0x30
38#define ARM_SMMU_GR0_ID5 0x34
39#define ARM_SMMU_GR0_ID6 0x38
40#define ARM_SMMU_GR0_ID7 0x3c
41#define ARM_SMMU_GR0_sGFSR 0x48
42#define ARM_SMMU_GR0_sGFSYNR0 0x50
43#define ARM_SMMU_GR0_sGFSYNR1 0x54
44#define ARM_SMMU_GR0_sGFSYNR2 0x58
45
46#define ID0_S1TS (1 << 30)
47#define ID0_S2TS (1 << 29)
48#define ID0_NTS (1 << 28)
49#define ID0_SMS (1 << 27)
50#define ID0_ATOSNS (1 << 26)
51#define ID0_PTFS_NO_AARCH32 (1 << 25)
52#define ID0_PTFS_NO_AARCH32S (1 << 24)
53#define ID0_CTTW (1 << 14)
54#define ID0_NUMIRPT_SHIFT 16
55#define ID0_NUMIRPT_MASK 0xff
56#define ID0_NUMSIDB_SHIFT 9
57#define ID0_NUMSIDB_MASK 0xf
58#define ID0_EXIDS (1 << 8)
59#define ID0_NUMSMRG_SHIFT 0
60#define ID0_NUMSMRG_MASK 0xff
61
62#define ID1_PAGESIZE (1 << 31)
63#define ID1_NUMPAGENDXB_SHIFT 28
64#define ID1_NUMPAGENDXB_MASK 7
65#define ID1_NUMS2CB_SHIFT 16
66#define ID1_NUMS2CB_MASK 0xff
67#define ID1_NUMCB_SHIFT 0
68#define ID1_NUMCB_MASK 0xff
69
70#define ID2_OAS_SHIFT 4
71#define ID2_OAS_MASK 0xf
72#define ID2_IAS_SHIFT 0
73#define ID2_IAS_MASK 0xf
74#define ID2_UBS_SHIFT 8
75#define ID2_UBS_MASK 0xf
76#define ID2_PTFS_4K (1 << 12)
77#define ID2_PTFS_16K (1 << 13)
78#define ID2_PTFS_64K (1 << 14)
79#define ID2_VMID16 (1 << 15)
80
81#define ID7_MAJOR_SHIFT 4
82#define ID7_MAJOR_MASK 0xf
83
84/* Global TLB invalidation */
85#define ARM_SMMU_GR0_TLBIVMID 0x64
86#define ARM_SMMU_GR0_TLBIALLNSNH 0x68
87#define ARM_SMMU_GR0_TLBIALLH 0x6c
88#define ARM_SMMU_GR0_sTLBGSYNC 0x70
89#define ARM_SMMU_GR0_sTLBGSTATUS 0x74
90#define sTLBGSTATUS_GSACTIVE (1 << 0)
91
92/* Stream mapping registers */
93#define ARM_SMMU_GR0_SMR(n) (0x800 + ((n) << 2))
94#define SMR_VALID (1 << 31)
95#define SMR_MASK_SHIFT 16
96#define SMR_ID_SHIFT 0
97
98#define ARM_SMMU_GR0_S2CR(n) (0xc00 + ((n) << 2))
99#define S2CR_CBNDX_SHIFT 0
100#define S2CR_CBNDX_MASK 0xff
101#define S2CR_EXIDVALID (1 << 10)
102#define S2CR_TYPE_SHIFT 16
103#define S2CR_TYPE_MASK 0x3
104enum arm_smmu_s2cr_type {
105 S2CR_TYPE_TRANS,
106 S2CR_TYPE_BYPASS,
107 S2CR_TYPE_FAULT,
108};
109
110#define S2CR_PRIVCFG_SHIFT 24
111#define S2CR_PRIVCFG_MASK 0x3
112enum arm_smmu_s2cr_privcfg {
113 S2CR_PRIVCFG_DEFAULT,
114 S2CR_PRIVCFG_DIPAN,
115 S2CR_PRIVCFG_UNPRIV,
116 S2CR_PRIVCFG_PRIV,
117};
118
119/* Context bank attribute registers */
120#define ARM_SMMU_GR1_CBAR(n) (0x0 + ((n) << 2))
121#define CBAR_VMID_SHIFT 0
122#define CBAR_VMID_MASK 0xff
123#define CBAR_S1_BPSHCFG_SHIFT 8
124#define CBAR_S1_BPSHCFG_MASK 3
125#define CBAR_S1_BPSHCFG_NSH 3
126#define CBAR_S1_MEMATTR_SHIFT 12
127#define CBAR_S1_MEMATTR_MASK 0xf
128#define CBAR_S1_MEMATTR_WB 0xf
129#define CBAR_TYPE_SHIFT 16
130#define CBAR_TYPE_MASK 0x3
131#define CBAR_TYPE_S2_TRANS (0 << CBAR_TYPE_SHIFT)
132#define CBAR_TYPE_S1_TRANS_S2_BYPASS (1 << CBAR_TYPE_SHIFT)
133#define CBAR_TYPE_S1_TRANS_S2_FAULT (2 << CBAR_TYPE_SHIFT)
134#define CBAR_TYPE_S1_TRANS_S2_TRANS (3 << CBAR_TYPE_SHIFT)
135#define CBAR_IRPTNDX_SHIFT 24
136#define CBAR_IRPTNDX_MASK 0xff
137
138#define ARM_SMMU_GR1_CBFRSYNRA(n) (0x400 + ((n) << 2))
139
140#define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2))
141#define CBA2R_RW64_32BIT (0 << 0)
142#define CBA2R_RW64_64BIT (1 << 0)
143#define CBA2R_VMID_SHIFT 16
144#define CBA2R_VMID_MASK 0xffff
145
146#define ARM_SMMU_CB_SCTLR 0x0
147#define ARM_SMMU_CB_ACTLR 0x4
148#define ARM_SMMU_CB_RESUME 0x8
149#define ARM_SMMU_CB_TTBCR2 0x10
150#define ARM_SMMU_CB_TTBR0 0x20
151#define ARM_SMMU_CB_TTBR1 0x28
152#define ARM_SMMU_CB_TTBCR 0x30
153#define ARM_SMMU_CB_CONTEXTIDR 0x34
154#define ARM_SMMU_CB_S1_MAIR0 0x38
155#define ARM_SMMU_CB_S1_MAIR1 0x3c
156#define ARM_SMMU_CB_PAR 0x50
157#define ARM_SMMU_CB_FSR 0x58
158#define ARM_SMMU_CB_FAR 0x60
159#define ARM_SMMU_CB_FSYNR0 0x68
160#define ARM_SMMU_CB_S1_TLBIVA 0x600
161#define ARM_SMMU_CB_S1_TLBIASID 0x610
162#define ARM_SMMU_CB_S1_TLBIVAL 0x620
163#define ARM_SMMU_CB_S2_TLBIIPAS2 0x630
164#define ARM_SMMU_CB_S2_TLBIIPAS2L 0x638
165#define ARM_SMMU_CB_TLBSYNC 0x7f0
166#define ARM_SMMU_CB_TLBSTATUS 0x7f4
167#define ARM_SMMU_CB_ATS1PR 0x800
168#define ARM_SMMU_CB_ATSR 0x8f0
169
170#define SCTLR_S1_ASIDPNE (1 << 12)
171#define SCTLR_CFCFG (1 << 7)
172#define SCTLR_CFIE (1 << 6)
173#define SCTLR_CFRE (1 << 5)
174#define SCTLR_E (1 << 4)
175#define SCTLR_AFE (1 << 2)
176#define SCTLR_TRE (1 << 1)
177#define SCTLR_M (1 << 0)
178
179#define CB_PAR_F (1 << 0)
180
181#define ATSR_ACTIVE (1 << 0)
182
183#define RESUME_RETRY (0 << 0)
184#define RESUME_TERMINATE (1 << 0)
185
186#define TTBCR2_SEP_SHIFT 15
187#define TTBCR2_SEP_UPSTREAM (0x7 << TTBCR2_SEP_SHIFT)
188#define TTBCR2_AS (1 << 4)
189
190#define TTBRn_ASID_SHIFT 48
191
192#define FSR_MULTI (1 << 31)
193#define FSR_SS (1 << 30)
194#define FSR_UUT (1 << 8)
195#define FSR_ASF (1 << 7)
196#define FSR_TLBLKF (1 << 6)
197#define FSR_TLBMCF (1 << 5)
198#define FSR_EF (1 << 4)
199#define FSR_PF (1 << 3)
200#define FSR_AFF (1 << 2)
201#define FSR_TF (1 << 1)
202
203#define FSR_IGN (FSR_AFF | FSR_ASF | \
204 FSR_TLBMCF | FSR_TLBLKF)
205#define FSR_FAULT (FSR_MULTI | FSR_SS | FSR_UUT | \
206 FSR_EF | FSR_PF | FSR_TF | FSR_IGN)
207
208#define FSYNR0_WNR (1 << 4)
209
210#endif /* _ARM_SMMU_REGS_H */
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index c5c93e48b4db..29b773e186c8 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -181,12 +181,13 @@
181#define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1 181#define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1
182#define ARM_SMMU_MEMATTR_OIWB 0xf 182#define ARM_SMMU_MEMATTR_OIWB 0xf
183 183
184#define Q_IDX(q, p) ((p) & ((1 << (q)->max_n_shift) - 1)) 184#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
185#define Q_WRP(q, p) ((p) & (1 << (q)->max_n_shift)) 185#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
186#define Q_OVERFLOW_FLAG (1 << 31) 186#define Q_OVERFLOW_FLAG (1U << 31)
187#define Q_OVF(q, p) ((p) & Q_OVERFLOW_FLAG) 187#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
188#define Q_ENT(q, p) ((q)->base + \ 188#define Q_ENT(q, p) ((q)->base + \
189 Q_IDX(q, p) * (q)->ent_dwords) 189 Q_IDX(&((q)->llq), p) * \
190 (q)->ent_dwords)
190 191
191#define Q_BASE_RWA (1UL << 62) 192#define Q_BASE_RWA (1UL << 62)
192#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5) 193#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5)
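The reworked Q_IDX/Q_WRP/Q_OVF macros above pack a slot index, a wrap bit and an overflow bit into a single prod or cons word. A small illustrative check, using an invented 8-entry queue (max_n_shift == 3) purely for the arithmetic:

/* Illustrative only: encoding check for a hypothetical 8-entry queue. */
static void q_encoding_example(void)
{
	struct arm_smmu_ll_queue llq = { .max_n_shift = 3 };
	u32 prod = 0x0b;			/* index 3, wrap bit set */

	WARN_ON(Q_IDX(&llq, prod) != 3);	/* 0xb & ((1 << 3) - 1) */
	WARN_ON(Q_WRP(&llq, prod) != 8);	/* 0xb & (1 << 3)       */
	WARN_ON(Q_OVF(prod) != 0);		/* bit 31 clear         */

	/*
	 * Empty: indices equal and wrap bits equal.
	 * Full:  indices equal but wrap bits differ.
	 */
}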
@@ -306,6 +307,15 @@
306#define CMDQ_ERR_CERROR_ABT_IDX 2 307#define CMDQ_ERR_CERROR_ABT_IDX 2
307#define CMDQ_ERR_CERROR_ATC_INV_IDX 3 308#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
308 309
310#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG
311
312/*
313 * This is used to size the command queue and therefore must be at least
314 * BITS_PER_LONG so that the valid_map works correctly (it relies on the
315 * total number of queue entries being a multiple of BITS_PER_LONG).
316 */
317#define CMDQ_BATCH_ENTRIES BITS_PER_LONG
318
309#define CMDQ_0_OP GENMASK_ULL(7, 0) 319#define CMDQ_0_OP GENMASK_ULL(7, 0)
310#define CMDQ_0_SSV (1UL << 11) 320#define CMDQ_0_SSV (1UL << 11)
311 321
@@ -368,9 +378,8 @@
368#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12) 378#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)
369 379
370/* High-level queue structures */ 380/* High-level queue structures */
371#define ARM_SMMU_POLL_TIMEOUT_US 100 381#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
372#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */ 382#define ARM_SMMU_POLL_SPIN_COUNT 10
373#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
374 383
375#define MSI_IOVA_BASE 0x8000000 384#define MSI_IOVA_BASE 0x8000000
376#define MSI_IOVA_LENGTH 0x100000 385#define MSI_IOVA_LENGTH 0x100000
@@ -472,13 +481,29 @@ struct arm_smmu_cmdq_ent {
472 481
473 #define CMDQ_OP_CMD_SYNC 0x46 482 #define CMDQ_OP_CMD_SYNC 0x46
474 struct { 483 struct {
475 u32 msidata;
476 u64 msiaddr; 484 u64 msiaddr;
477 } sync; 485 } sync;
478 }; 486 };
479}; 487};
480 488
489struct arm_smmu_ll_queue {
490 union {
491 u64 val;
492 struct {
493 u32 prod;
494 u32 cons;
495 };
496 struct {
497 atomic_t prod;
498 atomic_t cons;
499 } atomic;
500 u8 __pad[SMP_CACHE_BYTES];
501 } ____cacheline_aligned_in_smp;
502 u32 max_n_shift;
503};
504
481struct arm_smmu_queue { 505struct arm_smmu_queue {
506 struct arm_smmu_ll_queue llq;
482 int irq; /* Wired interrupt */ 507 int irq; /* Wired interrupt */
483 508
484 __le64 *base; 509 __le64 *base;
@@ -486,17 +511,23 @@ struct arm_smmu_queue {
486 u64 q_base; 511 u64 q_base;
487 512
488 size_t ent_dwords; 513 size_t ent_dwords;
489 u32 max_n_shift;
490 u32 prod;
491 u32 cons;
492 514
493 u32 __iomem *prod_reg; 515 u32 __iomem *prod_reg;
494 u32 __iomem *cons_reg; 516 u32 __iomem *cons_reg;
495}; 517};
496 518
519struct arm_smmu_queue_poll {
520 ktime_t timeout;
521 unsigned int delay;
522 unsigned int spin_cnt;
523 bool wfe;
524};
525
497struct arm_smmu_cmdq { 526struct arm_smmu_cmdq {
498 struct arm_smmu_queue q; 527 struct arm_smmu_queue q;
499 spinlock_t lock; 528 atomic_long_t *valid_map;
529 atomic_t owner_prod;
530 atomic_t lock;
500}; 531};
501 532
502struct arm_smmu_evtq { 533struct arm_smmu_evtq {
@@ -576,8 +607,6 @@ struct arm_smmu_device {
576 607
577 int gerr_irq; 608 int gerr_irq;
578 int combined_irq; 609 int combined_irq;
579 u32 sync_nr;
580 u8 prev_cmd_opcode;
581 610
582 unsigned long ias; /* IPA */ 611 unsigned long ias; /* IPA */
583 unsigned long oas; /* PA */ 612 unsigned long oas; /* PA */
@@ -596,12 +625,6 @@ struct arm_smmu_device {
596 625
597 struct arm_smmu_strtab_cfg strtab_cfg; 626 struct arm_smmu_strtab_cfg strtab_cfg;
598 627
599 /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
600 union {
601 u32 sync_count;
602 u64 padding;
603 };
604
605 /* IOMMU core code handle */ 628 /* IOMMU core code handle */
606 struct iommu_device iommu; 629 struct iommu_device iommu;
607}; 630};
@@ -614,7 +637,7 @@ struct arm_smmu_master {
614 struct list_head domain_head; 637 struct list_head domain_head;
615 u32 *sids; 638 u32 *sids;
616 unsigned int num_sids; 639 unsigned int num_sids;
617 bool ats_enabled :1; 640 bool ats_enabled;
618}; 641};
619 642
620/* SMMU private data for an IOMMU domain */ 643/* SMMU private data for an IOMMU domain */
@@ -631,6 +654,7 @@ struct arm_smmu_domain {
631 654
632 struct io_pgtable_ops *pgtbl_ops; 655 struct io_pgtable_ops *pgtbl_ops;
633 bool non_strict; 656 bool non_strict;
657 atomic_t nr_ats_masters;
634 658
635 enum arm_smmu_domain_stage stage; 659 enum arm_smmu_domain_stage stage;
636 union { 660 union {
@@ -685,85 +709,97 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
685} 709}
686 710
687/* Low-level queue manipulation functions */ 711/* Low-level queue manipulation functions */
688static bool queue_full(struct arm_smmu_queue *q) 712static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
713{
714 u32 space, prod, cons;
715
716 prod = Q_IDX(q, q->prod);
717 cons = Q_IDX(q, q->cons);
718
719 if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
720 space = (1 << q->max_n_shift) - (prod - cons);
721 else
722 space = cons - prod;
723
724 return space >= n;
725}
726
727static bool queue_full(struct arm_smmu_ll_queue *q)
689{ 728{
690 return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && 729 return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
691 Q_WRP(q, q->prod) != Q_WRP(q, q->cons); 730 Q_WRP(q, q->prod) != Q_WRP(q, q->cons);
692} 731}
693 732
694static bool queue_empty(struct arm_smmu_queue *q) 733static bool queue_empty(struct arm_smmu_ll_queue *q)
695{ 734{
696 return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && 735 return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
697 Q_WRP(q, q->prod) == Q_WRP(q, q->cons); 736 Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
698} 737}
699 738
700static void queue_sync_cons(struct arm_smmu_queue *q) 739static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
701{ 740{
702 q->cons = readl_relaxed(q->cons_reg); 741 return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
742 (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
743 ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
744 (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
703} 745}
704 746
705static void queue_inc_cons(struct arm_smmu_queue *q) 747static void queue_sync_cons_out(struct arm_smmu_queue *q)
706{ 748{
707 u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
708
709 q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
710
711 /* 749 /*
712 * Ensure that all CPU accesses (reads and writes) to the queue 750 * Ensure that all CPU accesses (reads and writes) to the queue
713 * are complete before we update the cons pointer. 751 * are complete before we update the cons pointer.
714 */ 752 */
715 mb(); 753 mb();
716 writel_relaxed(q->cons, q->cons_reg); 754 writel_relaxed(q->llq.cons, q->cons_reg);
755}
756
757static void queue_inc_cons(struct arm_smmu_ll_queue *q)
758{
759 u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
760 q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
717} 761}
718 762
719static int queue_sync_prod(struct arm_smmu_queue *q) 763static int queue_sync_prod_in(struct arm_smmu_queue *q)
720{ 764{
721 int ret = 0; 765 int ret = 0;
722 u32 prod = readl_relaxed(q->prod_reg); 766 u32 prod = readl_relaxed(q->prod_reg);
723 767
724 if (Q_OVF(q, prod) != Q_OVF(q, q->prod)) 768 if (Q_OVF(prod) != Q_OVF(q->llq.prod))
725 ret = -EOVERFLOW; 769 ret = -EOVERFLOW;
726 770
727 q->prod = prod; 771 q->llq.prod = prod;
728 return ret; 772 return ret;
729} 773}
730 774
731static void queue_inc_prod(struct arm_smmu_queue *q) 775static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
732{ 776{
733 u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1; 777 u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
734 778 return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
735 q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
736 writel(q->prod, q->prod_reg);
737} 779}
738 780
739/* 781static void queue_poll_init(struct arm_smmu_device *smmu,
740 * Wait for the SMMU to consume items. If sync is true, wait until the queue 782 struct arm_smmu_queue_poll *qp)
741 * is empty. Otherwise, wait until there is at least one free slot.
742 */
743static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
744{ 783{
745 ktime_t timeout; 784 qp->delay = 1;
746 unsigned int delay = 1, spin_cnt = 0; 785 qp->spin_cnt = 0;
747 786 qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
748 /* Wait longer if it's a CMD_SYNC */ 787 qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
749 timeout = ktime_add_us(ktime_get(), sync ? 788}
750 ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
751 ARM_SMMU_POLL_TIMEOUT_US);
752 789
753 while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) { 790static int queue_poll(struct arm_smmu_queue_poll *qp)
754 if (ktime_compare(ktime_get(), timeout) > 0) 791{
755 return -ETIMEDOUT; 792 if (ktime_compare(ktime_get(), qp->timeout) > 0)
793 return -ETIMEDOUT;
756 794
757 if (wfe) { 795 if (qp->wfe) {
758 wfe(); 796 wfe();
759 } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) { 797 } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
760 cpu_relax(); 798 cpu_relax();
761 continue; 799 } else {
762 } else { 800 udelay(qp->delay);
763 udelay(delay); 801 qp->delay *= 2;
764 delay *= 2; 802 qp->spin_cnt = 0;
765 spin_cnt = 0;
766 }
767 } 803 }
768 804
769 return 0; 805 return 0;
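queue_has_space() above derives the number of free slots from the wrap-adjusted indices. Continuing the hypothetical 8-entry queue from earlier, a short worked example:

/* Illustrative only: free-space calculation for an 8-entry queue. */
static void q_space_example(void)
{
	struct arm_smmu_ll_queue q = { .max_n_shift = 3 };

	/* Same wrap bit: prod index 6, cons index 2 -> 8 - (6 - 2) = 4 free. */
	q.prod = 6;
	q.cons = 2;
	WARN_ON(!queue_has_space(&q, 4));
	WARN_ON(queue_has_space(&q, 5));

	/* Producer has wrapped: prod index 1 (wrap set), cons index 5. */
	q.prod = (1 << 3) | 1;
	q.cons = 5;
	WARN_ON(!queue_has_space(&q, 4));	/* cons - prod = 5 - 1 = 4 */
	WARN_ON(queue_has_space(&q, 5));
}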
@@ -777,16 +813,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
777 *dst++ = cpu_to_le64(*src++); 813 *dst++ = cpu_to_le64(*src++);
778} 814}
779 815
780static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
781{
782 if (queue_full(q))
783 return -ENOSPC;
784
785 queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
786 queue_inc_prod(q);
787 return 0;
788}
789
790static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) 816static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
791{ 817{
792 int i; 818 int i;
@@ -797,11 +823,12 @@ static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
797 823
798static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) 824static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
799{ 825{
800 if (queue_empty(q)) 826 if (queue_empty(&q->llq))
801 return -EAGAIN; 827 return -EAGAIN;
802 828
803 queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords); 829 queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
804 queue_inc_cons(q); 830 queue_inc_cons(&q->llq);
831 queue_sync_cons_out(q);
805 return 0; 832 return 0;
806} 833}
807 834
@@ -868,20 +895,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
868 cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp); 895 cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
869 break; 896 break;
870 case CMDQ_OP_CMD_SYNC: 897 case CMDQ_OP_CMD_SYNC:
871 if (ent->sync.msiaddr) 898 if (ent->sync.msiaddr) {
872 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ); 899 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
873 else 900 cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
901 } else {
874 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV); 902 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
903 }
875 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH); 904 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
876 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB); 905 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
877 /*
878 * Commands are written little-endian, but we want the SMMU to
879 * receive MSIData, and thus write it back to memory, in CPU
880 * byte order, so big-endian needs an extra byteswap here.
881 */
882 cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
883 cpu_to_le32(ent->sync.msidata));
884 cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
885 break; 906 break;
886 default: 907 default:
887 return -ENOENT; 908 return -ENOENT;
@@ -890,6 +911,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
890 return 0; 911 return 0;
891} 912}
892 913
914static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
915 u32 prod)
916{
917 struct arm_smmu_queue *q = &smmu->cmdq.q;
918 struct arm_smmu_cmdq_ent ent = {
919 .opcode = CMDQ_OP_CMD_SYNC,
920 };
921
922 /*
923 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
924 * payload, so the write will zero the entire command on that platform.
925 */
926 if (smmu->features & ARM_SMMU_FEAT_MSI &&
927 smmu->features & ARM_SMMU_FEAT_COHERENCY) {
928 ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
929 q->ent_dwords * 8;
930 }
931
932 arm_smmu_cmdq_build_cmd(cmd, &ent);
933}
934
893static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) 935static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
894{ 936{
895 static const char *cerror_str[] = { 937 static const char *cerror_str[] = {
@@ -948,109 +990,456 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
948 queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); 990 queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
949} 991}
950 992
951static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) 993/*
994 * Command queue locking.
995 * This is a form of bastardised rwlock with the following major changes:
996 *
997 * - The only LOCK routines are exclusive_trylock() and shared_lock().
998 * Neither have barrier semantics, and instead provide only a control
999 * dependency.
1000 *
1001 * - The UNLOCK routines are supplemented with shared_tryunlock(), which
1002 * fails if the caller appears to be the last lock holder (yes, this is
1003 * racy). All successful UNLOCK routines have RELEASE semantics.
1004 */
1005static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
952{ 1006{
953 struct arm_smmu_queue *q = &smmu->cmdq.q; 1007 int val;
954 bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); 1008
1009 /*
1010 * We can try to avoid the cmpxchg() loop by simply incrementing the
1011 * lock counter. When held in exclusive state, the lock counter is set
1012 * to INT_MIN so these increments won't hurt as the value will remain
1013 * negative.
1014 */
1015 if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
1016 return;
1017
1018 do {
1019 val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
1020 } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
1021}
1022
1023static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
1024{
1025 (void)atomic_dec_return_release(&cmdq->lock);
1026}
1027
1028static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
1029{
1030 if (atomic_read(&cmdq->lock) == 1)
1031 return false;
1032
1033 arm_smmu_cmdq_shared_unlock(cmdq);
1034 return true;
1035}
1036
1037#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \
1038({ \
1039 bool __ret; \
1040 local_irq_save(flags); \
1041 __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \
1042 if (!__ret) \
1043 local_irq_restore(flags); \
1044 __ret; \
1045})
1046
1047#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
1048({ \
1049 atomic_set_release(&cmdq->lock, 0); \
1050 local_irq_restore(flags); \
1051})
1052
1053
1054/*
1055 * Command queue insertion.
1056 * This is made fiddly by our attempts to achieve some sort of scalability
1057 * since there is one queue shared amongst all of the CPUs in the system. If
1058 * you like mixed-size concurrency, dependency ordering and relaxed atomics,
1059 * then you'll *love* this monstrosity.
1060 *
1061 * The basic idea is to split the queue up into ranges of commands that are
1062 * owned by a given CPU; the owner may not have written all of the commands
1063 * itself, but is responsible for advancing the hardware prod pointer when
1064 * the time comes. The algorithm is roughly:
1065 *
1066 * 1. Allocate some space in the queue. At this point we also discover
1067 * whether the head of the queue is currently owned by another CPU,
1068 * or whether we are the owner.
1069 *
1070 * 2. Write our commands into our allocated slots in the queue.
1071 *
1072 * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
1073 *
1074 * 4. If we are an owner:
1075 * a. Wait for the previous owner to finish.
1076 * b. Mark the queue head as unowned, which tells us the range
1077 * that we are responsible for publishing.
1078 * c. Wait for all commands in our owned range to become valid.
1079 * d. Advance the hardware prod pointer.
1080 * e. Tell the next owner we've finished.
1081 *
1082 * 5. If we are inserting a CMD_SYNC (we may or may not have been an
1083 * owner), then we need to stick around until it has completed:
1084 * a. If we have MSIs, the SMMU can write back into the CMD_SYNC
1085 * to clear the first 4 bytes.
1086 * b. Otherwise, we spin waiting for the hardware cons pointer to
1087 * advance past our command.
1088 *
1089 * The devil is in the details, particularly the use of locking for handling
1090 * SYNC completion and freeing up space in the queue before we think that it is
1091 * full.
1092 */
1093static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
1094 u32 sprod, u32 eprod, bool set)
1095{
1096 u32 swidx, sbidx, ewidx, ebidx;
1097 struct arm_smmu_ll_queue llq = {
1098 .max_n_shift = cmdq->q.llq.max_n_shift,
1099 .prod = sprod,
1100 };
1101
1102 ewidx = BIT_WORD(Q_IDX(&llq, eprod));
1103 ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
955 1104
956 smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]); 1105 while (llq.prod != eprod) {
1106 unsigned long mask;
1107 atomic_long_t *ptr;
1108 u32 limit = BITS_PER_LONG;
957 1109
958 while (queue_insert_raw(q, cmd) == -ENOSPC) { 1110 swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
959 if (queue_poll_cons(q, false, wfe)) 1111 sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
960 dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); 1112
1113 ptr = &cmdq->valid_map[swidx];
1114
1115 if ((swidx == ewidx) && (sbidx < ebidx))
1116 limit = ebidx;
1117
1118 mask = GENMASK(limit - 1, sbidx);
1119
1120 /*
1121 * The valid bit is the inverse of the wrap bit. This means
1122 * that a zero-initialised queue is invalid and, after marking
1123 * all entries as valid, they become invalid again when we
1124 * wrap.
1125 */
1126 if (set) {
1127 atomic_long_xor(mask, ptr);
1128 } else { /* Poll */
1129 unsigned long valid;
1130
1131 valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
1132 atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
1133 }
1134
1135 llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
961 } 1136 }
962} 1137}
963 1138
964static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, 1139/* Mark all entries in the range [sprod, eprod) as valid */
965 struct arm_smmu_cmdq_ent *ent) 1140static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
1141 u32 sprod, u32 eprod)
1142{
1143 __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
1144}
1145
1146/* Wait for all entries in the range [sprod, eprod) to become valid */
1147static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
1148 u32 sprod, u32 eprod)
1149{
1150 __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
1151}
1152
1153/* Wait for the command queue to become non-full */
1154static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
1155 struct arm_smmu_ll_queue *llq)
966{ 1156{
967 u64 cmd[CMDQ_ENT_DWORDS];
968 unsigned long flags; 1157 unsigned long flags;
1158 struct arm_smmu_queue_poll qp;
1159 struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
1160 int ret = 0;
969 1161
970 if (arm_smmu_cmdq_build_cmd(cmd, ent)) { 1162 /*
971 dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", 1163 * Try to update our copy of cons by grabbing exclusive cmdq access. If
972 ent->opcode); 1164 * that fails, spin until somebody else updates it for us.
973 return; 1165 */
1166 if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
1167 WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
1168 arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
1169 llq->val = READ_ONCE(cmdq->q.llq.val);
1170 return 0;
974 } 1171 }
975 1172
976 spin_lock_irqsave(&smmu->cmdq.lock, flags); 1173 queue_poll_init(smmu, &qp);
977 arm_smmu_cmdq_insert_cmd(smmu, cmd); 1174 do {
978 spin_unlock_irqrestore(&smmu->cmdq.lock, flags); 1175 llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
1176 if (!queue_full(llq))
1177 break;
1178
1179 ret = queue_poll(&qp);
1180 } while (!ret);
1181
1182 return ret;
979} 1183}
980 1184
981/* 1185/*
982 * The difference between val and sync_idx is bounded by the maximum size of 1186 * Wait until the SMMU signals a CMD_SYNC completion MSI.
983 * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic. 1187 * Must be called with the cmdq lock held in some capacity.
984 */ 1188 */
985static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx) 1189static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
1190 struct arm_smmu_ll_queue *llq)
986{ 1191{
987 ktime_t timeout; 1192 int ret = 0;
988 u32 val; 1193 struct arm_smmu_queue_poll qp;
1194 struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
1195 u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
989 1196
990 timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US); 1197 queue_poll_init(smmu, &qp);
991 val = smp_cond_load_acquire(&smmu->sync_count,
992 (int)(VAL - sync_idx) >= 0 ||
993 !ktime_before(ktime_get(), timeout));
994 1198
995 return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; 1199 /*
1200 * The MSI won't generate an event, since it's being written back
1201 * into the command queue.
1202 */
1203 qp.wfe = false;
1204 smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
1205 llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
1206 return ret;
996} 1207}
997 1208
998static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu) 1209/*
1210 * Wait until the SMMU cons index passes llq->prod.
1211 * Must be called with the cmdq lock held in some capacity.
1212 */
1213static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
1214 struct arm_smmu_ll_queue *llq)
999{ 1215{
1000 u64 cmd[CMDQ_ENT_DWORDS]; 1216 struct arm_smmu_queue_poll qp;
1001 unsigned long flags; 1217 struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
1002 struct arm_smmu_cmdq_ent ent = { 1218 u32 prod = llq->prod;
1003 .opcode = CMDQ_OP_CMD_SYNC, 1219 int ret = 0;
1004 .sync = {
1005 .msiaddr = virt_to_phys(&smmu->sync_count),
1006 },
1007 };
1008 1220
1009 spin_lock_irqsave(&smmu->cmdq.lock, flags); 1221 queue_poll_init(smmu, &qp);
1222 llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
1223 do {
1224 if (queue_consumed(llq, prod))
1225 break;
1010 1226
1011 /* Piggy-back on the previous command if it's a SYNC */ 1227 ret = queue_poll(&qp);
1012 if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
1013 ent.sync.msidata = smmu->sync_nr;
1014 } else {
1015 ent.sync.msidata = ++smmu->sync_nr;
1016 arm_smmu_cmdq_build_cmd(cmd, &ent);
1017 arm_smmu_cmdq_insert_cmd(smmu, cmd);
1018 }
1019 1228
1020 spin_unlock_irqrestore(&smmu->cmdq.lock, flags); 1229 /*
1230 * This needs to be a readl() so that our subsequent call
1231 * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
1232 *
1233 * Specifically, we need to ensure that we observe all
1234 * shared_lock()s by other CMD_SYNCs that share our owner,
1235 * so that a failing call to tryunlock() means that we're
1236 * the last one out and therefore we can safely advance
1237 * cmdq->q.llq.cons. Roughly speaking:
1238 *
1239 * CPU 0 CPU1 CPU2 (us)
1240 *
1241 * if (sync)
1242 * shared_lock();
1243 *
1244 * dma_wmb();
1245 * set_valid_map();
1246 *
1247 * if (owner) {
1248 * poll_valid_map();
1249 * <control dependency>
1250 * writel(prod_reg);
1251 *
1252 * readl(cons_reg);
1253 * tryunlock();
1254 *
1255 * Requires us to see CPU 0's shared_lock() acquisition.
1256 */
1257 llq->cons = readl(cmdq->q.cons_reg);
1258 } while (!ret);
1021 1259
1022 return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); 1260 return ret;
1023} 1261}
1024 1262
1025static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) 1263static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
1264 struct arm_smmu_ll_queue *llq)
1026{ 1265{
1027 u64 cmd[CMDQ_ENT_DWORDS]; 1266 if (smmu->features & ARM_SMMU_FEAT_MSI &&
1267 smmu->features & ARM_SMMU_FEAT_COHERENCY)
1268 return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
1269
1270 return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
1271}
1272
1273static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
1274 u32 prod, int n)
1275{
1276 int i;
1277 struct arm_smmu_ll_queue llq = {
1278 .max_n_shift = cmdq->q.llq.max_n_shift,
1279 .prod = prod,
1280 };
1281
1282 for (i = 0; i < n; ++i) {
1283 u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
1284
1285 prod = queue_inc_prod_n(&llq, i);
1286 queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
1287 }
1288}
1289
1290/*
1291 * This is the actual insertion function, and provides the following
1292 * ordering guarantees to callers:
1293 *
1294 * - There is a dma_wmb() before publishing any commands to the queue.
1295 * This can be relied upon to order prior writes to data structures
1296 * in memory (such as a CD or an STE) before the command.
1297 *
1298 * - On completion of a CMD_SYNC, there is a control dependency.
1299 * This can be relied upon to order subsequent writes to memory (e.g.
1300 * freeing an IOVA) after completion of the CMD_SYNC.
1301 *
1302 * - Command insertion is totally ordered, so if two CPUs each race to
1303 * insert their own list of commands then all of the commands from one
1304 * CPU will appear before any of the commands from the other CPU.
1305 */
1306static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
1307 u64 *cmds, int n, bool sync)
1308{
1309 u64 cmd_sync[CMDQ_ENT_DWORDS];
1310 u32 prod;
1028 unsigned long flags; 1311 unsigned long flags;
1029 bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); 1312 bool owner;
1030 struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; 1313 struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
1031 int ret; 1314 struct arm_smmu_ll_queue llq = {
1315 .max_n_shift = cmdq->q.llq.max_n_shift,
1316 }, head = llq;
1317 int ret = 0;
1032 1318
1033 arm_smmu_cmdq_build_cmd(cmd, &ent); 1319 /* 1. Allocate some space in the queue */
1320 local_irq_save(flags);
1321 llq.val = READ_ONCE(cmdq->q.llq.val);
1322 do {
1323 u64 old;
1324
1325 while (!queue_has_space(&llq, n + sync)) {
1326 local_irq_restore(flags);
1327 if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
1328 dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
1329 local_irq_save(flags);
1330 }
1331
1332 head.cons = llq.cons;
1333 head.prod = queue_inc_prod_n(&llq, n + sync) |
1334 CMDQ_PROD_OWNED_FLAG;
1335
1336 old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
1337 if (old == llq.val)
1338 break;
1339
1340 llq.val = old;
1341 } while (1);
1342 owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
1343 head.prod &= ~CMDQ_PROD_OWNED_FLAG;
1344 llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
1345
1346 /*
1347 * 2. Write our commands into the queue
1348 * Dependency ordering from the cmpxchg() loop above.
1349 */
1350 arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
1351 if (sync) {
1352 prod = queue_inc_prod_n(&llq, n);
1353 arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
1354 queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
1355
1356 /*
1357 * In order to determine completion of our CMD_SYNC, we must
1358 * ensure that the queue can't wrap twice without us noticing.
1359 * We achieve that by taking the cmdq lock as shared before
1360 * marking our slot as valid.
1361 */
1362 arm_smmu_cmdq_shared_lock(cmdq);
1363 }
1364
1365 /* 3. Mark our slots as valid, ensuring commands are visible first */
1366 dma_wmb();
1367 arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
1368
1369 /* 4. If we are the owner, take control of the SMMU hardware */
1370 if (owner) {
1371 /* a. Wait for previous owner to finish */
1372 atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
1373
1374 /* b. Stop gathering work by clearing the owned flag */
1375 prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
1376 &cmdq->q.llq.atomic.prod);
1377 prod &= ~CMDQ_PROD_OWNED_FLAG;
1378
1379 /*
1380 * c. Wait for any gathered work to be written to the queue.
1381 * Note that we read our own entries so that we have the control
1382 * dependency required by (d).
1383 */
1384 arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
1385
1386 /*
1387 * d. Advance the hardware prod pointer
1388 * Control dependency ordering from the entries becoming valid.
1389 */
1390 writel_relaxed(prod, cmdq->q.prod_reg);
1391
1392 /*
1393 * e. Tell the next owner we're done
1394 * Make sure we've updated the hardware first, so that we don't
1395 * race to update prod and potentially move it backwards.
1396 */
1397 atomic_set_release(&cmdq->owner_prod, prod);
1398 }
1399
1400 /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
1401 if (sync) {
1402 llq.prod = queue_inc_prod_n(&llq, n);
1403 ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
1404 if (ret) {
1405 dev_err_ratelimited(smmu->dev,
1406 "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
1407 llq.prod,
1408 readl_relaxed(cmdq->q.prod_reg),
1409 readl_relaxed(cmdq->q.cons_reg));
1410 }
1034 1411
1035 spin_lock_irqsave(&smmu->cmdq.lock, flags); 1412 /*
1036 arm_smmu_cmdq_insert_cmd(smmu, cmd); 1413 * Try to unlock the cmq lock. This will fail if we're the last
1037 ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); 1414 * reader, in which case we can safely update cmdq->q.llq.cons
1038 spin_unlock_irqrestore(&smmu->cmdq.lock, flags); 1415 */
1416 if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
1417 WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
1418 arm_smmu_cmdq_shared_unlock(cmdq);
1419 }
1420 }
1039 1421
1422 local_irq_restore(flags);
1040 return ret; 1423 return ret;
1041} 1424}
1042 1425
1043static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) 1426static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
1427 struct arm_smmu_cmdq_ent *ent)
1044{ 1428{
1045 int ret; 1429 u64 cmd[CMDQ_ENT_DWORDS];
1046 bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
1047 (smmu->features & ARM_SMMU_FEAT_COHERENCY);
1048 1430
1049 ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu) 1431 if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
1050 : __arm_smmu_cmdq_issue_sync(smmu); 1432 dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
1051 if (ret) 1433 ent->opcode);
1052 dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); 1434 return -EINVAL;
1053 return ret; 1435 }
1436
1437 return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
1438}
1439
1440static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
1441{
1442 return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
1054} 1443}
1055 1444
1056/* Context descriptor manipulation functions */ 1445/* Context descriptor manipulation functions */
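One subtle line in __arm_smmu_cmdq_poll_set_valid_map() above is the expected-value computation (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask. Because writers XOR their bits in after publishing commands, and the valid bit is defined as the inverse of the wrap bit, the expected pattern alternates between all-ones and all-zeroes on each pass over the queue. A tiny illustrative helper, not part of the patch, that spells the trick out:

/*
 * Hypothetical helper: expected "valid" pattern for one bitmap word over
 * the polled range, given the current wrap bit. A zero-initialised map is
 * invalid; after one full wrap every bit has been flipped once, so the
 * expected pattern inverts.
 */
static unsigned long valid_pattern(bool wrap, unsigned long mask)
{
	/* wrap clear -> expect mask (all ones in range); wrap set -> expect 0 */
	return (ULONG_MAX + wrap) & mask;
}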
@@ -1305,6 +1694,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
1305 int i; 1694 int i;
1306 struct arm_smmu_device *smmu = dev; 1695 struct arm_smmu_device *smmu = dev;
1307 struct arm_smmu_queue *q = &smmu->evtq.q; 1696 struct arm_smmu_queue *q = &smmu->evtq.q;
1697 struct arm_smmu_ll_queue *llq = &q->llq;
1308 u64 evt[EVTQ_ENT_DWORDS]; 1698 u64 evt[EVTQ_ENT_DWORDS];
1309 1699
1310 do { 1700 do {
@@ -1322,12 +1712,13 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
1322 * Not much we can do on overflow, so scream and pretend we're 1712 * Not much we can do on overflow, so scream and pretend we're
1323 * trying harder. 1713 * trying harder.
1324 */ 1714 */
1325 if (queue_sync_prod(q) == -EOVERFLOW) 1715 if (queue_sync_prod_in(q) == -EOVERFLOW)
1326 dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n"); 1716 dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
1327 } while (!queue_empty(q)); 1717 } while (!queue_empty(llq));
1328 1718
1329 /* Sync our overflow flag, as we believe we're up to speed */ 1719 /* Sync our overflow flag, as we believe we're up to speed */
1330 q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); 1720 llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
1721 Q_IDX(llq, llq->cons);
1331 return IRQ_HANDLED; 1722 return IRQ_HANDLED;
1332} 1723}
1333 1724
@@ -1373,19 +1764,21 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
1373{ 1764{
1374 struct arm_smmu_device *smmu = dev; 1765 struct arm_smmu_device *smmu = dev;
1375 struct arm_smmu_queue *q = &smmu->priq.q; 1766 struct arm_smmu_queue *q = &smmu->priq.q;
1767 struct arm_smmu_ll_queue *llq = &q->llq;
1376 u64 evt[PRIQ_ENT_DWORDS]; 1768 u64 evt[PRIQ_ENT_DWORDS];
1377 1769
1378 do { 1770 do {
1379 while (!queue_remove_raw(q, evt)) 1771 while (!queue_remove_raw(q, evt))
1380 arm_smmu_handle_ppr(smmu, evt); 1772 arm_smmu_handle_ppr(smmu, evt);
1381 1773
1382 if (queue_sync_prod(q) == -EOVERFLOW) 1774 if (queue_sync_prod_in(q) == -EOVERFLOW)
1383 dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n"); 1775 dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
1384 } while (!queue_empty(q)); 1776 } while (!queue_empty(llq));
1385 1777
1386 /* Sync our overflow flag, as we believe we're up to speed */ 1778 /* Sync our overflow flag, as we believe we're up to speed */
1387 q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); 1779 llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
1388 writel(q->cons, q->cons_reg); 1780 Q_IDX(llq, llq->cons);
1781 queue_sync_cons_out(q);
1389 return IRQ_HANDLED; 1782 return IRQ_HANDLED;
1390} 1783}
1391 1784
@@ -1534,6 +1927,23 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
1534 if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) 1927 if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
1535 return 0; 1928 return 0;
1536 1929
1930 /*
1931 * Ensure that we've completed prior invalidation of the main TLBs
1932 * before we read 'nr_ats_masters' in case of a concurrent call to
1933 * arm_smmu_enable_ats():
1934 *
1935 * // unmap() // arm_smmu_enable_ats()
1936 * TLBI+SYNC atomic_inc(&nr_ats_masters);
1937 * smp_mb(); [...]
1938 * atomic_read(&nr_ats_masters); pci_enable_ats() // writel()
1939 *
1940 * Ensures that we always see the incremented 'nr_ats_masters' count if
1941 * ATS was enabled at the PCI device before completion of the TLBI.
1942 */
1943 smp_mb();
1944 if (!atomic_read(&smmu_domain->nr_ats_masters))
1945 return 0;
1946
1537 arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); 1947 arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
1538 1948
1539 spin_lock_irqsave(&smmu_domain->devices_lock, flags); 1949 spin_lock_irqsave(&smmu_domain->devices_lock, flags);
@@ -1545,13 +1955,6 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
1545} 1955}
1546 1956
1547/* IO_PGTABLE API */ 1957/* IO_PGTABLE API */
1548static void arm_smmu_tlb_sync(void *cookie)
1549{
1550 struct arm_smmu_domain *smmu_domain = cookie;
1551
1552 arm_smmu_cmdq_issue_sync(smmu_domain->smmu);
1553}
1554
1555static void arm_smmu_tlb_inv_context(void *cookie) 1958static void arm_smmu_tlb_inv_context(void *cookie)
1556{ 1959{
1557 struct arm_smmu_domain *smmu_domain = cookie; 1960 struct arm_smmu_domain *smmu_domain = cookie;
@@ -1570,25 +1973,32 @@ static void arm_smmu_tlb_inv_context(void *cookie)
1570 /* 1973 /*
1571 * NOTE: when io-pgtable is in non-strict mode, we may get here with 1974 * NOTE: when io-pgtable is in non-strict mode, we may get here with
1572 * PTEs previously cleared by unmaps on the current CPU not yet visible 1975 * PTEs previously cleared by unmaps on the current CPU not yet visible
1573 * to the SMMU. We are relying on the DSB implicit in queue_inc_prod() 1976 * to the SMMU. We are relying on the dma_wmb() implicit during cmd
1574 * to guarantee those are observed before the TLBI. Do be careful, 007. 1977 * insertion to guarantee those are observed before the TLBI. Do be
1978 * careful, 007.
1575 */ 1979 */
1576 arm_smmu_cmdq_issue_cmd(smmu, &cmd); 1980 arm_smmu_cmdq_issue_cmd(smmu, &cmd);
1577 arm_smmu_cmdq_issue_sync(smmu); 1981 arm_smmu_cmdq_issue_sync(smmu);
1982 arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
1578} 1983}
1579 1984
1580static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, 1985static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
1581 size_t granule, bool leaf, void *cookie) 1986 size_t granule, bool leaf,
1987 struct arm_smmu_domain *smmu_domain)
1582{ 1988{
1583 struct arm_smmu_domain *smmu_domain = cookie; 1989 u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
1584 struct arm_smmu_device *smmu = smmu_domain->smmu; 1990 struct arm_smmu_device *smmu = smmu_domain->smmu;
1991 unsigned long start = iova, end = iova + size;
1992 int i = 0;
1585 struct arm_smmu_cmdq_ent cmd = { 1993 struct arm_smmu_cmdq_ent cmd = {
1586 .tlbi = { 1994 .tlbi = {
1587 .leaf = leaf, 1995 .leaf = leaf,
1588 .addr = iova,
1589 }, 1996 },
1590 }; 1997 };
1591 1998
1999 if (!size)
2000 return;
2001
1592 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { 2002 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
1593 cmd.opcode = CMDQ_OP_TLBI_NH_VA; 2003 cmd.opcode = CMDQ_OP_TLBI_NH_VA;
1594 cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid; 2004 cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid;
@@ -1597,16 +2007,54 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
1597 cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; 2007 cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
1598 } 2008 }
1599 2009
1600 do { 2010 while (iova < end) {
1601 arm_smmu_cmdq_issue_cmd(smmu, &cmd); 2011 if (i == CMDQ_BATCH_ENTRIES) {
1602 cmd.tlbi.addr += granule; 2012 arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
1603 } while (size -= granule); 2013 i = 0;
2014 }
2015
2016 cmd.tlbi.addr = iova;
2017 arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
2018 iova += granule;
2019 i++;
2020 }
2021
2022 arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true);
2023
2024 /*
2025 * Unfortunately, this can't be leaf-only since we may have
2026 * zapped an entire table.
2027 */
2028 arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
2029}
2030
2031static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
2032 unsigned long iova, size_t granule,
2033 void *cookie)
2034{
2035 struct arm_smmu_domain *smmu_domain = cookie;
2036 struct iommu_domain *domain = &smmu_domain->domain;
2037
2038 iommu_iotlb_gather_add_page(domain, gather, iova, granule);
2039}
2040
2041static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
2042 size_t granule, void *cookie)
2043{
2044 arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
2045}
2046
2047static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
2048 size_t granule, void *cookie)
2049{
2050 arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
1604} 2051}
1605 2052
1606static const struct iommu_gather_ops arm_smmu_gather_ops = { 2053static const struct iommu_flush_ops arm_smmu_flush_ops = {
1607 .tlb_flush_all = arm_smmu_tlb_inv_context, 2054 .tlb_flush_all = arm_smmu_tlb_inv_context,
1608 .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, 2055 .tlb_flush_walk = arm_smmu_tlb_inv_walk,
1609 .tlb_sync = arm_smmu_tlb_sync, 2056 .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
2057 .tlb_add_page = arm_smmu_tlb_inv_page_nosync,
1610}; 2058};
1611 2059
1612/* IOMMU API */ 2060/* IOMMU API */
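
A minimal, user-space sketch of the batching pattern introduced by the new arm_smmu_tlb_inv_range() above: TLBI commands are staged into a local buffer and handed to the command queue in chunks, with only the final chunk requesting a sync. ENT_DWORDS, BATCH_ENTRIES and issue_cmdlist() are illustrative stand-ins here, not the driver's definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ENT_DWORDS    2  /* assumed command size, in 64-bit words */
#define BATCH_ENTRIES 8  /* assumed batch size */

/* Stand-in for arm_smmu_cmdq_issue_cmdlist() */
static void issue_cmdlist(uint64_t *cmds, int n, bool sync)
{
    (void)cmds;
    printf("issue %d cmd(s)%s\n", n, sync ? " + sync" : "");
}

static void inv_range(uint64_t iova, size_t size, size_t granule)
{
    uint64_t cmds[BATCH_ENTRIES * ENT_DWORDS];
    uint64_t end = iova + size;
    int i = 0;

    while (iova < end) {
        if (i == BATCH_ENTRIES) {
            issue_cmdlist(cmds, i, false); /* flush a full batch */
            i = 0;
        }
        cmds[i * ENT_DWORDS] = iova;       /* stand-in for the encoded TLBI */
        cmds[i * ENT_DWORDS + 1] = 0;
        iova += granule;
        i++;
    }
    issue_cmdlist(cmds, i, true);          /* the last batch carries the sync */
}

int main(void)
{
    inv_range(0x100000, 0x12000, 0x1000);  /* 18 pages -> three submissions */
    return 0;
}
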
@@ -1796,7 +2244,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
1796 .ias = ias, 2244 .ias = ias,
1797 .oas = oas, 2245 .oas = oas,
1798 .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, 2246 .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY,
1799 .tlb = &arm_smmu_gather_ops, 2247 .tlb = &arm_smmu_flush_ops,
1800 .iommu_dev = smmu->dev, 2248 .iommu_dev = smmu->dev,
1801 }; 2249 };
1802 2250
@@ -1863,44 +2311,58 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master)
1863 } 2311 }
1864} 2312}
1865 2313
1866static int arm_smmu_enable_ats(struct arm_smmu_master *master) 2314static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
1867{ 2315{
1868 int ret;
1869 size_t stu;
1870 struct pci_dev *pdev; 2316 struct pci_dev *pdev;
1871 struct arm_smmu_device *smmu = master->smmu; 2317 struct arm_smmu_device *smmu = master->smmu;
1872 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); 2318 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev);
1873 2319
1874 if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) || 2320 if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) ||
1875 !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled()) 2321 !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled())
1876 return -ENXIO; 2322 return false;
1877 2323
1878 pdev = to_pci_dev(master->dev); 2324 pdev = to_pci_dev(master->dev);
1879 if (pdev->untrusted) 2325 return !pdev->untrusted && pdev->ats_cap;
1880 return -EPERM; 2326}
2327
2328static void arm_smmu_enable_ats(struct arm_smmu_master *master)
2329{
2330 size_t stu;
2331 struct pci_dev *pdev;
2332 struct arm_smmu_device *smmu = master->smmu;
2333 struct arm_smmu_domain *smmu_domain = master->domain;
2334
2335 /* Don't enable ATS at the endpoint if it's not enabled in the STE */
2336 if (!master->ats_enabled)
2337 return;
1881 2338
1882 /* Smallest Translation Unit: log2 of the smallest supported granule */ 2339 /* Smallest Translation Unit: log2 of the smallest supported granule */
1883 stu = __ffs(smmu->pgsize_bitmap); 2340 stu = __ffs(smmu->pgsize_bitmap);
2341 pdev = to_pci_dev(master->dev);
1884 2342
1885 ret = pci_enable_ats(pdev, stu); 2343 atomic_inc(&smmu_domain->nr_ats_masters);
1886 if (ret) 2344 arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
1887 return ret; 2345 if (pci_enable_ats(pdev, stu))
1888 2346 dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
1889 master->ats_enabled = true;
1890 return 0;
1891} 2347}
1892 2348
1893static void arm_smmu_disable_ats(struct arm_smmu_master *master) 2349static void arm_smmu_disable_ats(struct arm_smmu_master *master)
1894{ 2350{
1895 struct arm_smmu_cmdq_ent cmd; 2351 struct arm_smmu_cmdq_ent cmd;
2352 struct arm_smmu_domain *smmu_domain = master->domain;
1896 2353
1897 if (!master->ats_enabled || !dev_is_pci(master->dev)) 2354 if (!master->ats_enabled)
1898 return; 2355 return;
1899 2356
2357 pci_disable_ats(to_pci_dev(master->dev));
2358 /*
2359 * Ensure ATS is disabled at the endpoint before we issue the
2360 * ATC invalidation via the SMMU.
2361 */
2362 wmb();
1900 arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); 2363 arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
1901 arm_smmu_atc_inv_master(master, &cmd); 2364 arm_smmu_atc_inv_master(master, &cmd);
1902 pci_disable_ats(to_pci_dev(master->dev)); 2365 atomic_dec(&smmu_domain->nr_ats_masters);
1903 master->ats_enabled = false;
1904} 2366}
1905 2367
1906static void arm_smmu_detach_dev(struct arm_smmu_master *master) 2368static void arm_smmu_detach_dev(struct arm_smmu_master *master)
@@ -1911,14 +2373,15 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master)
1911 if (!smmu_domain) 2373 if (!smmu_domain)
1912 return; 2374 return;
1913 2375
2376 arm_smmu_disable_ats(master);
2377
1914 spin_lock_irqsave(&smmu_domain->devices_lock, flags); 2378 spin_lock_irqsave(&smmu_domain->devices_lock, flags);
1915 list_del(&master->domain_head); 2379 list_del(&master->domain_head);
1916 spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); 2380 spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
1917 2381
1918 master->domain = NULL; 2382 master->domain = NULL;
2383 master->ats_enabled = false;
1919 arm_smmu_install_ste_for_dev(master); 2384 arm_smmu_install_ste_for_dev(master);
1920
1921 arm_smmu_disable_ats(master);
1922} 2385}
1923 2386
1924static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) 2387static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
@@ -1958,17 +2421,20 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
1958 2421
1959 master->domain = smmu_domain; 2422 master->domain = smmu_domain;
1960 2423
1961 spin_lock_irqsave(&smmu_domain->devices_lock, flags);
1962 list_add(&master->domain_head, &smmu_domain->devices);
1963 spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
1964
1965 if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) 2424 if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
1966 arm_smmu_enable_ats(master); 2425 master->ats_enabled = arm_smmu_ats_supported(master);
1967 2426
1968 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) 2427 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
1969 arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg); 2428 arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg);
1970 2429
1971 arm_smmu_install_ste_for_dev(master); 2430 arm_smmu_install_ste_for_dev(master);
2431
2432 spin_lock_irqsave(&smmu_domain->devices_lock, flags);
2433 list_add(&master->domain_head, &smmu_domain->devices);
2434 spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
2435
2436 arm_smmu_enable_ats(master);
2437
1972out_unlock: 2438out_unlock:
1973 mutex_unlock(&smmu_domain->init_mutex); 2439 mutex_unlock(&smmu_domain->init_mutex);
1974 return ret; 2440 return ret;
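
The attach path above sets ats_enabled and publishes the master before the endpoint is allowed to issue ATS requests; together with the nr_ats_masters counter checked at the top of arm_smmu_atc_inv_domain(), this lets ATC invalidation be skipped entirely while no master has ATS enabled. Below is a small sketch of that fast path, using C11 atomics as a rough stand-in for the kernel's atomic_t and smp_mb(); the helpers are simplified for illustration.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_ats_masters;

static void enable_ats(void)
{
    /* Count the master before the endpoint can cache translations */
    atomic_fetch_add(&nr_ats_masters, 1);
    /* ... pci_enable_ats() would follow here ... */
}

static void disable_ats(void)
{
    /* ... pci_disable_ats() and a final ATC invalidation come first ... */
    atomic_fetch_sub(&nr_ats_masters, 1);
}

static void atc_inv_domain(void)
{
    /* The kernel uses smp_mb() here; a seq_cst load is the closest analogue */
    if (!atomic_load(&nr_ats_masters)) {
        printf("no ATS masters, invalidation skipped\n");
        return;
    }
    printf("issuing ATC invalidation\n");
}

int main(void)
{
    atc_inv_domain();   /* skipped */
    enable_ats();
    atc_inv_domain();   /* issued */
    disable_ats();
    return 0;
}
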
@@ -1985,21 +2451,16 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
1985 return ops->map(ops, iova, paddr, size, prot); 2451 return ops->map(ops, iova, paddr, size, prot);
1986} 2452}
1987 2453
1988static size_t 2454static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
1989arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) 2455 size_t size, struct iommu_iotlb_gather *gather)
1990{ 2456{
1991 int ret;
1992 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); 2457 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
1993 struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; 2458 struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
1994 2459
1995 if (!ops) 2460 if (!ops)
1996 return 0; 2461 return 0;
1997 2462
1998 ret = ops->unmap(ops, iova, size); 2463 return ops->unmap(ops, iova, size, gather);
1999 if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size))
2000 return 0;
2001
2002 return ret;
2003} 2464}
2004 2465
2005static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) 2466static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
@@ -2010,12 +2471,13 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
2010 arm_smmu_tlb_inv_context(smmu_domain); 2471 arm_smmu_tlb_inv_context(smmu_domain);
2011} 2472}
2012 2473
2013static void arm_smmu_iotlb_sync(struct iommu_domain *domain) 2474static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
2475 struct iommu_iotlb_gather *gather)
2014{ 2476{
2015 struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; 2477 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
2016 2478
2017 if (smmu) 2479 arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
2018 arm_smmu_cmdq_issue_sync(smmu); 2480 gather->pgsize, true, smmu_domain);
2019} 2481}
2020 2482
2021static phys_addr_t 2483static phys_addr_t
@@ -2286,13 +2748,13 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
2286 size_t qsz; 2748 size_t qsz;
2287 2749
2288 do { 2750 do {
2289 qsz = ((1 << q->max_n_shift) * dwords) << 3; 2751 qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
2290 q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma, 2752 q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
2291 GFP_KERNEL); 2753 GFP_KERNEL);
2292 if (q->base || qsz < PAGE_SIZE) 2754 if (q->base || qsz < PAGE_SIZE)
2293 break; 2755 break;
2294 2756
2295 q->max_n_shift--; 2757 q->llq.max_n_shift--;
2296 } while (1); 2758 } while (1);
2297 2759
2298 if (!q->base) { 2760 if (!q->base) {
@@ -2304,7 +2766,7 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
2304 2766
2305 if (!WARN_ON(q->base_dma & (qsz - 1))) { 2767 if (!WARN_ON(q->base_dma & (qsz - 1))) {
2306 dev_info(smmu->dev, "allocated %u entries for %s\n", 2768 dev_info(smmu->dev, "allocated %u entries for %s\n",
2307 1 << q->max_n_shift, name); 2769 1 << q->llq.max_n_shift, name);
2308 } 2770 }
2309 2771
2310 q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu); 2772 q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu);
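
The allocation path above keeps its existing fallback: start with the largest advertised queue and halve it (by dropping max_n_shift) until the coherent allocation succeeds or the queue no longer spans a full page. A stand-alone sketch of that retry pattern, with try_alloc() as a hypothetical stand-in for dmam_alloc_coherent():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u

static void *try_alloc(size_t qsz)
{
    return malloc(qsz); /* pretend this may fail for large sizes */
}

static void *init_queue(unsigned int *max_n_shift, unsigned int dwords)
{
    void *base;
    size_t qsz;

    do {
        qsz = ((1u << *max_n_shift) * dwords) << 3;
        base = try_alloc(qsz);
        if (base || qsz < PAGE_SIZE)
            break;
        (*max_n_shift)--;   /* halve the queue and retry */
    } while (1);

    if (base)
        printf("allocated %u entries (%zu bytes)\n",
               1u << *max_n_shift, qsz);
    return base;
}

int main(void)
{
    unsigned int shift = 10;        /* assumed starting size: 1024 entries */

    free(init_queue(&shift, 2));    /* 2 dwords per entry, as for a cmdq */
    return 0;
}
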
@@ -2313,24 +2775,55 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
2313 2775
2314 q->q_base = Q_BASE_RWA; 2776 q->q_base = Q_BASE_RWA;
2315 q->q_base |= q->base_dma & Q_BASE_ADDR_MASK; 2777 q->q_base |= q->base_dma & Q_BASE_ADDR_MASK;
2316 q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift); 2778 q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
2317 2779
2318 q->prod = q->cons = 0; 2780 q->llq.prod = q->llq.cons = 0;
2319 return 0; 2781 return 0;
2320} 2782}
2321 2783
2784static void arm_smmu_cmdq_free_bitmap(void *data)
2785{
2786 unsigned long *bitmap = data;
2787 bitmap_free(bitmap);
2788}
2789
2790static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
2791{
2792 int ret = 0;
2793 struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
2794 unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
2795 atomic_long_t *bitmap;
2796
2797 atomic_set(&cmdq->owner_prod, 0);
2798 atomic_set(&cmdq->lock, 0);
2799
2800 bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
2801 if (!bitmap) {
2802 dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
2803 ret = -ENOMEM;
2804 } else {
2805 cmdq->valid_map = bitmap;
2806 devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
2807 }
2808
2809 return ret;
2810}
2811
2322static int arm_smmu_init_queues(struct arm_smmu_device *smmu) 2812static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
2323{ 2813{
2324 int ret; 2814 int ret;
2325 2815
2326 /* cmdq */ 2816 /* cmdq */
2327 spin_lock_init(&smmu->cmdq.lock);
2328 ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD, 2817 ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
2329 ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS, 2818 ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
2330 "cmdq"); 2819 "cmdq");
2331 if (ret) 2820 if (ret)
2332 return ret; 2821 return ret;
2333 2822
2823 ret = arm_smmu_cmdq_init(smmu);
2824 if (ret)
2825 return ret;
2826
2334 /* evtq */ 2827 /* evtq */
2335 ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD, 2828 ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
2336 ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS, 2829 ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
@@ -2708,8 +3201,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
2708 3201
2709 /* Command queue */ 3202 /* Command queue */
2710 writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE); 3203 writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
2711 writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD); 3204 writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
2712 writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS); 3205 writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
2713 3206
2714 enables = CR0_CMDQEN; 3207 enables = CR0_CMDQEN;
2715 ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, 3208 ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
@@ -2736,9 +3229,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
2736 3229
2737 /* Event queue */ 3230 /* Event queue */
2738 writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE); 3231 writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
2739 writel_relaxed(smmu->evtq.q.prod, 3232 writel_relaxed(smmu->evtq.q.llq.prod,
2740 arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu)); 3233 arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu));
2741 writel_relaxed(smmu->evtq.q.cons, 3234 writel_relaxed(smmu->evtq.q.llq.cons,
2742 arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu)); 3235 arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu));
2743 3236
2744 enables |= CR0_EVTQEN; 3237 enables |= CR0_EVTQEN;
@@ -2753,9 +3246,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
2753 if (smmu->features & ARM_SMMU_FEAT_PRI) { 3246 if (smmu->features & ARM_SMMU_FEAT_PRI) {
2754 writeq_relaxed(smmu->priq.q.q_base, 3247 writeq_relaxed(smmu->priq.q.q_base,
2755 smmu->base + ARM_SMMU_PRIQ_BASE); 3248 smmu->base + ARM_SMMU_PRIQ_BASE);
2756 writel_relaxed(smmu->priq.q.prod, 3249 writel_relaxed(smmu->priq.q.llq.prod,
2757 arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu)); 3250 arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu));
2758 writel_relaxed(smmu->priq.q.cons, 3251 writel_relaxed(smmu->priq.q.llq.cons,
2759 arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu)); 3252 arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu));
2760 3253
2761 enables |= CR0_PRIQEN; 3254 enables |= CR0_PRIQEN;
@@ -2909,18 +3402,24 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
2909 } 3402 }
2910 3403
2911 /* Queue sizes, capped to ensure natural alignment */ 3404 /* Queue sizes, capped to ensure natural alignment */
2912 smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, 3405 smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
2913 FIELD_GET(IDR1_CMDQS, reg)); 3406 FIELD_GET(IDR1_CMDQS, reg));
2914 if (!smmu->cmdq.q.max_n_shift) { 3407 if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
2915 /* Odd alignment restrictions on the base, so ignore for now */ 3408 /*
2916 dev_err(smmu->dev, "unit-length command queue not supported\n"); 3409 * We don't support splitting up batches, so one batch of
3410 * commands plus an extra sync needs to fit inside the command
3411 * queue. There's also no way we can handle the weird alignment
3412 * restrictions on the base pointer for a unit-length queue.
3413 */
3414 dev_err(smmu->dev, "command queue size <= %d entries not supported\n",
3415 CMDQ_BATCH_ENTRIES);
2917 return -ENXIO; 3416 return -ENXIO;
2918 } 3417 }
2919 3418
2920 smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT, 3419 smmu->evtq.q.llq.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
2921 FIELD_GET(IDR1_EVTQS, reg)); 3420 FIELD_GET(IDR1_EVTQS, reg));
2922 smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT, 3421 smmu->priq.q.llq.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
2923 FIELD_GET(IDR1_PRIQS, reg)); 3422 FIELD_GET(IDR1_PRIQS, reg));
2924 3423
2925 /* SID/SSID sizes */ 3424 /* SID/SSID sizes */
2926 smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); 3425 smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg);
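
The reworked size check above rejects command queues that cannot hold a full command batch plus the trailing CMD_SYNC. A short illustration of the arithmetic, assuming CMDQ_BATCH_ENTRIES is 64: a 64-entry queue (max_n_shift == 6) is too small, while a 128-entry queue (max_n_shift == 7) passes.

#include <stdbool.h>
#include <stdio.h>

#define BATCH_ENTRIES 64  /* assumed value for the example */

static int ilog2(unsigned int v)
{
    int r = -1;

    while (v) {
        v >>= 1;
        r++;
    }
    return r;
}

static bool cmdq_size_ok(unsigned int max_n_shift)
{
    /* Mirrors the check above: the batch plus a sync must fit */
    return max_n_shift > (unsigned int)ilog2(BATCH_ENTRIES);
}

int main(void)
{
    printf("shift 6 (64 entries): %s\n", cmdq_size_ok(6) ? "ok" : "too small");
    printf("shift 7 (128 entries): %s\n", cmdq_size_ok(7) ? "ok" : "too small");
    return 0;
}
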
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 64977c131ee6..5b93c79371e9 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -19,16 +19,13 @@
19 19
20#include <linux/acpi.h> 20#include <linux/acpi.h>
21#include <linux/acpi_iort.h> 21#include <linux/acpi_iort.h>
22#include <linux/atomic.h> 22#include <linux/bitfield.h>
23#include <linux/delay.h> 23#include <linux/delay.h>
24#include <linux/dma-iommu.h> 24#include <linux/dma-iommu.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
26#include <linux/err.h> 26#include <linux/err.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/io.h> 28#include <linux/io.h>
29#include <linux/io-64-nonatomic-hi-lo.h>
30#include <linux/io-pgtable.h>
31#include <linux/iommu.h>
32#include <linux/iopoll.h> 29#include <linux/iopoll.h>
33#include <linux/init.h> 30#include <linux/init.h>
34#include <linux/moduleparam.h> 31#include <linux/moduleparam.h>
@@ -40,12 +37,11 @@
40#include <linux/platform_device.h> 37#include <linux/platform_device.h>
41#include <linux/pm_runtime.h> 38#include <linux/pm_runtime.h>
42#include <linux/slab.h> 39#include <linux/slab.h>
43#include <linux/spinlock.h>
44 40
45#include <linux/amba/bus.h> 41#include <linux/amba/bus.h>
46#include <linux/fsl/mc.h> 42#include <linux/fsl/mc.h>
47 43
48#include "arm-smmu-regs.h" 44#include "arm-smmu.h"
49 45
50/* 46/*
51 * Apparently, some Qualcomm arm64 platforms which appear to expose their SMMU 47 * Apparently, some Qualcomm arm64 platforms which appear to expose their SMMU
@@ -56,46 +52,9 @@
56 */ 52 */
57#define QCOM_DUMMY_VAL -1 53#define QCOM_DUMMY_VAL -1
58 54
59#define ARM_MMU500_ACTLR_CPRE (1 << 1)
60
61#define ARM_MMU500_ACR_CACHE_LOCK (1 << 26)
62#define ARM_MMU500_ACR_S2CRB_TLBEN (1 << 10)
63#define ARM_MMU500_ACR_SMTNMB_TLBEN (1 << 8)
64
65#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */ 55#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
66#define TLB_SPIN_COUNT 10 56#define TLB_SPIN_COUNT 10
67 57
68/* Maximum number of context banks per SMMU */
69#define ARM_SMMU_MAX_CBS 128
70
71/* SMMU global address space */
72#define ARM_SMMU_GR0(smmu) ((smmu)->base)
73#define ARM_SMMU_GR1(smmu) ((smmu)->base + (1 << (smmu)->pgshift))
74
75/*
76 * SMMU global address space with conditional offset to access secure
77 * aliases of non-secure registers (e.g. nsCR0: 0x400, nsGFSR: 0x448,
78 * nsGFSYNR0: 0x450)
79 */
80#define ARM_SMMU_GR0_NS(smmu) \
81 ((smmu)->base + \
82 ((smmu->options & ARM_SMMU_OPT_SECURE_CFG_ACCESS) \
83 ? 0x400 : 0))
84
85/*
86 * Some 64-bit registers only make sense to write atomically, but in such
87 * cases all the data relevant to AArch32 formats lies within the lower word,
88 * therefore this actually makes more sense than it might first appear.
89 */
90#ifdef CONFIG_64BIT
91#define smmu_write_atomic_lq writeq_relaxed
92#else
93#define smmu_write_atomic_lq writel_relaxed
94#endif
95
96/* Translation context bank */
97#define ARM_SMMU_CB(smmu, n) ((smmu)->cb_base + ((n) << (smmu)->pgshift))
98
99#define MSI_IOVA_BASE 0x8000000 58#define MSI_IOVA_BASE 0x8000000
100#define MSI_IOVA_LENGTH 0x100000 59#define MSI_IOVA_LENGTH 0x100000
101 60
@@ -113,19 +72,6 @@ module_param(disable_bypass, bool, S_IRUGO);
113MODULE_PARM_DESC(disable_bypass, 72MODULE_PARM_DESC(disable_bypass,
114 "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU."); 73 "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
115 74
116enum arm_smmu_arch_version {
117 ARM_SMMU_V1,
118 ARM_SMMU_V1_64K,
119 ARM_SMMU_V2,
120};
121
122enum arm_smmu_implementation {
123 GENERIC_SMMU,
124 ARM_MMU500,
125 CAVIUM_SMMUV2,
126 QCOM_SMMUV2,
127};
128
129struct arm_smmu_s2cr { 75struct arm_smmu_s2cr {
130 struct iommu_group *group; 76 struct iommu_group *group;
131 int count; 77 int count;
@@ -163,117 +109,8 @@ struct arm_smmu_master_cfg {
163#define for_each_cfg_sme(fw, i, idx) \ 109#define for_each_cfg_sme(fw, i, idx) \
164 for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i) 110 for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
165 111
166struct arm_smmu_device {
167 struct device *dev;
168
169 void __iomem *base;
170 void __iomem *cb_base;
171 unsigned long pgshift;
172
173#define ARM_SMMU_FEAT_COHERENT_WALK (1 << 0)
174#define ARM_SMMU_FEAT_STREAM_MATCH (1 << 1)
175#define ARM_SMMU_FEAT_TRANS_S1 (1 << 2)
176#define ARM_SMMU_FEAT_TRANS_S2 (1 << 3)
177#define ARM_SMMU_FEAT_TRANS_NESTED (1 << 4)
178#define ARM_SMMU_FEAT_TRANS_OPS (1 << 5)
179#define ARM_SMMU_FEAT_VMID16 (1 << 6)
180#define ARM_SMMU_FEAT_FMT_AARCH64_4K (1 << 7)
181#define ARM_SMMU_FEAT_FMT_AARCH64_16K (1 << 8)
182#define ARM_SMMU_FEAT_FMT_AARCH64_64K (1 << 9)
183#define ARM_SMMU_FEAT_FMT_AARCH32_L (1 << 10)
184#define ARM_SMMU_FEAT_FMT_AARCH32_S (1 << 11)
185#define ARM_SMMU_FEAT_EXIDS (1 << 12)
186 u32 features;
187
188#define ARM_SMMU_OPT_SECURE_CFG_ACCESS (1 << 0)
189 u32 options;
190 enum arm_smmu_arch_version version;
191 enum arm_smmu_implementation model;
192
193 u32 num_context_banks;
194 u32 num_s2_context_banks;
195 DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
196 struct arm_smmu_cb *cbs;
197 atomic_t irptndx;
198
199 u32 num_mapping_groups;
200 u16 streamid_mask;
201 u16 smr_mask_mask;
202 struct arm_smmu_smr *smrs;
203 struct arm_smmu_s2cr *s2crs;
204 struct mutex stream_map_mutex;
205
206 unsigned long va_size;
207 unsigned long ipa_size;
208 unsigned long pa_size;
209 unsigned long pgsize_bitmap;
210
211 u32 num_global_irqs;
212 u32 num_context_irqs;
213 unsigned int *irqs;
214 struct clk_bulk_data *clks;
215 int num_clks;
216
217 u32 cavium_id_base; /* Specific to Cavium */
218
219 spinlock_t global_sync_lock;
220
221 /* IOMMU core code handle */
222 struct iommu_device iommu;
223};
224
225enum arm_smmu_context_fmt {
226 ARM_SMMU_CTX_FMT_NONE,
227 ARM_SMMU_CTX_FMT_AARCH64,
228 ARM_SMMU_CTX_FMT_AARCH32_L,
229 ARM_SMMU_CTX_FMT_AARCH32_S,
230};
231
232struct arm_smmu_cfg {
233 u8 cbndx;
234 u8 irptndx;
235 union {
236 u16 asid;
237 u16 vmid;
238 };
239 u32 cbar;
240 enum arm_smmu_context_fmt fmt;
241};
242#define INVALID_IRPTNDX 0xff
243
244enum arm_smmu_domain_stage {
245 ARM_SMMU_DOMAIN_S1 = 0,
246 ARM_SMMU_DOMAIN_S2,
247 ARM_SMMU_DOMAIN_NESTED,
248 ARM_SMMU_DOMAIN_BYPASS,
249};
250
251struct arm_smmu_domain {
252 struct arm_smmu_device *smmu;
253 struct io_pgtable_ops *pgtbl_ops;
254 const struct iommu_gather_ops *tlb_ops;
255 struct arm_smmu_cfg cfg;
256 enum arm_smmu_domain_stage stage;
257 bool non_strict;
258 struct mutex init_mutex; /* Protects smmu pointer */
259 spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */
260 struct iommu_domain domain;
261};
262
263struct arm_smmu_option_prop {
264 u32 opt;
265 const char *prop;
266};
267
268static atomic_t cavium_smmu_context_count = ATOMIC_INIT(0);
269
270static bool using_legacy_binding, using_generic_binding; 112static bool using_legacy_binding, using_generic_binding;
271 113
272static struct arm_smmu_option_prop arm_smmu_options[] = {
273 { ARM_SMMU_OPT_SECURE_CFG_ACCESS, "calxeda,smmu-secure-config-access" },
274 { 0, NULL},
275};
276
277static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu) 114static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu)
278{ 115{
279 if (pm_runtime_enabled(smmu->dev)) 116 if (pm_runtime_enabled(smmu->dev))
@@ -293,20 +130,6 @@ static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
293 return container_of(dom, struct arm_smmu_domain, domain); 130 return container_of(dom, struct arm_smmu_domain, domain);
294} 131}
295 132
296static void parse_driver_options(struct arm_smmu_device *smmu)
297{
298 int i = 0;
299
300 do {
301 if (of_property_read_bool(smmu->dev->of_node,
302 arm_smmu_options[i].prop)) {
303 smmu->options |= arm_smmu_options[i].opt;
304 dev_notice(smmu->dev, "option %s\n",
305 arm_smmu_options[i].prop);
306 }
307 } while (arm_smmu_options[++i].opt);
308}
309
310static struct device_node *dev_get_dev_node(struct device *dev) 133static struct device_node *dev_get_dev_node(struct device *dev)
311{ 134{
312 if (dev_is_pci(dev)) { 135 if (dev_is_pci(dev)) {
@@ -415,15 +238,17 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
415} 238}
416 239
417/* Wait for any pending TLB invalidations to complete */ 240/* Wait for any pending TLB invalidations to complete */
418static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, 241static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
419 void __iomem *sync, void __iomem *status) 242 int sync, int status)
420{ 243{
421 unsigned int spin_cnt, delay; 244 unsigned int spin_cnt, delay;
245 u32 reg;
422 246
423 writel_relaxed(QCOM_DUMMY_VAL, sync); 247 arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL);
424 for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { 248 for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
425 for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { 249 for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
426 if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE)) 250 reg = arm_smmu_readl(smmu, page, status);
251 if (!(reg & sTLBGSTATUS_GSACTIVE))
427 return; 252 return;
428 cpu_relax(); 253 cpu_relax();
429 } 254 }
@@ -435,12 +260,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
435 260
436static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu) 261static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
437{ 262{
438 void __iomem *base = ARM_SMMU_GR0(smmu);
439 unsigned long flags; 263 unsigned long flags;
440 264
441 spin_lock_irqsave(&smmu->global_sync_lock, flags); 265 spin_lock_irqsave(&smmu->global_sync_lock, flags);
442 __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_GR0_sTLBGSYNC, 266 __arm_smmu_tlb_sync(smmu, ARM_SMMU_GR0, ARM_SMMU_GR0_sTLBGSYNC,
443 base + ARM_SMMU_GR0_sTLBGSTATUS); 267 ARM_SMMU_GR0_sTLBGSTATUS);
444 spin_unlock_irqrestore(&smmu->global_sync_lock, flags); 268 spin_unlock_irqrestore(&smmu->global_sync_lock, flags);
445} 269}
446 270
@@ -448,12 +272,11 @@ static void arm_smmu_tlb_sync_context(void *cookie)
448{ 272{
449 struct arm_smmu_domain *smmu_domain = cookie; 273 struct arm_smmu_domain *smmu_domain = cookie;
450 struct arm_smmu_device *smmu = smmu_domain->smmu; 274 struct arm_smmu_device *smmu = smmu_domain->smmu;
451 void __iomem *base = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);
452 unsigned long flags; 275 unsigned long flags;
453 276
454 spin_lock_irqsave(&smmu_domain->cb_lock, flags); 277 spin_lock_irqsave(&smmu_domain->cb_lock, flags);
455 __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_CB_TLBSYNC, 278 __arm_smmu_tlb_sync(smmu, ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx),
456 base + ARM_SMMU_CB_TLBSTATUS); 279 ARM_SMMU_CB_TLBSYNC, ARM_SMMU_CB_TLBSTATUS);
457 spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); 280 spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
458} 281}
459 282
@@ -467,14 +290,13 @@ static void arm_smmu_tlb_sync_vmid(void *cookie)
467static void arm_smmu_tlb_inv_context_s1(void *cookie) 290static void arm_smmu_tlb_inv_context_s1(void *cookie)
468{ 291{
469 struct arm_smmu_domain *smmu_domain = cookie; 292 struct arm_smmu_domain *smmu_domain = cookie;
470 struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
471 void __iomem *base = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
472
473 /* 293 /*
474 * NOTE: this is not a relaxed write; it needs to guarantee that PTEs 294 * The TLBI write may be relaxed, so ensure that PTEs cleared by the
475 * cleared by the current CPU are visible to the SMMU before the TLBI. 295 * current CPU are visible beforehand.
476 */ 296 */
477 writel(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID); 297 wmb();
298 arm_smmu_cb_write(smmu_domain->smmu, smmu_domain->cfg.cbndx,
299 ARM_SMMU_CB_S1_TLBIASID, smmu_domain->cfg.asid);
478 arm_smmu_tlb_sync_context(cookie); 300 arm_smmu_tlb_sync_context(cookie);
479} 301}
480 302
@@ -482,87 +304,143 @@ static void arm_smmu_tlb_inv_context_s2(void *cookie)
482{ 304{
483 struct arm_smmu_domain *smmu_domain = cookie; 305 struct arm_smmu_domain *smmu_domain = cookie;
484 struct arm_smmu_device *smmu = smmu_domain->smmu; 306 struct arm_smmu_device *smmu = smmu_domain->smmu;
485 void __iomem *base = ARM_SMMU_GR0(smmu);
486 307
487 /* NOTE: see above */ 308 /* See above */
488 writel(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID); 309 wmb();
310 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid);
489 arm_smmu_tlb_sync_global(smmu); 311 arm_smmu_tlb_sync_global(smmu);
490} 312}
491 313
492static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, 314static void arm_smmu_tlb_inv_range_s1(unsigned long iova, size_t size,
493 size_t granule, bool leaf, void *cookie) 315 size_t granule, bool leaf, void *cookie)
494{ 316{
495 struct arm_smmu_domain *smmu_domain = cookie; 317 struct arm_smmu_domain *smmu_domain = cookie;
318 struct arm_smmu_device *smmu = smmu_domain->smmu;
496 struct arm_smmu_cfg *cfg = &smmu_domain->cfg; 319 struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
497 bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS; 320 int reg, idx = cfg->cbndx;
498 void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
499 321
500 if (smmu_domain->smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) 322 if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
501 wmb(); 323 wmb();
502 324
503 if (stage1) { 325 reg = leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
504 reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA; 326
505 327 if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
506 if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) { 328 iova = (iova >> 12) << 12;
507 iova &= ~12UL; 329 iova |= cfg->asid;
508 iova |= cfg->asid; 330 do {
509 do { 331 arm_smmu_cb_write(smmu, idx, reg, iova);
510 writel_relaxed(iova, reg); 332 iova += granule;
511 iova += granule; 333 } while (size -= granule);
512 } while (size -= granule);
513 } else {
514 iova >>= 12;
515 iova |= (u64)cfg->asid << 48;
516 do {
517 writeq_relaxed(iova, reg);
518 iova += granule >> 12;
519 } while (size -= granule);
520 }
521 } else { 334 } else {
522 reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
523 ARM_SMMU_CB_S2_TLBIIPAS2;
524 iova >>= 12; 335 iova >>= 12;
336 iova |= (u64)cfg->asid << 48;
525 do { 337 do {
526 smmu_write_atomic_lq(iova, reg); 338 arm_smmu_cb_writeq(smmu, idx, reg, iova);
527 iova += granule >> 12; 339 iova += granule >> 12;
528 } while (size -= granule); 340 } while (size -= granule);
529 } 341 }
530} 342}
531 343
344static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size,
345 size_t granule, bool leaf, void *cookie)
346{
347 struct arm_smmu_domain *smmu_domain = cookie;
348 struct arm_smmu_device *smmu = smmu_domain->smmu;
349 int reg, idx = smmu_domain->cfg.cbndx;
350
351 if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
352 wmb();
353
354 reg = leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L : ARM_SMMU_CB_S2_TLBIIPAS2;
355 iova >>= 12;
356 do {
357 if (smmu_domain->cfg.fmt == ARM_SMMU_CTX_FMT_AARCH64)
358 arm_smmu_cb_writeq(smmu, idx, reg, iova);
359 else
360 arm_smmu_cb_write(smmu, idx, reg, iova);
361 iova += granule >> 12;
362 } while (size -= granule);
363}
364
532/* 365/*
533 * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears 366 * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears
534 * almost negligible, but the benefit of getting the first one in as far ahead 367 * almost negligible, but the benefit of getting the first one in as far ahead
535 * of the sync as possible is significant, hence we don't just make this a 368 * of the sync as possible is significant, hence we don't just make this a
536 * no-op and set .tlb_sync to arm_smmu_inv_context_s2() as you might think. 369 * no-op and set .tlb_sync to arm_smmu_tlb_inv_context_s2() as you might think.
537 */ 370 */
538static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size, 371static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size,
539 size_t granule, bool leaf, void *cookie) 372 size_t granule, bool leaf, void *cookie)
540{ 373{
541 struct arm_smmu_domain *smmu_domain = cookie; 374 struct arm_smmu_domain *smmu_domain = cookie;
542 void __iomem *base = ARM_SMMU_GR0(smmu_domain->smmu); 375 struct arm_smmu_device *smmu = smmu_domain->smmu;
543 376
544 if (smmu_domain->smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) 377 if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
545 wmb(); 378 wmb();
546 379
547 writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID); 380 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid);
381}
382
383static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
384 size_t granule, void *cookie)
385{
386 struct arm_smmu_domain *smmu_domain = cookie;
387 const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops;
388
389 ops->tlb_inv_range(iova, size, granule, false, cookie);
390 ops->tlb_sync(cookie);
391}
392
393static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
394 size_t granule, void *cookie)
395{
396 struct arm_smmu_domain *smmu_domain = cookie;
397 const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops;
398
399 ops->tlb_inv_range(iova, size, granule, true, cookie);
400 ops->tlb_sync(cookie);
401}
402
403static void arm_smmu_tlb_add_page(struct iommu_iotlb_gather *gather,
404 unsigned long iova, size_t granule,
405 void *cookie)
406{
407 struct arm_smmu_domain *smmu_domain = cookie;
408 const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops;
409
410 ops->tlb_inv_range(iova, granule, granule, true, cookie);
548} 411}
549 412
550static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = { 413static const struct arm_smmu_flush_ops arm_smmu_s1_tlb_ops = {
551 .tlb_flush_all = arm_smmu_tlb_inv_context_s1, 414 .tlb = {
552 .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, 415 .tlb_flush_all = arm_smmu_tlb_inv_context_s1,
553 .tlb_sync = arm_smmu_tlb_sync_context, 416 .tlb_flush_walk = arm_smmu_tlb_inv_walk,
417 .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
418 .tlb_add_page = arm_smmu_tlb_add_page,
419 },
420 .tlb_inv_range = arm_smmu_tlb_inv_range_s1,
421 .tlb_sync = arm_smmu_tlb_sync_context,
554}; 422};
555 423
556static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v2 = { 424static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v2 = {
557 .tlb_flush_all = arm_smmu_tlb_inv_context_s2, 425 .tlb = {
558 .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, 426 .tlb_flush_all = arm_smmu_tlb_inv_context_s2,
559 .tlb_sync = arm_smmu_tlb_sync_context, 427 .tlb_flush_walk = arm_smmu_tlb_inv_walk,
428 .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
429 .tlb_add_page = arm_smmu_tlb_add_page,
430 },
431 .tlb_inv_range = arm_smmu_tlb_inv_range_s2,
432 .tlb_sync = arm_smmu_tlb_sync_context,
560}; 433};
561 434
562static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = { 435static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v1 = {
563 .tlb_flush_all = arm_smmu_tlb_inv_context_s2, 436 .tlb = {
564 .tlb_add_flush = arm_smmu_tlb_inv_vmid_nosync, 437 .tlb_flush_all = arm_smmu_tlb_inv_context_s2,
565 .tlb_sync = arm_smmu_tlb_sync_vmid, 438 .tlb_flush_walk = arm_smmu_tlb_inv_walk,
439 .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
440 .tlb_add_page = arm_smmu_tlb_add_page,
441 },
442 .tlb_inv_range = arm_smmu_tlb_inv_vmid_nosync,
443 .tlb_sync = arm_smmu_tlb_sync_vmid,
566}; 444};
567 445
568static irqreturn_t arm_smmu_context_fault(int irq, void *dev) 446static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
@@ -571,26 +449,22 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
571 unsigned long iova; 449 unsigned long iova;
572 struct iommu_domain *domain = dev; 450 struct iommu_domain *domain = dev;
573 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); 451 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
574 struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
575 struct arm_smmu_device *smmu = smmu_domain->smmu; 452 struct arm_smmu_device *smmu = smmu_domain->smmu;
576 void __iomem *gr1_base = ARM_SMMU_GR1(smmu); 453 int idx = smmu_domain->cfg.cbndx;
577 void __iomem *cb_base;
578
579 cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
580 fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
581 454
455 fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
582 if (!(fsr & FSR_FAULT)) 456 if (!(fsr & FSR_FAULT))
583 return IRQ_NONE; 457 return IRQ_NONE;
584 458
585 fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0); 459 fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0);
586 iova = readq_relaxed(cb_base + ARM_SMMU_CB_FAR); 460 iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR);
587 cbfrsynra = readl_relaxed(gr1_base + ARM_SMMU_GR1_CBFRSYNRA(cfg->cbndx)); 461 cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx));
588 462
589 dev_err_ratelimited(smmu->dev, 463 dev_err_ratelimited(smmu->dev,
590 "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", 464 "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
591 fsr, iova, fsynr, cbfrsynra, cfg->cbndx); 465 fsr, iova, fsynr, cbfrsynra, idx);
592 466
593 writel(fsr, cb_base + ARM_SMMU_CB_FSR); 467 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
594 return IRQ_HANDLED; 468 return IRQ_HANDLED;
595} 469}
596 470
@@ -598,12 +472,11 @@ static irqreturn_t arm_smmu_global_fault(int irq, void *dev)
598{ 472{
599 u32 gfsr, gfsynr0, gfsynr1, gfsynr2; 473 u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
600 struct arm_smmu_device *smmu = dev; 474 struct arm_smmu_device *smmu = dev;
601 void __iomem *gr0_base = ARM_SMMU_GR0_NS(smmu);
602 475
603 gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR); 476 gfsr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSR);
604 gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0); 477 gfsynr0 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR0);
605 gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1); 478 gfsynr1 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR1);
606 gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2); 479 gfsynr2 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR2);
607 480
608 if (!gfsr) 481 if (!gfsr)
609 return IRQ_NONE; 482 return IRQ_NONE;
@@ -614,7 +487,7 @@ static irqreturn_t arm_smmu_global_fault(int irq, void *dev)
614 "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n", 487 "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
615 gfsr, gfsynr0, gfsynr1, gfsynr2); 488 gfsr, gfsynr0, gfsynr1, gfsynr2);
616 489
617 writel(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR); 490 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sGFSR, gfsr);
618 return IRQ_HANDLED; 491 return IRQ_HANDLED;
619} 492}
620 493
@@ -627,16 +500,16 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
627 500
628 cb->cfg = cfg; 501 cb->cfg = cfg;
629 502
630 /* TTBCR */ 503 /* TCR */
631 if (stage1) { 504 if (stage1) {
632 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) { 505 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
633 cb->tcr[0] = pgtbl_cfg->arm_v7s_cfg.tcr; 506 cb->tcr[0] = pgtbl_cfg->arm_v7s_cfg.tcr;
634 } else { 507 } else {
635 cb->tcr[0] = pgtbl_cfg->arm_lpae_s1_cfg.tcr; 508 cb->tcr[0] = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
636 cb->tcr[1] = pgtbl_cfg->arm_lpae_s1_cfg.tcr >> 32; 509 cb->tcr[1] = pgtbl_cfg->arm_lpae_s1_cfg.tcr >> 32;
637 cb->tcr[1] |= TTBCR2_SEP_UPSTREAM; 510 cb->tcr[1] |= FIELD_PREP(TCR2_SEP, TCR2_SEP_UPSTREAM);
638 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64) 511 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
639 cb->tcr[1] |= TTBCR2_AS; 512 cb->tcr[1] |= TCR2_AS;
640 } 513 }
641 } else { 514 } else {
642 cb->tcr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vtcr; 515 cb->tcr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
@@ -649,9 +522,9 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
649 cb->ttbr[1] = pgtbl_cfg->arm_v7s_cfg.ttbr[1]; 522 cb->ttbr[1] = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
650 } else { 523 } else {
651 cb->ttbr[0] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0]; 524 cb->ttbr[0] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
652 cb->ttbr[0] |= (u64)cfg->asid << TTBRn_ASID_SHIFT; 525 cb->ttbr[0] |= FIELD_PREP(TTBRn_ASID, cfg->asid);
653 cb->ttbr[1] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1]; 526 cb->ttbr[1] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
654 cb->ttbr[1] |= (u64)cfg->asid << TTBRn_ASID_SHIFT; 527 cb->ttbr[1] |= FIELD_PREP(TTBRn_ASID, cfg->asid);
655 } 528 }
656 } else { 529 } else {
657 cb->ttbr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vttbr; 530 cb->ttbr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
@@ -675,74 +548,71 @@ static void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx)
675 bool stage1; 548 bool stage1;
676 struct arm_smmu_cb *cb = &smmu->cbs[idx]; 549 struct arm_smmu_cb *cb = &smmu->cbs[idx];
677 struct arm_smmu_cfg *cfg = cb->cfg; 550 struct arm_smmu_cfg *cfg = cb->cfg;
678 void __iomem *cb_base, *gr1_base;
679
680 cb_base = ARM_SMMU_CB(smmu, idx);
681 551
682 /* Unassigned context banks only need disabling */ 552 /* Unassigned context banks only need disabling */
683 if (!cfg) { 553 if (!cfg) {
684 writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR); 554 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, 0);
685 return; 555 return;
686 } 556 }
687 557
688 gr1_base = ARM_SMMU_GR1(smmu);
689 stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS; 558 stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
690 559
691 /* CBA2R */ 560 /* CBA2R */
692 if (smmu->version > ARM_SMMU_V1) { 561 if (smmu->version > ARM_SMMU_V1) {
693 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64) 562 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
694 reg = CBA2R_RW64_64BIT; 563 reg = CBA2R_VA64;
695 else 564 else
696 reg = CBA2R_RW64_32BIT; 565 reg = 0;
697 /* 16-bit VMIDs live in CBA2R */ 566 /* 16-bit VMIDs live in CBA2R */
698 if (smmu->features & ARM_SMMU_FEAT_VMID16) 567 if (smmu->features & ARM_SMMU_FEAT_VMID16)
699 reg |= cfg->vmid << CBA2R_VMID_SHIFT; 568 reg |= FIELD_PREP(CBA2R_VMID16, cfg->vmid);
700 569
701 writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(idx)); 570 arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBA2R(idx), reg);
702 } 571 }
703 572
704 /* CBAR */ 573 /* CBAR */
705 reg = cfg->cbar; 574 reg = FIELD_PREP(CBAR_TYPE, cfg->cbar);
706 if (smmu->version < ARM_SMMU_V2) 575 if (smmu->version < ARM_SMMU_V2)
707 reg |= cfg->irptndx << CBAR_IRPTNDX_SHIFT; 576 reg |= FIELD_PREP(CBAR_IRPTNDX, cfg->irptndx);
708 577
709 /* 578 /*
710 * Use the weakest shareability/memory types, so they are 579 * Use the weakest shareability/memory types, so they are
711 * overridden by the ttbcr/pte. 580 * overridden by the ttbcr/pte.
712 */ 581 */
713 if (stage1) { 582 if (stage1) {
714 reg |= (CBAR_S1_BPSHCFG_NSH << CBAR_S1_BPSHCFG_SHIFT) | 583 reg |= FIELD_PREP(CBAR_S1_BPSHCFG, CBAR_S1_BPSHCFG_NSH) |
715 (CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT); 584 FIELD_PREP(CBAR_S1_MEMATTR, CBAR_S1_MEMATTR_WB);
716 } else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) { 585 } else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
717 /* 8-bit VMIDs live in CBAR */ 586 /* 8-bit VMIDs live in CBAR */
718 reg |= cfg->vmid << CBAR_VMID_SHIFT; 587 reg |= FIELD_PREP(CBAR_VMID, cfg->vmid);
719 } 588 }
720 writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(idx)); 589 arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(idx), reg);
721 590
722 /* 591 /*
723 * TTBCR 592 * TCR
724 * We must write this before the TTBRs, since it determines the 593 * We must write this before the TTBRs, since it determines the
725 * access behaviour of some fields (in particular, ASID[15:8]). 594 * access behaviour of some fields (in particular, ASID[15:8]).
726 */ 595 */
727 if (stage1 && smmu->version > ARM_SMMU_V1) 596 if (stage1 && smmu->version > ARM_SMMU_V1)
728 writel_relaxed(cb->tcr[1], cb_base + ARM_SMMU_CB_TTBCR2); 597 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TCR2, cb->tcr[1]);
729 writel_relaxed(cb->tcr[0], cb_base + ARM_SMMU_CB_TTBCR); 598 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TCR, cb->tcr[0]);
730 599
731 /* TTBRs */ 600 /* TTBRs */
732 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) { 601 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
733 writel_relaxed(cfg->asid, cb_base + ARM_SMMU_CB_CONTEXTIDR); 602 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_CONTEXTIDR, cfg->asid);
734 writel_relaxed(cb->ttbr[0], cb_base + ARM_SMMU_CB_TTBR0); 603 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TTBR0, cb->ttbr[0]);
735 writel_relaxed(cb->ttbr[1], cb_base + ARM_SMMU_CB_TTBR1); 604 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TTBR1, cb->ttbr[1]);
736 } else { 605 } else {
737 writeq_relaxed(cb->ttbr[0], cb_base + ARM_SMMU_CB_TTBR0); 606 arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_TTBR0, cb->ttbr[0]);
738 if (stage1) 607 if (stage1)
739 writeq_relaxed(cb->ttbr[1], cb_base + ARM_SMMU_CB_TTBR1); 608 arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_TTBR1,
609 cb->ttbr[1]);
740 } 610 }
741 611
742 /* MAIRs (stage-1 only) */ 612 /* MAIRs (stage-1 only) */
743 if (stage1) { 613 if (stage1) {
744 writel_relaxed(cb->mair[0], cb_base + ARM_SMMU_CB_S1_MAIR0); 614 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_S1_MAIR0, cb->mair[0]);
745 writel_relaxed(cb->mair[1], cb_base + ARM_SMMU_CB_S1_MAIR1); 615 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_S1_MAIR1, cb->mair[1]);
746 } 616 }
747 617
748 /* SCTLR */ 618 /* SCTLR */
@@ -752,7 +622,7 @@ static void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx)
752 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) 622 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
753 reg |= SCTLR_E; 623 reg |= SCTLR_E;
754 624
755 writel_relaxed(reg, cb_base + ARM_SMMU_CB_SCTLR); 625 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
756} 626}
757 627
758static int arm_smmu_init_domain_context(struct iommu_domain *domain, 628static int arm_smmu_init_domain_context(struct iommu_domain *domain,
@@ -842,7 +712,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
842 ias = min(ias, 32UL); 712 ias = min(ias, 32UL);
843 oas = min(oas, 32UL); 713 oas = min(oas, 32UL);
844 } 714 }
845 smmu_domain->tlb_ops = &arm_smmu_s1_tlb_ops; 715 smmu_domain->flush_ops = &arm_smmu_s1_tlb_ops;
846 break; 716 break;
847 case ARM_SMMU_DOMAIN_NESTED: 717 case ARM_SMMU_DOMAIN_NESTED:
848 /* 718 /*
@@ -862,9 +732,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
862 oas = min(oas, 40UL); 732 oas = min(oas, 40UL);
863 } 733 }
864 if (smmu->version == ARM_SMMU_V2) 734 if (smmu->version == ARM_SMMU_V2)
865 smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v2; 735 smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v2;
866 else 736 else
867 smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v1; 737 smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v1;
868 break; 738 break;
869 default: 739 default:
870 ret = -EINVAL; 740 ret = -EINVAL;
@@ -884,23 +754,29 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
884 } 754 }
885 755
886 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2) 756 if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
887 cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base; 757 cfg->vmid = cfg->cbndx + 1;
888 else 758 else
889 cfg->asid = cfg->cbndx + smmu->cavium_id_base; 759 cfg->asid = cfg->cbndx;
760
761 smmu_domain->smmu = smmu;
762 if (smmu->impl && smmu->impl->init_context) {
763 ret = smmu->impl->init_context(smmu_domain);
764 if (ret)
765 goto out_unlock;
766 }
890 767
891 pgtbl_cfg = (struct io_pgtable_cfg) { 768 pgtbl_cfg = (struct io_pgtable_cfg) {
892 .pgsize_bitmap = smmu->pgsize_bitmap, 769 .pgsize_bitmap = smmu->pgsize_bitmap,
893 .ias = ias, 770 .ias = ias,
894 .oas = oas, 771 .oas = oas,
895 .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK, 772 .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK,
896 .tlb = smmu_domain->tlb_ops, 773 .tlb = &smmu_domain->flush_ops->tlb,
897 .iommu_dev = smmu->dev, 774 .iommu_dev = smmu->dev,
898 }; 775 };
899 776
900 if (smmu_domain->non_strict) 777 if (smmu_domain->non_strict)
901 pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; 778 pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
902 779
903 smmu_domain->smmu = smmu;
904 pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); 780 pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
905 if (!pgtbl_ops) { 781 if (!pgtbl_ops) {
906 ret = -ENOMEM; 782 ret = -ENOMEM;
@@ -1019,24 +895,24 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
1019static void arm_smmu_write_smr(struct arm_smmu_device *smmu, int idx) 895static void arm_smmu_write_smr(struct arm_smmu_device *smmu, int idx)
1020{ 896{
1021 struct arm_smmu_smr *smr = smmu->smrs + idx; 897 struct arm_smmu_smr *smr = smmu->smrs + idx;
1022 u32 reg = smr->id << SMR_ID_SHIFT | smr->mask << SMR_MASK_SHIFT; 898 u32 reg = FIELD_PREP(SMR_ID, smr->id) | FIELD_PREP(SMR_MASK, smr->mask);
1023 899
1024 if (!(smmu->features & ARM_SMMU_FEAT_EXIDS) && smr->valid) 900 if (!(smmu->features & ARM_SMMU_FEAT_EXIDS) && smr->valid)
1025 reg |= SMR_VALID; 901 reg |= SMR_VALID;
1026 writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_SMR(idx)); 902 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(idx), reg);
1027} 903}
1028 904
1029static void arm_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx) 905static void arm_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
1030{ 906{
1031 struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx; 907 struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx;
1032 u32 reg = (s2cr->type & S2CR_TYPE_MASK) << S2CR_TYPE_SHIFT | 908 u32 reg = FIELD_PREP(S2CR_TYPE, s2cr->type) |
1033 (s2cr->cbndx & S2CR_CBNDX_MASK) << S2CR_CBNDX_SHIFT | 909 FIELD_PREP(S2CR_CBNDX, s2cr->cbndx) |
1034 (s2cr->privcfg & S2CR_PRIVCFG_MASK) << S2CR_PRIVCFG_SHIFT; 910 FIELD_PREP(S2CR_PRIVCFG, s2cr->privcfg);
1035 911
1036 if (smmu->features & ARM_SMMU_FEAT_EXIDS && smmu->smrs && 912 if (smmu->features & ARM_SMMU_FEAT_EXIDS && smmu->smrs &&
1037 smmu->smrs[idx].valid) 913 smmu->smrs[idx].valid)
1038 reg |= S2CR_EXIDVALID; 914 reg |= S2CR_EXIDVALID;
1039 writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_S2CR(idx)); 915 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_S2CR(idx), reg);
1040} 916}
1041 917
1042static void arm_smmu_write_sme(struct arm_smmu_device *smmu, int idx) 918static void arm_smmu_write_sme(struct arm_smmu_device *smmu, int idx)
@@ -1052,7 +928,6 @@ static void arm_smmu_write_sme(struct arm_smmu_device *smmu, int idx)
1052 */ 928 */
1053static void arm_smmu_test_smr_masks(struct arm_smmu_device *smmu) 929static void arm_smmu_test_smr_masks(struct arm_smmu_device *smmu)
1054{ 930{
1055 void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
1056 u32 smr; 931 u32 smr;
1057 932
1058 if (!smmu->smrs) 933 if (!smmu->smrs)
@@ -1063,15 +938,15 @@ static void arm_smmu_test_smr_masks(struct arm_smmu_device *smmu)
1063 * bits are set, so check each one separately. We can reject 938 * bits are set, so check each one separately. We can reject
1064 * masters later if they try to claim IDs outside these masks. 939 * masters later if they try to claim IDs outside these masks.
1065 */ 940 */
1066 smr = smmu->streamid_mask << SMR_ID_SHIFT; 941 smr = FIELD_PREP(SMR_ID, smmu->streamid_mask);
1067 writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0)); 942 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(0), smr);
1068 smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0)); 943 smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(0));
1069 smmu->streamid_mask = smr >> SMR_ID_SHIFT; 944 smmu->streamid_mask = FIELD_GET(SMR_ID, smr);
1070 945
1071 smr = smmu->streamid_mask << SMR_MASK_SHIFT; 946 smr = FIELD_PREP(SMR_MASK, smmu->streamid_mask);
1072 writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0)); 947 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(0), smr);
1073 smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0)); 948 smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(0));
1074 smmu->smr_mask_mask = smr >> SMR_MASK_SHIFT; 949 smmu->smr_mask_mask = FIELD_GET(SMR_MASK, smr);
1075} 950}
1076 951
1077static int arm_smmu_find_sme(struct arm_smmu_device *smmu, u16 id, u16 mask) 952static int arm_smmu_find_sme(struct arm_smmu_device *smmu, u16 id, u16 mask)
@@ -1140,8 +1015,8 @@ static int arm_smmu_master_alloc_smes(struct device *dev)
1140 mutex_lock(&smmu->stream_map_mutex); 1015 mutex_lock(&smmu->stream_map_mutex);
1141 /* Figure out a viable stream map entry allocation */ 1016 /* Figure out a viable stream map entry allocation */
1142 for_each_cfg_sme(fwspec, i, idx) { 1017 for_each_cfg_sme(fwspec, i, idx) {
1143 u16 sid = fwspec->ids[i]; 1018 u16 sid = FIELD_GET(SMR_ID, fwspec->ids[i]);
1144 u16 mask = fwspec->ids[i] >> SMR_MASK_SHIFT; 1019 u16 mask = FIELD_GET(SMR_MASK, fwspec->ids[i]);
1145 1020
1146 if (idx != INVALID_SMENDX) { 1021 if (idx != INVALID_SMENDX) {
1147 ret = -EEXIST; 1022 ret = -EEXIST;
@@ -1301,7 +1176,7 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
1301} 1176}
1302 1177
1303static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, 1178static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
1304 size_t size) 1179 size_t size, struct iommu_iotlb_gather *gather)
1305{ 1180{
1306 struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; 1181 struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
1307 struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; 1182 struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
@@ -1311,7 +1186,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
1311 return 0; 1186 return 0;
1312 1187
1313 arm_smmu_rpm_get(smmu); 1188 arm_smmu_rpm_get(smmu);
1314 ret = ops->unmap(ops, iova, size); 1189 ret = ops->unmap(ops, iova, size, gather);
1315 arm_smmu_rpm_put(smmu); 1190 arm_smmu_rpm_put(smmu);
1316 1191
1317 return ret; 1192 return ret;
@@ -1322,21 +1197,22 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
1322 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); 1197 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
1323 struct arm_smmu_device *smmu = smmu_domain->smmu; 1198 struct arm_smmu_device *smmu = smmu_domain->smmu;
1324 1199
1325 if (smmu_domain->tlb_ops) { 1200 if (smmu_domain->flush_ops) {
1326 arm_smmu_rpm_get(smmu); 1201 arm_smmu_rpm_get(smmu);
1327 smmu_domain->tlb_ops->tlb_flush_all(smmu_domain); 1202 smmu_domain->flush_ops->tlb.tlb_flush_all(smmu_domain);
1328 arm_smmu_rpm_put(smmu); 1203 arm_smmu_rpm_put(smmu);
1329 } 1204 }
1330} 1205}
1331 1206
1332static void arm_smmu_iotlb_sync(struct iommu_domain *domain) 1207static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
1208 struct iommu_iotlb_gather *gather)
1333{ 1209{
1334 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); 1210 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
1335 struct arm_smmu_device *smmu = smmu_domain->smmu; 1211 struct arm_smmu_device *smmu = smmu_domain->smmu;
1336 1212
1337 if (smmu_domain->tlb_ops) { 1213 if (smmu_domain->flush_ops) {
1338 arm_smmu_rpm_get(smmu); 1214 arm_smmu_rpm_get(smmu);
1339 smmu_domain->tlb_ops->tlb_sync(smmu_domain); 1215 smmu_domain->flush_ops->tlb_sync(smmu_domain);
1340 arm_smmu_rpm_put(smmu); 1216 arm_smmu_rpm_put(smmu);
1341 } 1217 }
1342} 1218}
@@ -1349,28 +1225,25 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
1349 struct arm_smmu_cfg *cfg = &smmu_domain->cfg; 1225 struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
1350 struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops; 1226 struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops;
1351 struct device *dev = smmu->dev; 1227 struct device *dev = smmu->dev;
1352 void __iomem *cb_base; 1228 void __iomem *reg;
1353 u32 tmp; 1229 u32 tmp;
1354 u64 phys; 1230 u64 phys;
1355 unsigned long va, flags; 1231 unsigned long va, flags;
1356 int ret; 1232 int ret, idx = cfg->cbndx;
1357 1233
1358 ret = arm_smmu_rpm_get(smmu); 1234 ret = arm_smmu_rpm_get(smmu);
1359 if (ret < 0) 1235 if (ret < 0)
1360 return 0; 1236 return 0;
1361 1237
1362 cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
1363
1364 spin_lock_irqsave(&smmu_domain->cb_lock, flags); 1238 spin_lock_irqsave(&smmu_domain->cb_lock, flags);
1365 /* ATS1 registers can only be written atomically */
1366 va = iova & ~0xfffUL; 1239 va = iova & ~0xfffUL;
1367 if (smmu->version == ARM_SMMU_V2) 1240 if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
1368 smmu_write_atomic_lq(va, cb_base + ARM_SMMU_CB_ATS1PR); 1241 arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
1369 else /* Register is only 32-bit in v1 */ 1242 else
1370 writel_relaxed(va, cb_base + ARM_SMMU_CB_ATS1PR); 1243 arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
1371 1244
1372 if (readl_poll_timeout_atomic(cb_base + ARM_SMMU_CB_ATSR, tmp, 1245 reg = arm_smmu_page(smmu, ARM_SMMU_CB(smmu, idx)) + ARM_SMMU_CB_ATSR;
1373 !(tmp & ATSR_ACTIVE), 5, 50)) { 1246 if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ATSR_ACTIVE), 5, 50)) {
1374 spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); 1247 spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
1375 dev_err(dev, 1248 dev_err(dev,
1376 "iova to phys timed out on %pad. Falling back to software table walk.\n", 1249 "iova to phys timed out on %pad. Falling back to software table walk.\n",
@@ -1378,7 +1251,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
1378 return ops->iova_to_phys(ops, iova); 1251 return ops->iova_to_phys(ops, iova);
1379 } 1252 }
1380 1253
1381 phys = readq_relaxed(cb_base + ARM_SMMU_CB_PAR); 1254 phys = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_PAR);
1382 spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); 1255 spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
1383 if (phys & CB_PAR_F) { 1256 if (phys & CB_PAR_F) {
1384 dev_err(dev, "translation fault!\n"); 1257 dev_err(dev, "translation fault!\n");
@@ -1466,8 +1339,8 @@ static int arm_smmu_add_device(struct device *dev)
1466 1339
1467 ret = -EINVAL; 1340 ret = -EINVAL;
1468 for (i = 0; i < fwspec->num_ids; i++) { 1341 for (i = 0; i < fwspec->num_ids; i++) {
1469 u16 sid = fwspec->ids[i]; 1342 u16 sid = FIELD_GET(SMR_ID, fwspec->ids[i]);
1470 u16 mask = fwspec->ids[i] >> SMR_MASK_SHIFT; 1343 u16 mask = FIELD_GET(SMR_MASK, fwspec->ids[i]);
1471 1344
1472 if (sid & ~smmu->streamid_mask) { 1345 if (sid & ~smmu->streamid_mask) {
1473 dev_err(dev, "stream ID 0x%x out of range for SMMU (0x%x)\n", 1346 dev_err(dev, "stream ID 0x%x out of range for SMMU (0x%x)\n",
@@ -1648,12 +1521,12 @@ static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
1648 u32 mask, fwid = 0; 1521 u32 mask, fwid = 0;
1649 1522
1650 if (args->args_count > 0) 1523 if (args->args_count > 0)
1651 fwid |= (u16)args->args[0]; 1524 fwid |= FIELD_PREP(SMR_ID, args->args[0]);
1652 1525
1653 if (args->args_count > 1) 1526 if (args->args_count > 1)
1654 fwid |= (u16)args->args[1] << SMR_MASK_SHIFT; 1527 fwid |= FIELD_PREP(SMR_MASK, args->args[1]);
1655 else if (!of_property_read_u32(args->np, "stream-match-mask", &mask)) 1528 else if (!of_property_read_u32(args->np, "stream-match-mask", &mask))
1656 fwid |= (u16)mask << SMR_MASK_SHIFT; 1529 fwid |= FIELD_PREP(SMR_MASK, mask);
1657 1530
1658 return iommu_fwspec_add_ids(dev, &fwid, 1); 1531 return iommu_fwspec_add_ids(dev, &fwid, 1);
1659} 1532}
@@ -1706,13 +1579,12 @@ static struct iommu_ops arm_smmu_ops = {
1706 1579
1707static void arm_smmu_device_reset(struct arm_smmu_device *smmu) 1580static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
1708{ 1581{
1709 void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
1710 int i; 1582 int i;
1711 u32 reg, major; 1583 u32 reg;
1712 1584
1713 /* clear global FSR */ 1585 /* clear global FSR */
1714 reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR); 1586 reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSR);
1715 writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR); 1587 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sGFSR, reg);
1716 1588
1717 /* 1589 /*
1718 * Reset stream mapping groups: Initial values mark all SMRn as 1590 * Reset stream mapping groups: Initial values mark all SMRn as
@@ -1721,47 +1593,17 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
1721 for (i = 0; i < smmu->num_mapping_groups; ++i) 1593 for (i = 0; i < smmu->num_mapping_groups; ++i)
1722 arm_smmu_write_sme(smmu, i); 1594 arm_smmu_write_sme(smmu, i);
1723 1595
1724 if (smmu->model == ARM_MMU500) {
1725 /*
1726 * Before clearing ARM_MMU500_ACTLR_CPRE, need to
1727 * clear CACHE_LOCK bit of ACR first. And, CACHE_LOCK
1728 * bit is only present in MMU-500r2 onwards.
1729 */
1730 reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID7);
1731 major = (reg >> ID7_MAJOR_SHIFT) & ID7_MAJOR_MASK;
1732 reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sACR);
1733 if (major >= 2)
1734 reg &= ~ARM_MMU500_ACR_CACHE_LOCK;
1735 /*
1736 * Allow unmatched Stream IDs to allocate bypass
1737 * TLB entries for reduced latency.
1738 */
1739 reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
1740 writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_sACR);
1741 }
1742
1743 /* Make sure all context banks are disabled and clear CB_FSR */ 1596 /* Make sure all context banks are disabled and clear CB_FSR */
1744 for (i = 0; i < smmu->num_context_banks; ++i) { 1597 for (i = 0; i < smmu->num_context_banks; ++i) {
1745 void __iomem *cb_base = ARM_SMMU_CB(smmu, i);
1746
1747 arm_smmu_write_context_bank(smmu, i); 1598 arm_smmu_write_context_bank(smmu, i);
1748 writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR); 1599 arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, FSR_FAULT);
1749 /*
1750 * Disable MMU-500's not-particularly-beneficial next-page
1751 * prefetcher for the sake of errata #841119 and #826419.
1752 */
1753 if (smmu->model == ARM_MMU500) {
1754 reg = readl_relaxed(cb_base + ARM_SMMU_CB_ACTLR);
1755 reg &= ~ARM_MMU500_ACTLR_CPRE;
1756 writel_relaxed(reg, cb_base + ARM_SMMU_CB_ACTLR);
1757 }
1758 } 1600 }
1759 1601
1760 /* Invalidate the TLB, just in case */ 1602 /* Invalidate the TLB, just in case */
1761 writel_relaxed(QCOM_DUMMY_VAL, gr0_base + ARM_SMMU_GR0_TLBIALLH); 1603 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIALLH, QCOM_DUMMY_VAL);
1762 writel_relaxed(QCOM_DUMMY_VAL, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH); 1604 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIALLNSNH, QCOM_DUMMY_VAL);
1763 1605
1764 reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0); 1606 reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sCR0);
1765 1607
1766 /* Enable fault reporting */ 1608 /* Enable fault reporting */
1767 reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE); 1609 reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE);
@@ -1780,7 +1622,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
1780 reg &= ~sCR0_FB; 1622 reg &= ~sCR0_FB;
1781 1623
1782 /* Don't upgrade barriers */ 1624 /* Don't upgrade barriers */
1783 reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT); 1625 reg &= ~(sCR0_BSU);
1784 1626
1785 if (smmu->features & ARM_SMMU_FEAT_VMID16) 1627 if (smmu->features & ARM_SMMU_FEAT_VMID16)
1786 reg |= sCR0_VMID16EN; 1628 reg |= sCR0_VMID16EN;
@@ -1788,9 +1630,12 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
1788 if (smmu->features & ARM_SMMU_FEAT_EXIDS) 1630 if (smmu->features & ARM_SMMU_FEAT_EXIDS)
1789 reg |= sCR0_EXIDENABLE; 1631 reg |= sCR0_EXIDENABLE;
1790 1632
1633 if (smmu->impl && smmu->impl->reset)
1634 smmu->impl->reset(smmu);
1635
1791 /* Push the button */ 1636 /* Push the button */
1792 arm_smmu_tlb_sync_global(smmu); 1637 arm_smmu_tlb_sync_global(smmu);
1793 writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0); 1638 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sCR0, reg);
1794} 1639}
1795 1640
1796static int arm_smmu_id_size_to_bits(int size) 1641static int arm_smmu_id_size_to_bits(int size)
@@ -1814,8 +1659,7 @@ static int arm_smmu_id_size_to_bits(int size)
1814 1659
1815static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu) 1660static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
1816{ 1661{
1817 unsigned long size; 1662 unsigned int size;
1818 void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
1819 u32 id; 1663 u32 id;
1820 bool cttw_reg, cttw_fw = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK; 1664 bool cttw_reg, cttw_fw = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK;
1821 int i; 1665 int i;
@@ -1825,7 +1669,7 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
1825 smmu->version == ARM_SMMU_V2 ? 2 : 1); 1669 smmu->version == ARM_SMMU_V2 ? 2 : 1);
1826 1670
1827 /* ID0 */ 1671 /* ID0 */
1828 id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID0); 1672 id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID0);
1829 1673
1830 /* Restrict available stages based on module parameter */ 1674 /* Restrict available stages based on module parameter */
1831 if (force_stage == 1) 1675 if (force_stage == 1)
@@ -1879,12 +1723,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
1879 smmu->features |= ARM_SMMU_FEAT_EXIDS; 1723 smmu->features |= ARM_SMMU_FEAT_EXIDS;
1880 size = 1 << 16; 1724 size = 1 << 16;
1881 } else { 1725 } else {
1882 size = 1 << ((id >> ID0_NUMSIDB_SHIFT) & ID0_NUMSIDB_MASK); 1726 size = 1 << FIELD_GET(ID0_NUMSIDB, id);
1883 } 1727 }
1884 smmu->streamid_mask = size - 1; 1728 smmu->streamid_mask = size - 1;
1885 if (id & ID0_SMS) { 1729 if (id & ID0_SMS) {
1886 smmu->features |= ARM_SMMU_FEAT_STREAM_MATCH; 1730 smmu->features |= ARM_SMMU_FEAT_STREAM_MATCH;
1887 size = (id >> ID0_NUMSMRG_SHIFT) & ID0_NUMSMRG_MASK; 1731 size = FIELD_GET(ID0_NUMSMRG, id);
1888 if (size == 0) { 1732 if (size == 0) {
1889 dev_err(smmu->dev, 1733 dev_err(smmu->dev,
1890 "stream-matching supported, but no SMRs present!\n"); 1734 "stream-matching supported, but no SMRs present!\n");
@@ -1898,7 +1742,7 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
1898 return -ENOMEM; 1742 return -ENOMEM;
1899 1743
1900 dev_notice(smmu->dev, 1744 dev_notice(smmu->dev,
1901 "\tstream matching with %lu register groups", size); 1745 "\tstream matching with %u register groups", size);
1902 } 1746 }
1903 /* s2cr->type == 0 means translation, so initialise explicitly */ 1747 /* s2cr->type == 0 means translation, so initialise explicitly */
1904 smmu->s2crs = devm_kmalloc_array(smmu->dev, size, sizeof(*smmu->s2crs), 1748 smmu->s2crs = devm_kmalloc_array(smmu->dev, size, sizeof(*smmu->s2crs),
@@ -1919,49 +1763,38 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
1919 } 1763 }
1920 1764
1921 /* ID1 */ 1765 /* ID1 */
1922 id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID1); 1766 id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID1);
1923 smmu->pgshift = (id & ID1_PAGESIZE) ? 16 : 12; 1767 smmu->pgshift = (id & ID1_PAGESIZE) ? 16 : 12;
1924 1768
1925 /* Check for size mismatch of SMMU address space from mapped region */ 1769 /* Check for size mismatch of SMMU address space from mapped region */
1926 size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1); 1770 size = 1 << (FIELD_GET(ID1_NUMPAGENDXB, id) + 1);
1927 size <<= smmu->pgshift; 1771 if (smmu->numpage != 2 * size << smmu->pgshift)
1928 if (smmu->cb_base != gr0_base + size)
1929 dev_warn(smmu->dev, 1772 dev_warn(smmu->dev,
1930 "SMMU address space size (0x%lx) differs from mapped region size (0x%tx)!\n", 1773 "SMMU address space size (0x%x) differs from mapped region size (0x%x)!\n",
1931 size * 2, (smmu->cb_base - gr0_base) * 2); 1774 2 * size << smmu->pgshift, smmu->numpage);
1775 /* Now properly encode NUMPAGE to subsequently derive SMMU_CB_BASE */
1776 smmu->numpage = size;
1932 1777
1933 smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK; 1778 smmu->num_s2_context_banks = FIELD_GET(ID1_NUMS2CB, id);
1934 smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK; 1779 smmu->num_context_banks = FIELD_GET(ID1_NUMCB, id);
1935 if (smmu->num_s2_context_banks > smmu->num_context_banks) { 1780 if (smmu->num_s2_context_banks > smmu->num_context_banks) {
1936 dev_err(smmu->dev, "impossible number of S2 context banks!\n"); 1781 dev_err(smmu->dev, "impossible number of S2 context banks!\n");
1937 return -ENODEV; 1782 return -ENODEV;
1938 } 1783 }
1939 dev_notice(smmu->dev, "\t%u context banks (%u stage-2 only)\n", 1784 dev_notice(smmu->dev, "\t%u context banks (%u stage-2 only)\n",
1940 smmu->num_context_banks, smmu->num_s2_context_banks); 1785 smmu->num_context_banks, smmu->num_s2_context_banks);
1941 /*
1942 * Cavium CN88xx erratum #27704.
1943 * Ensure ASID and VMID allocation is unique across all SMMUs in
1944 * the system.
1945 */
1946 if (smmu->model == CAVIUM_SMMUV2) {
1947 smmu->cavium_id_base =
1948 atomic_add_return(smmu->num_context_banks,
1949 &cavium_smmu_context_count);
1950 smmu->cavium_id_base -= smmu->num_context_banks;
1951 dev_notice(smmu->dev, "\tenabling workaround for Cavium erratum 27704\n");
1952 }
1953 smmu->cbs = devm_kcalloc(smmu->dev, smmu->num_context_banks, 1786 smmu->cbs = devm_kcalloc(smmu->dev, smmu->num_context_banks,
1954 sizeof(*smmu->cbs), GFP_KERNEL); 1787 sizeof(*smmu->cbs), GFP_KERNEL);
1955 if (!smmu->cbs) 1788 if (!smmu->cbs)
1956 return -ENOMEM; 1789 return -ENOMEM;
1957 1790
1958 /* ID2 */ 1791 /* ID2 */
1959 id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID2); 1792 id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID2);
1960 size = arm_smmu_id_size_to_bits((id >> ID2_IAS_SHIFT) & ID2_IAS_MASK); 1793 size = arm_smmu_id_size_to_bits(FIELD_GET(ID2_IAS, id));
1961 smmu->ipa_size = size; 1794 smmu->ipa_size = size;
1962 1795
1963 /* The output mask is also applied for bypass */ 1796 /* The output mask is also applied for bypass */
1964 size = arm_smmu_id_size_to_bits((id >> ID2_OAS_SHIFT) & ID2_OAS_MASK); 1797 size = arm_smmu_id_size_to_bits(FIELD_GET(ID2_OAS, id));
1965 smmu->pa_size = size; 1798 smmu->pa_size = size;
1966 1799
1967 if (id & ID2_VMID16) 1800 if (id & ID2_VMID16)
@@ -1981,7 +1814,7 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
1981 if (smmu->version == ARM_SMMU_V1_64K) 1814 if (smmu->version == ARM_SMMU_V1_64K)
1982 smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K; 1815 smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K;
1983 } else { 1816 } else {
1984 size = (id >> ID2_UBS_SHIFT) & ID2_UBS_MASK; 1817 size = FIELD_GET(ID2_UBS, id);
1985 smmu->va_size = arm_smmu_id_size_to_bits(size); 1818 smmu->va_size = arm_smmu_id_size_to_bits(size);
1986 if (id & ID2_PTFS_4K) 1819 if (id & ID2_PTFS_4K)
1987 smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_4K; 1820 smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_4K;
@@ -2018,6 +1851,9 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
2018 dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n", 1851 dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n",
2019 smmu->ipa_size, smmu->pa_size); 1852 smmu->ipa_size, smmu->pa_size);
2020 1853
1854 if (smmu->impl && smmu->impl->cfg_probe)
1855 return smmu->impl->cfg_probe(smmu);
1856
2021 return 0; 1857 return 0;
2022} 1858}
2023 1859
@@ -2130,8 +1966,6 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev,
2130 smmu->version = data->version; 1966 smmu->version = data->version;
2131 smmu->model = data->model; 1967 smmu->model = data->model;
2132 1968
2133 parse_driver_options(smmu);
2134
2135 legacy_binding = of_find_property(dev->of_node, "mmu-masters", NULL); 1969 legacy_binding = of_find_property(dev->of_node, "mmu-masters", NULL);
2136 if (legacy_binding && !using_generic_binding) { 1970 if (legacy_binding && !using_generic_binding) {
2137 if (!using_legacy_binding) 1971 if (!using_legacy_binding)
@@ -2194,12 +2028,20 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
2194 if (err) 2028 if (err)
2195 return err; 2029 return err;
2196 2030
2031 smmu = arm_smmu_impl_init(smmu);
2032 if (IS_ERR(smmu))
2033 return PTR_ERR(smmu);
2034
2197 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 2035 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
2198 ioaddr = res->start; 2036 ioaddr = res->start;
2199 smmu->base = devm_ioremap_resource(dev, res); 2037 smmu->base = devm_ioremap_resource(dev, res);
2200 if (IS_ERR(smmu->base)) 2038 if (IS_ERR(smmu->base))
2201 return PTR_ERR(smmu->base); 2039 return PTR_ERR(smmu->base);
2202 smmu->cb_base = smmu->base + resource_size(res) / 2; 2040 /*
2041 * The resource size should effectively match the value of SMMU_TOP;
2042 * stash that temporarily until we know PAGESIZE to validate it with.
2043 */
2044 smmu->numpage = resource_size(res);
2203 2045
2204 num_irqs = 0; 2046 num_irqs = 0;
2205 while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) { 2047 while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
@@ -2339,7 +2181,7 @@ static void arm_smmu_device_shutdown(struct platform_device *pdev)
2339 2181
2340 arm_smmu_rpm_get(smmu); 2182 arm_smmu_rpm_get(smmu);
2341 /* Turn the thing off */ 2183 /* Turn the thing off */
2342 writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0); 2184 arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sCR0, sCR0_CLIENTPD);
2343 arm_smmu_rpm_put(smmu); 2185 arm_smmu_rpm_put(smmu);
2344 2186
2345 if (pm_runtime_enabled(smmu->dev)) 2187 if (pm_runtime_enabled(smmu->dev))
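
The arm-smmu.c hunks above all follow the same conversion: raw base-plus-offset MMIO on gr0_base/cb_base is replaced by the page/offset accessors declared in the new arm-smmu.h below, which gives implementation quirks a single place to intercept register accesses. A minimal before/after sketch (SOME_GR0_REG is a placeholder offset, not a real definition):

    /* before: compute the GR0 virtual address and poke the register directly */
    writel_relaxed(val, ARM_SMMU_GR0(smmu) + SOME_GR0_REG);

    /* after: name the register page and let the helper resolve the address,
     * or defer to smmu->impl->write_reg when an override is registered
     */
    arm_smmu_gr0_write(smmu, SOME_GR0_REG, val);
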
diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
new file mode 100644
index 000000000000..b19b6cae9b5e
--- /dev/null
+++ b/drivers/iommu/arm-smmu.h
@@ -0,0 +1,402 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * IOMMU API for ARM architected SMMU implementations.
4 *
5 * Copyright (C) 2013 ARM Limited
6 *
7 * Author: Will Deacon <will.deacon@arm.com>
8 */
9
10#ifndef _ARM_SMMU_H
11#define _ARM_SMMU_H
12
13#include <linux/atomic.h>
14#include <linux/bits.h>
15#include <linux/clk.h>
16#include <linux/device.h>
17#include <linux/io-64-nonatomic-hi-lo.h>
18#include <linux/io-pgtable.h>
19#include <linux/iommu.h>
20#include <linux/mutex.h>
21#include <linux/spinlock.h>
22#include <linux/types.h>
23
24/* Configuration registers */
25#define ARM_SMMU_GR0_sCR0 0x0
26#define sCR0_VMID16EN BIT(31)
27#define sCR0_BSU GENMASK(15, 14)
28#define sCR0_FB BIT(13)
29#define sCR0_PTM BIT(12)
30#define sCR0_VMIDPNE BIT(11)
31#define sCR0_USFCFG BIT(10)
32#define sCR0_GCFGFIE BIT(5)
33#define sCR0_GCFGFRE BIT(4)
34#define sCR0_EXIDENABLE BIT(3)
35#define sCR0_GFIE BIT(2)
36#define sCR0_GFRE BIT(1)
37#define sCR0_CLIENTPD BIT(0)
38
39/* Auxiliary Configuration register */
40#define ARM_SMMU_GR0_sACR 0x10
41
42/* Identification registers */
43#define ARM_SMMU_GR0_ID0 0x20
44#define ID0_S1TS BIT(30)
45#define ID0_S2TS BIT(29)
46#define ID0_NTS BIT(28)
47#define ID0_SMS BIT(27)
48#define ID0_ATOSNS BIT(26)
49#define ID0_PTFS_NO_AARCH32 BIT(25)
50#define ID0_PTFS_NO_AARCH32S BIT(24)
51#define ID0_NUMIRPT GENMASK(23, 16)
52#define ID0_CTTW BIT(14)
53#define ID0_NUMSIDB GENMASK(12, 9)
54#define ID0_EXIDS BIT(8)
55#define ID0_NUMSMRG GENMASK(7, 0)
56
57#define ARM_SMMU_GR0_ID1 0x24
58#define ID1_PAGESIZE BIT(31)
59#define ID1_NUMPAGENDXB GENMASK(30, 28)
60#define ID1_NUMS2CB GENMASK(23, 16)
61#define ID1_NUMCB GENMASK(7, 0)
62
63#define ARM_SMMU_GR0_ID2 0x28
64#define ID2_VMID16 BIT(15)
65#define ID2_PTFS_64K BIT(14)
66#define ID2_PTFS_16K BIT(13)
67#define ID2_PTFS_4K BIT(12)
68#define ID2_UBS GENMASK(11, 8)
69#define ID2_OAS GENMASK(7, 4)
70#define ID2_IAS GENMASK(3, 0)
71
72#define ARM_SMMU_GR0_ID3 0x2c
73#define ARM_SMMU_GR0_ID4 0x30
74#define ARM_SMMU_GR0_ID5 0x34
75#define ARM_SMMU_GR0_ID6 0x38
76
77#define ARM_SMMU_GR0_ID7 0x3c
78#define ID7_MAJOR GENMASK(7, 4)
79#define ID7_MINOR GENMASK(3, 0)
80
81#define ARM_SMMU_GR0_sGFSR 0x48
82#define ARM_SMMU_GR0_sGFSYNR0 0x50
83#define ARM_SMMU_GR0_sGFSYNR1 0x54
84#define ARM_SMMU_GR0_sGFSYNR2 0x58
85
86/* Global TLB invalidation */
87#define ARM_SMMU_GR0_TLBIVMID 0x64
88#define ARM_SMMU_GR0_TLBIALLNSNH 0x68
89#define ARM_SMMU_GR0_TLBIALLH 0x6c
90#define ARM_SMMU_GR0_sTLBGSYNC 0x70
91
92#define ARM_SMMU_GR0_sTLBGSTATUS 0x74
93#define sTLBGSTATUS_GSACTIVE BIT(0)
94
95/* Stream mapping registers */
96#define ARM_SMMU_GR0_SMR(n) (0x800 + ((n) << 2))
97#define SMR_VALID BIT(31)
98#define SMR_MASK GENMASK(31, 16)
99#define SMR_ID GENMASK(15, 0)
100
101#define ARM_SMMU_GR0_S2CR(n) (0xc00 + ((n) << 2))
102#define S2CR_PRIVCFG GENMASK(25, 24)
103enum arm_smmu_s2cr_privcfg {
104 S2CR_PRIVCFG_DEFAULT,
105 S2CR_PRIVCFG_DIPAN,
106 S2CR_PRIVCFG_UNPRIV,
107 S2CR_PRIVCFG_PRIV,
108};
109#define S2CR_TYPE GENMASK(17, 16)
110enum arm_smmu_s2cr_type {
111 S2CR_TYPE_TRANS,
112 S2CR_TYPE_BYPASS,
113 S2CR_TYPE_FAULT,
114};
115#define S2CR_EXIDVALID BIT(10)
116#define S2CR_CBNDX GENMASK(7, 0)
117
118/* Context bank attribute registers */
119#define ARM_SMMU_GR1_CBAR(n) (0x0 + ((n) << 2))
120#define CBAR_IRPTNDX GENMASK(31, 24)
121#define CBAR_TYPE GENMASK(17, 16)
122enum arm_smmu_cbar_type {
123 CBAR_TYPE_S2_TRANS,
124 CBAR_TYPE_S1_TRANS_S2_BYPASS,
125 CBAR_TYPE_S1_TRANS_S2_FAULT,
126 CBAR_TYPE_S1_TRANS_S2_TRANS,
127};
128#define CBAR_S1_MEMATTR GENMASK(15, 12)
129#define CBAR_S1_MEMATTR_WB 0xf
130#define CBAR_S1_BPSHCFG GENMASK(9, 8)
131#define CBAR_S1_BPSHCFG_NSH 3
132#define CBAR_VMID GENMASK(7, 0)
133
134#define ARM_SMMU_GR1_CBFRSYNRA(n) (0x400 + ((n) << 2))
135
136#define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2))
137#define CBA2R_VMID16 GENMASK(31, 16)
138#define CBA2R_VA64 BIT(0)
139
140#define ARM_SMMU_CB_SCTLR 0x0
141#define SCTLR_S1_ASIDPNE BIT(12)
142#define SCTLR_CFCFG BIT(7)
143#define SCTLR_CFIE BIT(6)
144#define SCTLR_CFRE BIT(5)
145#define SCTLR_E BIT(4)
146#define SCTLR_AFE BIT(2)
147#define SCTLR_TRE BIT(1)
148#define SCTLR_M BIT(0)
149
150#define ARM_SMMU_CB_ACTLR 0x4
151
152#define ARM_SMMU_CB_RESUME 0x8
153#define RESUME_TERMINATE BIT(0)
154
155#define ARM_SMMU_CB_TCR2 0x10
156#define TCR2_SEP GENMASK(17, 15)
157#define TCR2_SEP_UPSTREAM 0x7
158#define TCR2_AS BIT(4)
159
160#define ARM_SMMU_CB_TTBR0 0x20
161#define ARM_SMMU_CB_TTBR1 0x28
162#define TTBRn_ASID GENMASK_ULL(63, 48)
163
164#define ARM_SMMU_CB_TCR 0x30
165#define ARM_SMMU_CB_CONTEXTIDR 0x34
166#define ARM_SMMU_CB_S1_MAIR0 0x38
167#define ARM_SMMU_CB_S1_MAIR1 0x3c
168
169#define ARM_SMMU_CB_PAR 0x50
170#define CB_PAR_F BIT(0)
171
172#define ARM_SMMU_CB_FSR 0x58
173#define FSR_MULTI BIT(31)
174#define FSR_SS BIT(30)
175#define FSR_UUT BIT(8)
176#define FSR_ASF BIT(7)
177#define FSR_TLBLKF BIT(6)
178#define FSR_TLBMCF BIT(5)
179#define FSR_EF BIT(4)
180#define FSR_PF BIT(3)
181#define FSR_AFF BIT(2)
182#define FSR_TF BIT(1)
183
184#define FSR_IGN (FSR_AFF | FSR_ASF | \
185 FSR_TLBMCF | FSR_TLBLKF)
186#define FSR_FAULT (FSR_MULTI | FSR_SS | FSR_UUT | \
187 FSR_EF | FSR_PF | FSR_TF | FSR_IGN)
188
189#define ARM_SMMU_CB_FAR 0x60
190
191#define ARM_SMMU_CB_FSYNR0 0x68
192#define FSYNR0_WNR BIT(4)
193
194#define ARM_SMMU_CB_S1_TLBIVA 0x600
195#define ARM_SMMU_CB_S1_TLBIASID 0x610
196#define ARM_SMMU_CB_S1_TLBIVAL 0x620
197#define ARM_SMMU_CB_S2_TLBIIPAS2 0x630
198#define ARM_SMMU_CB_S2_TLBIIPAS2L 0x638
199#define ARM_SMMU_CB_TLBSYNC 0x7f0
200#define ARM_SMMU_CB_TLBSTATUS 0x7f4
201#define ARM_SMMU_CB_ATS1PR 0x800
202
203#define ARM_SMMU_CB_ATSR 0x8f0
204#define ATSR_ACTIVE BIT(0)
205
206
207/* Maximum number of context banks per SMMU */
208#define ARM_SMMU_MAX_CBS 128
209
210
211/* Shared driver definitions */
212enum arm_smmu_arch_version {
213 ARM_SMMU_V1,
214 ARM_SMMU_V1_64K,
215 ARM_SMMU_V2,
216};
217
218enum arm_smmu_implementation {
219 GENERIC_SMMU,
220 ARM_MMU500,
221 CAVIUM_SMMUV2,
222 QCOM_SMMUV2,
223};
224
225struct arm_smmu_device {
226 struct device *dev;
227
228 void __iomem *base;
229 unsigned int numpage;
230 unsigned int pgshift;
231
232#define ARM_SMMU_FEAT_COHERENT_WALK (1 << 0)
233#define ARM_SMMU_FEAT_STREAM_MATCH (1 << 1)
234#define ARM_SMMU_FEAT_TRANS_S1 (1 << 2)
235#define ARM_SMMU_FEAT_TRANS_S2 (1 << 3)
236#define ARM_SMMU_FEAT_TRANS_NESTED (1 << 4)
237#define ARM_SMMU_FEAT_TRANS_OPS (1 << 5)
238#define ARM_SMMU_FEAT_VMID16 (1 << 6)
239#define ARM_SMMU_FEAT_FMT_AARCH64_4K (1 << 7)
240#define ARM_SMMU_FEAT_FMT_AARCH64_16K (1 << 8)
241#define ARM_SMMU_FEAT_FMT_AARCH64_64K (1 << 9)
242#define ARM_SMMU_FEAT_FMT_AARCH32_L (1 << 10)
243#define ARM_SMMU_FEAT_FMT_AARCH32_S (1 << 11)
244#define ARM_SMMU_FEAT_EXIDS (1 << 12)
245 u32 features;
246
247 enum arm_smmu_arch_version version;
248 enum arm_smmu_implementation model;
249 const struct arm_smmu_impl *impl;
250
251 u32 num_context_banks;
252 u32 num_s2_context_banks;
253 DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
254 struct arm_smmu_cb *cbs;
255 atomic_t irptndx;
256
257 u32 num_mapping_groups;
258 u16 streamid_mask;
259 u16 smr_mask_mask;
260 struct arm_smmu_smr *smrs;
261 struct arm_smmu_s2cr *s2crs;
262 struct mutex stream_map_mutex;
263
264 unsigned long va_size;
265 unsigned long ipa_size;
266 unsigned long pa_size;
267 unsigned long pgsize_bitmap;
268
269 u32 num_global_irqs;
270 u32 num_context_irqs;
271 unsigned int *irqs;
272 struct clk_bulk_data *clks;
273 int num_clks;
274
275 spinlock_t global_sync_lock;
276
277 /* IOMMU core code handle */
278 struct iommu_device iommu;
279};
280
281enum arm_smmu_context_fmt {
282 ARM_SMMU_CTX_FMT_NONE,
283 ARM_SMMU_CTX_FMT_AARCH64,
284 ARM_SMMU_CTX_FMT_AARCH32_L,
285 ARM_SMMU_CTX_FMT_AARCH32_S,
286};
287
288struct arm_smmu_cfg {
289 u8 cbndx;
290 u8 irptndx;
291 union {
292 u16 asid;
293 u16 vmid;
294 };
295 enum arm_smmu_cbar_type cbar;
296 enum arm_smmu_context_fmt fmt;
297};
298#define INVALID_IRPTNDX 0xff
299
300enum arm_smmu_domain_stage {
301 ARM_SMMU_DOMAIN_S1 = 0,
302 ARM_SMMU_DOMAIN_S2,
303 ARM_SMMU_DOMAIN_NESTED,
304 ARM_SMMU_DOMAIN_BYPASS,
305};
306
307struct arm_smmu_flush_ops {
308 struct iommu_flush_ops tlb;
309 void (*tlb_inv_range)(unsigned long iova, size_t size, size_t granule,
310 bool leaf, void *cookie);
311 void (*tlb_sync)(void *cookie);
312};
313
314struct arm_smmu_domain {
315 struct arm_smmu_device *smmu;
316 struct io_pgtable_ops *pgtbl_ops;
317 const struct arm_smmu_flush_ops *flush_ops;
318 struct arm_smmu_cfg cfg;
319 enum arm_smmu_domain_stage stage;
320 bool non_strict;
321 struct mutex init_mutex; /* Protects smmu pointer */
322 spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */
323 struct iommu_domain domain;
324};
325
326
327/* Implementation details, yay! */
328struct arm_smmu_impl {
329 u32 (*read_reg)(struct arm_smmu_device *smmu, int page, int offset);
330 void (*write_reg)(struct arm_smmu_device *smmu, int page, int offset,
331 u32 val);
332 u64 (*read_reg64)(struct arm_smmu_device *smmu, int page, int offset);
333 void (*write_reg64)(struct arm_smmu_device *smmu, int page, int offset,
334 u64 val);
335 int (*cfg_probe)(struct arm_smmu_device *smmu);
336 int (*reset)(struct arm_smmu_device *smmu);
337 int (*init_context)(struct arm_smmu_domain *smmu_domain);
338};
339
340static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
341{
342 return smmu->base + (n << smmu->pgshift);
343}
344
345static inline u32 arm_smmu_readl(struct arm_smmu_device *smmu, int page, int offset)
346{
347 if (smmu->impl && unlikely(smmu->impl->read_reg))
348 return smmu->impl->read_reg(smmu, page, offset);
349 return readl_relaxed(arm_smmu_page(smmu, page) + offset);
350}
351
352static inline void arm_smmu_writel(struct arm_smmu_device *smmu, int page,
353 int offset, u32 val)
354{
355 if (smmu->impl && unlikely(smmu->impl->write_reg))
356 smmu->impl->write_reg(smmu, page, offset, val);
357 else
358 writel_relaxed(val, arm_smmu_page(smmu, page) + offset);
359}
360
361static inline u64 arm_smmu_readq(struct arm_smmu_device *smmu, int page, int offset)
362{
363 if (smmu->impl && unlikely(smmu->impl->read_reg64))
364 return smmu->impl->read_reg64(smmu, page, offset);
365 return readq_relaxed(arm_smmu_page(smmu, page) + offset);
366}
367
368static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page,
369 int offset, u64 val)
370{
371 if (smmu->impl && unlikely(smmu->impl->write_reg64))
372 smmu->impl->write_reg64(smmu, page, offset, val);
373 else
374 writeq_relaxed(val, arm_smmu_page(smmu, page) + offset);
375}
376
377#define ARM_SMMU_GR0 0
378#define ARM_SMMU_GR1 1
379#define ARM_SMMU_CB(s, n) ((s)->numpage + (n))
380
381#define arm_smmu_gr0_read(s, o) \
382 arm_smmu_readl((s), ARM_SMMU_GR0, (o))
383#define arm_smmu_gr0_write(s, o, v) \
384 arm_smmu_writel((s), ARM_SMMU_GR0, (o), (v))
385
386#define arm_smmu_gr1_read(s, o) \
387 arm_smmu_readl((s), ARM_SMMU_GR1, (o))
388#define arm_smmu_gr1_write(s, o, v) \
389 arm_smmu_writel((s), ARM_SMMU_GR1, (o), (v))
390
391#define arm_smmu_cb_read(s, n, o) \
392 arm_smmu_readl((s), ARM_SMMU_CB((s), (n)), (o))
393#define arm_smmu_cb_write(s, n, o, v) \
394 arm_smmu_writel((s), ARM_SMMU_CB((s), (n)), (o), (v))
395#define arm_smmu_cb_readq(s, n, o) \
396 arm_smmu_readq((s), ARM_SMMU_CB((s), (n)), (o))
397#define arm_smmu_cb_writeq(s, n, o, v) \
398 arm_smmu_writeq((s), ARM_SMMU_CB((s), (n)), (o), (v))
399
400struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
401
402#endif /* _ARM_SMMU_H */
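
The struct arm_smmu_impl hooks above are the seam for SoC-specific behaviour: each MMIO helper checks for a registered override before falling back to plain readl/writel, and the reset/cfg_probe/init_context hooks are invoked from the points visible in the arm-smmu.c hunks. A hypothetical implementation could look like the sketch below; the foo_* names and the register fix-ups are purely illustrative and not taken from arm-smmu-impl.c (which is not shown here).

    /* Sketch only: what an implementation actually fixes up is its own business */
    static u32 foo_read_reg(struct arm_smmu_device *smmu, int page, int offset)
    {
            /* e.g. a part whose registers need a dummy read before the real one */
            readl_relaxed(arm_smmu_page(smmu, page) + offset);
            return readl_relaxed(arm_smmu_page(smmu, page) + offset);
    }

    static int foo_reset(struct arm_smmu_device *smmu)
    {
            int i;

            /* re-apply a per-context-bank ACTLR setting after the generic reset */
            for (i = 0; i < smmu->num_context_banks; i++)
                    arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, 0);
            return 0;
    }

    static const struct arm_smmu_impl foo_impl = {
            .read_reg = foo_read_reg,
            .reset    = foo_reset,
    };

arm_smmu_impl_init() is then presumably where the matching impl gets attached (based on smmu->model or firmware data), which is why the probe hunk calls it before devm_ioremap_resource() and the first register access.
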
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d991d40f797f..315e0087c19f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -444,13 +444,18 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
444 struct iommu_dma_cookie *cookie = domain->iova_cookie; 444 struct iommu_dma_cookie *cookie = domain->iova_cookie;
445 struct iova_domain *iovad = &cookie->iovad; 445 struct iova_domain *iovad = &cookie->iovad;
446 size_t iova_off = iova_offset(iovad, dma_addr); 446 size_t iova_off = iova_offset(iovad, dma_addr);
447 struct iommu_iotlb_gather iotlb_gather;
448 size_t unmapped;
447 449
448 dma_addr -= iova_off; 450 dma_addr -= iova_off;
449 size = iova_align(iovad, size + iova_off); 451 size = iova_align(iovad, size + iova_off);
452 iommu_iotlb_gather_init(&iotlb_gather);
453
454 unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather);
455 WARN_ON(unmapped != size);
450 456
451 WARN_ON(iommu_unmap_fast(domain, dma_addr, size) != size);
452 if (!cookie->fq_domain) 457 if (!cookie->fq_domain)
453 iommu_tlb_sync(domain); 458 iommu_tlb_sync(domain, &iotlb_gather);
454 iommu_dma_free_iova(cookie, dma_addr, size); 459 iommu_dma_free_iova(cookie, dma_addr, size);
455} 460}
456 461
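
The rewritten __iommu_dma_unmap() above is the canonical caller-side sequence for the new API: initialise an on-stack gather, unmap through iommu_unmap_fast(), then issue one iommu_tlb_sync() unless invalidation is deferred to a flush queue. A condensed sketch of the same flow (foo_dma_unmap() and the deferred_flush flag are assumptions standing in for the cookie->fq_domain check):

    static void foo_dma_unmap(struct iommu_domain *domain, unsigned long iova,
                              size_t size, bool deferred_flush)
    {
            struct iommu_iotlb_gather gather;

            iommu_iotlb_gather_init(&gather);
            WARN_ON(iommu_unmap_fast(domain, iova, size, &gather) != size);

            /* skip the sync when a flush queue will invalidate lazily */
            if (!deferred_flush)
                    iommu_tlb_sync(domain, &gather);
    }
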
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index b0c1e5f9daae..cf5af34cb681 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -1130,7 +1130,8 @@ static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain
1130} 1130}
1131 1131
1132static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain, 1132static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain,
1133 unsigned long l_iova, size_t size) 1133 unsigned long l_iova, size_t size,
1134 struct iommu_iotlb_gather *gather)
1134{ 1135{
1135 struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); 1136 struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
1136 sysmmu_iova_t iova = (sysmmu_iova_t)l_iova; 1137 sysmmu_iova_t iova = (sysmmu_iova_t)l_iova;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 12d094d08c0a..b7454ca4a87c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5153,7 +5153,8 @@ static int intel_iommu_map(struct iommu_domain *domain,
5153} 5153}
5154 5154
5155static size_t intel_iommu_unmap(struct iommu_domain *domain, 5155static size_t intel_iommu_unmap(struct iommu_domain *domain,
5156 unsigned long iova, size_t size) 5156 unsigned long iova, size_t size,
5157 struct iommu_iotlb_gather *gather)
5157{ 5158{
5158 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5159 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5159 struct page *freelist = NULL; 5160 struct page *freelist = NULL;
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index 0fc8dfab2abf..18e7d212c7de 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -362,7 +362,8 @@ static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl)
362 return false; 362 return false;
363} 363}
364 364
365static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, unsigned long, 365static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *,
366 struct iommu_iotlb_gather *, unsigned long,
366 size_t, int, arm_v7s_iopte *); 367 size_t, int, arm_v7s_iopte *);
367 368
368static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, 369static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
@@ -383,7 +384,7 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
383 size_t sz = ARM_V7S_BLOCK_SIZE(lvl); 384 size_t sz = ARM_V7S_BLOCK_SIZE(lvl);
384 385
385 tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl); 386 tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl);
386 if (WARN_ON(__arm_v7s_unmap(data, iova + i * sz, 387 if (WARN_ON(__arm_v7s_unmap(data, NULL, iova + i * sz,
387 sz, lvl, tblp) != sz)) 388 sz, lvl, tblp) != sz))
388 return -EINVAL; 389 return -EINVAL;
389 } else if (ptep[i]) { 390 } else if (ptep[i]) {
@@ -493,9 +494,8 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
493 * a chance for anything to kick off a table walk for the new iova. 494 * a chance for anything to kick off a table walk for the new iova.
494 */ 495 */
495 if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) { 496 if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
496 io_pgtable_tlb_add_flush(iop, iova, size, 497 io_pgtable_tlb_flush_walk(iop, iova, size,
497 ARM_V7S_BLOCK_SIZE(2), false); 498 ARM_V7S_BLOCK_SIZE(2));
498 io_pgtable_tlb_sync(iop);
499 } else { 499 } else {
500 wmb(); 500 wmb();
501 } 501 }
@@ -541,12 +541,12 @@ static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data,
541 __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); 541 __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg);
542 542
543 size *= ARM_V7S_CONT_PAGES; 543 size *= ARM_V7S_CONT_PAGES;
544 io_pgtable_tlb_add_flush(iop, iova, size, size, true); 544 io_pgtable_tlb_flush_leaf(iop, iova, size, size);
545 io_pgtable_tlb_sync(iop);
546 return pte; 545 return pte;
547} 546}
548 547
549static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, 548static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data,
549 struct iommu_iotlb_gather *gather,
550 unsigned long iova, size_t size, 550 unsigned long iova, size_t size,
551 arm_v7s_iopte blk_pte, 551 arm_v7s_iopte blk_pte,
552 arm_v7s_iopte *ptep) 552 arm_v7s_iopte *ptep)
@@ -583,15 +583,15 @@ static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data,
583 return 0; 583 return 0;
584 584
585 tablep = iopte_deref(pte, 1); 585 tablep = iopte_deref(pte, 1);
586 return __arm_v7s_unmap(data, iova, size, 2, tablep); 586 return __arm_v7s_unmap(data, gather, iova, size, 2, tablep);
587 } 587 }
588 588
589 io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); 589 io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
590 io_pgtable_tlb_sync(&data->iop);
591 return size; 590 return size;
592} 591}
593 592
594static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, 593static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
594 struct iommu_iotlb_gather *gather,
595 unsigned long iova, size_t size, int lvl, 595 unsigned long iova, size_t size, int lvl,
596 arm_v7s_iopte *ptep) 596 arm_v7s_iopte *ptep)
597{ 597{
@@ -638,9 +638,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
638 for (i = 0; i < num_entries; i++) { 638 for (i = 0; i < num_entries; i++) {
639 if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) { 639 if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) {
640 /* Also flush any partial walks */ 640 /* Also flush any partial walks */
641 io_pgtable_tlb_add_flush(iop, iova, blk_size, 641 io_pgtable_tlb_flush_walk(iop, iova, blk_size,
642 ARM_V7S_BLOCK_SIZE(lvl + 1), false); 642 ARM_V7S_BLOCK_SIZE(lvl + 1));
643 io_pgtable_tlb_sync(iop);
644 ptep = iopte_deref(pte[i], lvl); 643 ptep = iopte_deref(pte[i], lvl);
645 __arm_v7s_free_table(ptep, lvl + 1, data); 644 __arm_v7s_free_table(ptep, lvl + 1, data);
646 } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { 645 } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
@@ -651,8 +650,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
651 */ 650 */
652 smp_wmb(); 651 smp_wmb();
653 } else { 652 } else {
654 io_pgtable_tlb_add_flush(iop, iova, blk_size, 653 io_pgtable_tlb_add_page(iop, gather, iova, blk_size);
655 blk_size, true);
656 } 654 }
657 iova += blk_size; 655 iova += blk_size;
658 } 656 }
@@ -662,23 +660,24 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
662 * Insert a table at the next level to map the old region, 660 * Insert a table at the next level to map the old region,
663 * minus the part we want to unmap 661 * minus the part we want to unmap
664 */ 662 */
665 return arm_v7s_split_blk_unmap(data, iova, size, pte[0], ptep); 663 return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0],
664 ptep);
666 } 665 }
667 666
668 /* Keep on walkin' */ 667 /* Keep on walkin' */
669 ptep = iopte_deref(pte[0], lvl); 668 ptep = iopte_deref(pte[0], lvl);
670 return __arm_v7s_unmap(data, iova, size, lvl + 1, ptep); 669 return __arm_v7s_unmap(data, gather, iova, size, lvl + 1, ptep);
671} 670}
672 671
673static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, 672static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
674 size_t size) 673 size_t size, struct iommu_iotlb_gather *gather)
675{ 674{
676 struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); 675 struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
677 676
678 if (WARN_ON(upper_32_bits(iova))) 677 if (WARN_ON(upper_32_bits(iova)))
679 return 0; 678 return 0;
680 679
681 return __arm_v7s_unmap(data, iova, size, 1, data->pgd); 680 return __arm_v7s_unmap(data, gather, iova, size, 1, data->pgd);
682} 681}
683 682
684static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops, 683static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
@@ -806,22 +805,24 @@ static void dummy_tlb_flush_all(void *cookie)
806 WARN_ON(cookie != cfg_cookie); 805 WARN_ON(cookie != cfg_cookie);
807} 806}
808 807
809static void dummy_tlb_add_flush(unsigned long iova, size_t size, 808static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule,
810 size_t granule, bool leaf, void *cookie) 809 void *cookie)
811{ 810{
812 WARN_ON(cookie != cfg_cookie); 811 WARN_ON(cookie != cfg_cookie);
813 WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); 812 WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
814} 813}
815 814
816static void dummy_tlb_sync(void *cookie) 815static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
816 unsigned long iova, size_t granule, void *cookie)
817{ 817{
818 WARN_ON(cookie != cfg_cookie); 818 dummy_tlb_flush(iova, granule, granule, cookie);
819} 819}
820 820
821static const struct iommu_gather_ops dummy_tlb_ops = { 821static const struct iommu_flush_ops dummy_tlb_ops = {
822 .tlb_flush_all = dummy_tlb_flush_all, 822 .tlb_flush_all = dummy_tlb_flush_all,
823 .tlb_add_flush = dummy_tlb_add_flush, 823 .tlb_flush_walk = dummy_tlb_flush,
824 .tlb_sync = dummy_tlb_sync, 824 .tlb_flush_leaf = dummy_tlb_flush,
825 .tlb_add_page = dummy_tlb_add_page,
825}; 826};
826 827
827#define __FAIL(ops) ({ \ 828#define __FAIL(ops) ({ \
@@ -896,7 +897,7 @@ static int __init arm_v7s_do_selftests(void)
896 size = 1UL << __ffs(cfg.pgsize_bitmap); 897 size = 1UL << __ffs(cfg.pgsize_bitmap);
897 while (i < loopnr) { 898 while (i < loopnr) {
898 iova_start = i * SZ_16M; 899 iova_start = i * SZ_16M;
899 if (ops->unmap(ops, iova_start + size, size) != size) 900 if (ops->unmap(ops, iova_start + size, size, NULL) != size)
900 return __FAIL(ops); 901 return __FAIL(ops);
901 902
902 /* Remap of partial unmap */ 903 /* Remap of partial unmap */
@@ -914,7 +915,7 @@ static int __init arm_v7s_do_selftests(void)
914 for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) { 915 for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) {
915 size = 1UL << i; 916 size = 1UL << i;
916 917
917 if (ops->unmap(ops, iova, size) != size) 918 if (ops->unmap(ops, iova, size, NULL) != size)
918 return __FAIL(ops); 919 return __FAIL(ops);
919 920
920 if (ops->iova_to_phys(ops, iova + 42)) 921 if (ops->iova_to_phys(ops, iova + 42))
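
The dummy selftest ops above show the shape of the new callback set: tlb_flush_walk and tlb_flush_leaf are synchronous range invalidations (issued when freeing table walks and when changing leaf entries respectively), while tlb_add_page only queues a single granule against the gather passed down through unmap() and relies on a later sync. A bare driver-side skeleton, with foo_* as placeholder names and comments standing in for the real register writes:

    static void foo_tlb_flush_all(void *cookie)
    {
            /* invalidate every TLB entry for this domain */
    }

    static void foo_tlb_flush_walk(unsigned long iova, size_t size,
                                   size_t granule, void *cookie)
    {
            /* synchronous invalidation covering a freed table walk */
    }

    static void foo_tlb_flush_leaf(unsigned long iova, size_t size,
                                   size_t granule, void *cookie)
    {
            /* synchronous invalidation of leaf entries only */
    }

    static void foo_tlb_add_page(struct iommu_iotlb_gather *gather,
                                 unsigned long iova, size_t granule, void *cookie)
    {
            /* queue one granule; the core syncs later via ->iotlb_sync */
    }

    static const struct iommu_flush_ops foo_flush_ops = {
            .tlb_flush_all  = foo_tlb_flush_all,
            .tlb_flush_walk = foo_tlb_flush_walk,
            .tlb_flush_leaf = foo_tlb_flush_leaf,
            .tlb_add_page   = foo_tlb_add_page,
    };
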
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 161a7d56264d..4c91359057c5 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -12,7 +12,6 @@
12#include <linux/atomic.h> 12#include <linux/atomic.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/io-pgtable.h> 14#include <linux/io-pgtable.h>
15#include <linux/iommu.h>
16#include <linux/kernel.h> 15#include <linux/kernel.h>
17#include <linux/sizes.h> 16#include <linux/sizes.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
@@ -290,6 +289,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
290} 289}
291 290
292static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, 291static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
292 struct iommu_iotlb_gather *gather,
293 unsigned long iova, size_t size, int lvl, 293 unsigned long iova, size_t size, int lvl,
294 arm_lpae_iopte *ptep); 294 arm_lpae_iopte *ptep);
295 295
@@ -335,8 +335,10 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
335 size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); 335 size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
336 336
337 tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data); 337 tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
338 if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz)) 338 if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) {
339 WARN_ON(1);
339 return -EINVAL; 340 return -EINVAL;
341 }
340 } 342 }
341 343
342 __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); 344 __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
@@ -537,6 +539,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
537} 539}
538 540
539static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, 541static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
542 struct iommu_iotlb_gather *gather,
540 unsigned long iova, size_t size, 543 unsigned long iova, size_t size,
541 arm_lpae_iopte blk_pte, int lvl, 544 arm_lpae_iopte blk_pte, int lvl,
542 arm_lpae_iopte *ptep) 545 arm_lpae_iopte *ptep)
@@ -582,15 +585,15 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
582 585
583 tablep = iopte_deref(pte, data); 586 tablep = iopte_deref(pte, data);
584 } else if (unmap_idx >= 0) { 587 } else if (unmap_idx >= 0) {
585 io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); 588 io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
586 io_pgtable_tlb_sync(&data->iop);
587 return size; 589 return size;
588 } 590 }
589 591
590 return __arm_lpae_unmap(data, iova, size, lvl, tablep); 592 return __arm_lpae_unmap(data, gather, iova, size, lvl, tablep);
591} 593}
592 594
593static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, 595static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
596 struct iommu_iotlb_gather *gather,
594 unsigned long iova, size_t size, int lvl, 597 unsigned long iova, size_t size, int lvl,
595 arm_lpae_iopte *ptep) 598 arm_lpae_iopte *ptep)
596{ 599{
@@ -612,9 +615,8 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
612 615
613 if (!iopte_leaf(pte, lvl, iop->fmt)) { 616 if (!iopte_leaf(pte, lvl, iop->fmt)) {
614 /* Also flush any partial walks */ 617 /* Also flush any partial walks */
615 io_pgtable_tlb_add_flush(iop, iova, size, 618 io_pgtable_tlb_flush_walk(iop, iova, size,
616 ARM_LPAE_GRANULE(data), false); 619 ARM_LPAE_GRANULE(data));
617 io_pgtable_tlb_sync(iop);
618 ptep = iopte_deref(pte, data); 620 ptep = iopte_deref(pte, data);
619 __arm_lpae_free_pgtable(data, lvl + 1, ptep); 621 __arm_lpae_free_pgtable(data, lvl + 1, ptep);
620 } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { 622 } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
@@ -625,7 +627,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
625 */ 627 */
626 smp_wmb(); 628 smp_wmb();
627 } else { 629 } else {
628 io_pgtable_tlb_add_flush(iop, iova, size, size, true); 630 io_pgtable_tlb_add_page(iop, gather, iova, size);
629 } 631 }
630 632
631 return size; 633 return size;
@@ -634,17 +636,17 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
634 * Insert a table at the next level to map the old region, 636 * Insert a table at the next level to map the old region,
635 * minus the part we want to unmap 637 * minus the part we want to unmap
636 */ 638 */
637 return arm_lpae_split_blk_unmap(data, iova, size, pte, 639 return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
638 lvl + 1, ptep); 640 lvl + 1, ptep);
639 } 641 }
640 642
641 /* Keep on walkin' */ 643 /* Keep on walkin' */
642 ptep = iopte_deref(pte, data); 644 ptep = iopte_deref(pte, data);
643 return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep); 645 return __arm_lpae_unmap(data, gather, iova, size, lvl + 1, ptep);
644} 646}
645 647
646static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, 648static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
647 size_t size) 649 size_t size, struct iommu_iotlb_gather *gather)
648{ 650{
649 struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); 651 struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
650 arm_lpae_iopte *ptep = data->pgd; 652 arm_lpae_iopte *ptep = data->pgd;
@@ -653,7 +655,7 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
653 if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) 655 if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
654 return 0; 656 return 0;
655 657
656 return __arm_lpae_unmap(data, iova, size, lvl, ptep); 658 return __arm_lpae_unmap(data, gather, iova, size, lvl, ptep);
657} 659}
658 660
659static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, 661static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
@@ -1070,22 +1072,24 @@ static void dummy_tlb_flush_all(void *cookie)
1070 WARN_ON(cookie != cfg_cookie); 1072 WARN_ON(cookie != cfg_cookie);
1071} 1073}
1072 1074
1073static void dummy_tlb_add_flush(unsigned long iova, size_t size, 1075static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule,
1074 size_t granule, bool leaf, void *cookie) 1076 void *cookie)
1075{ 1077{
1076 WARN_ON(cookie != cfg_cookie); 1078 WARN_ON(cookie != cfg_cookie);
1077 WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); 1079 WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
1078} 1080}
1079 1081
1080static void dummy_tlb_sync(void *cookie) 1082static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
1083 unsigned long iova, size_t granule, void *cookie)
1081{ 1084{
1082 WARN_ON(cookie != cfg_cookie); 1085 dummy_tlb_flush(iova, granule, granule, cookie);
1083} 1086}
1084 1087
1085static const struct iommu_gather_ops dummy_tlb_ops __initconst = { 1088static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
1086 .tlb_flush_all = dummy_tlb_flush_all, 1089 .tlb_flush_all = dummy_tlb_flush_all,
1087 .tlb_add_flush = dummy_tlb_add_flush, 1090 .tlb_flush_walk = dummy_tlb_flush,
1088 .tlb_sync = dummy_tlb_sync, 1091 .tlb_flush_leaf = dummy_tlb_flush,
1092 .tlb_add_page = dummy_tlb_add_page,
1089}; 1093};
1090 1094
1091static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) 1095static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
@@ -1168,7 +1172,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
1168 1172
1169 /* Partial unmap */ 1173 /* Partial unmap */
1170 size = 1UL << __ffs(cfg->pgsize_bitmap); 1174 size = 1UL << __ffs(cfg->pgsize_bitmap);
1171 if (ops->unmap(ops, SZ_1G + size, size) != size) 1175 if (ops->unmap(ops, SZ_1G + size, size, NULL) != size)
1172 return __FAIL(ops, i); 1176 return __FAIL(ops, i);
1173 1177
1174 /* Remap of partial unmap */ 1178 /* Remap of partial unmap */
@@ -1183,7 +1187,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
1183 for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { 1187 for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
1184 size = 1UL << j; 1188 size = 1UL << j;
1185 1189
1186 if (ops->unmap(ops, iova, size) != size) 1190 if (ops->unmap(ops, iova, size, NULL) != size)
1187 return __FAIL(ops, i); 1191 return __FAIL(ops, i);
1188 1192
1189 if (ops->iova_to_phys(ops, iova + 42)) 1193 if (ops->iova_to_phys(ops, iova + 42))
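
As the selftest hunks show, the gather pointer is simply threaded from the caller of ops->unmap() down to whichever tlb_add_page calls result, and passing NULL is acceptable when no deferred sync is wanted. On the driver side the hand-off is a one-liner, mirroring the arm_smmu_unmap() change earlier in this diff (to_foo_domain() is an assumed container_of-style helper):

    static size_t foo_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
                                  size_t size, struct iommu_iotlb_gather *gather)
    {
            struct io_pgtable_ops *ops = to_foo_domain(domain)->pgtbl_ops;

            if (!ops)
                    return 0;

            /* io-pgtable records the affected pages in *gather via tlb_add_page */
            return ops->unmap(ops, iova, size, gather);
    }
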
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0c674d80c37f..70bfbcc09248 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1862,7 +1862,7 @@ EXPORT_SYMBOL_GPL(iommu_map);
1862 1862
1863static size_t __iommu_unmap(struct iommu_domain *domain, 1863static size_t __iommu_unmap(struct iommu_domain *domain,
1864 unsigned long iova, size_t size, 1864 unsigned long iova, size_t size,
1865 bool sync) 1865 struct iommu_iotlb_gather *iotlb_gather)
1866{ 1866{
1867 const struct iommu_ops *ops = domain->ops; 1867 const struct iommu_ops *ops = domain->ops;
1868 size_t unmapped_page, unmapped = 0; 1868 size_t unmapped_page, unmapped = 0;
@@ -1899,13 +1899,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
1899 while (unmapped < size) { 1899 while (unmapped < size) {
1900 size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); 1900 size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);
1901 1901
1902 unmapped_page = ops->unmap(domain, iova, pgsize); 1902 unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather);
1903 if (!unmapped_page) 1903 if (!unmapped_page)
1904 break; 1904 break;
1905 1905
1906 if (sync && ops->iotlb_range_add)
1907 ops->iotlb_range_add(domain, iova, pgsize);
1908
1909 pr_debug("unmapped: iova 0x%lx size 0x%zx\n", 1906 pr_debug("unmapped: iova 0x%lx size 0x%zx\n",
1910 iova, unmapped_page); 1907 iova, unmapped_page);
1911 1908
@@ -1913,9 +1910,6 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
1913 unmapped += unmapped_page; 1910 unmapped += unmapped_page;
1914 } 1911 }
1915 1912
1916 if (sync && ops->iotlb_sync)
1917 ops->iotlb_sync(domain);
1918
1919 trace_unmap(orig_iova, size, unmapped); 1913 trace_unmap(orig_iova, size, unmapped);
1920 return unmapped; 1914 return unmapped;
1921} 1915}
@@ -1923,14 +1917,22 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
1923size_t iommu_unmap(struct iommu_domain *domain, 1917size_t iommu_unmap(struct iommu_domain *domain,
1924 unsigned long iova, size_t size) 1918 unsigned long iova, size_t size)
1925{ 1919{
1926 return __iommu_unmap(domain, iova, size, true); 1920 struct iommu_iotlb_gather iotlb_gather;
1921 size_t ret;
1922
1923 iommu_iotlb_gather_init(&iotlb_gather);
1924 ret = __iommu_unmap(domain, iova, size, &iotlb_gather);
1925 iommu_tlb_sync(domain, &iotlb_gather);
1926
1927 return ret;
1927} 1928}
1928EXPORT_SYMBOL_GPL(iommu_unmap); 1929EXPORT_SYMBOL_GPL(iommu_unmap);
1929 1930
1930size_t iommu_unmap_fast(struct iommu_domain *domain, 1931size_t iommu_unmap_fast(struct iommu_domain *domain,
1931 unsigned long iova, size_t size) 1932 unsigned long iova, size_t size,
1933 struct iommu_iotlb_gather *iotlb_gather)
1932{ 1934{
1933 return __iommu_unmap(domain, iova, size, false); 1935 return __iommu_unmap(domain, iova, size, iotlb_gather);
1934} 1936}
1935EXPORT_SYMBOL_GPL(iommu_unmap_fast); 1937EXPORT_SYMBOL_GPL(iommu_unmap_fast);
1936 1938
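
The split for callers after this change: iommu_unmap() still syncs internally before returning, while iommu_unmap_fast() only accumulates into the caller's gather, so code that tears down many ranges can pay for a single sync per batch. A hedged sketch of that batching (struct foo_range and the array walk are assumptions for illustration):

    struct foo_range {
            unsigned long iova;
            size_t size;
    };

    static size_t foo_unmap_ranges(struct iommu_domain *domain,
                                   const struct foo_range *ranges, int count)
    {
            struct iommu_iotlb_gather gather;
            size_t unmapped = 0;
            int i;

            iommu_iotlb_gather_init(&gather);
            for (i = 0; i < count; i++)
                    unmapped += iommu_unmap_fast(domain, ranges[i].iova,
                                                 ranges[i].size, &gather);

            /* one sync covers everything gathered above */
            iommu_tlb_sync(domain, &gather);
            return unmapped;
    }
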
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ad0098c0c87c..76a8ec343d53 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -361,16 +361,16 @@ static void ipmmu_tlb_flush_all(void *cookie)
361 ipmmu_tlb_invalidate(domain); 361 ipmmu_tlb_invalidate(domain);
362} 362}
363 363
364static void ipmmu_tlb_add_flush(unsigned long iova, size_t size, 364static void ipmmu_tlb_flush(unsigned long iova, size_t size,
365 size_t granule, bool leaf, void *cookie) 365 size_t granule, void *cookie)
366{ 366{
367 /* The hardware doesn't support selective TLB flush. */ 367 ipmmu_tlb_flush_all(cookie);
368} 368}
369 369
370static const struct iommu_gather_ops ipmmu_gather_ops = { 370static const struct iommu_flush_ops ipmmu_flush_ops = {
371 .tlb_flush_all = ipmmu_tlb_flush_all, 371 .tlb_flush_all = ipmmu_tlb_flush_all,
372 .tlb_add_flush = ipmmu_tlb_add_flush, 372 .tlb_flush_walk = ipmmu_tlb_flush,
373 .tlb_sync = ipmmu_tlb_flush_all, 373 .tlb_flush_leaf = ipmmu_tlb_flush,
374}; 374};
375 375
376/* ----------------------------------------------------------------------------- 376/* -----------------------------------------------------------------------------
@@ -480,7 +480,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
480 domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K; 480 domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K;
481 domain->cfg.ias = 32; 481 domain->cfg.ias = 32;
482 domain->cfg.oas = 40; 482 domain->cfg.oas = 40;
483 domain->cfg.tlb = &ipmmu_gather_ops; 483 domain->cfg.tlb = &ipmmu_flush_ops;
484 domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32); 484 domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
485 domain->io_domain.geometry.force_aperture = true; 485 domain->io_domain.geometry.force_aperture = true;
486 /* 486 /*
@@ -733,14 +733,14 @@ static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova,
733} 733}
734 734
735static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova, 735static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova,
736 size_t size) 736 size_t size, struct iommu_iotlb_gather *gather)
737{ 737{
738 struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); 738 struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
739 739
740 return domain->iop->unmap(domain->iop, iova, size); 740 return domain->iop->unmap(domain->iop, iova, size, gather);
741} 741}
742 742
743static void ipmmu_iotlb_sync(struct iommu_domain *io_domain) 743static void ipmmu_flush_iotlb_all(struct iommu_domain *io_domain)
744{ 744{
745 struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); 745 struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
746 746
@@ -748,6 +748,12 @@ static void ipmmu_iotlb_sync(struct iommu_domain *io_domain)
748 ipmmu_tlb_flush_all(domain); 748 ipmmu_tlb_flush_all(domain);
749} 749}
750 750
751static void ipmmu_iotlb_sync(struct iommu_domain *io_domain,
752 struct iommu_iotlb_gather *gather)
753{
754 ipmmu_flush_iotlb_all(io_domain);
755}
756
751static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain, 757static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
752 dma_addr_t iova) 758 dma_addr_t iova)
753{ 759{
@@ -957,7 +963,7 @@ static const struct iommu_ops ipmmu_ops = {
957 .detach_dev = ipmmu_detach_device, 963 .detach_dev = ipmmu_detach_device,
958 .map = ipmmu_map, 964 .map = ipmmu_map,
959 .unmap = ipmmu_unmap, 965 .unmap = ipmmu_unmap,
960 .flush_iotlb_all = ipmmu_iotlb_sync, 966 .flush_iotlb_all = ipmmu_flush_iotlb_all,
961 .iotlb_sync = ipmmu_iotlb_sync, 967 .iotlb_sync = ipmmu_iotlb_sync,
962 .iova_to_phys = ipmmu_iova_to_phys, 968 .iova_to_phys = ipmmu_iova_to_phys,
963 .add_device = ipmmu_add_device, 969 .add_device = ipmmu_add_device,
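
On the iommu_ops side, ->iotlb_sync now receives the gather while ->flush_iotlb_all keeps its old signature, and hardware without selective invalidation can route both through the same full flush, exactly as ipmmu does above. A minimal sketch with foo_* placeholders:

    static void foo_flush_iotlb_all(struct iommu_domain *io_domain)
    {
            /* unconditional "invalidate everything" for this domain */
    }

    static void foo_iotlb_sync(struct iommu_domain *io_domain,
                               struct iommu_iotlb_gather *gather)
    {
            /*
             * The gather describes the accumulated range, but with no
             * selective invalidation available the full flush is the
             * only option.
             */
            foo_flush_iotlb_all(io_domain);
    }
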
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index b25e2eb9e038..4c0be5b75c28 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -168,20 +168,29 @@ fail:
168 return; 168 return;
169} 169}
170 170
171static void __flush_iotlb_sync(void *cookie) 171static void __flush_iotlb_walk(unsigned long iova, size_t size,
172 size_t granule, void *cookie)
172{ 173{
173 /* 174 __flush_iotlb_range(iova, size, granule, false, cookie);
174 * Nothing is needed here, the barrier to guarantee 175}
175 * completion of the tlb sync operation is implicitly 176
176 * taken care when the iommu client does a writel before 177static void __flush_iotlb_leaf(unsigned long iova, size_t size,
177 * kick starting the other master. 178 size_t granule, void *cookie)
178 */ 179{
180 __flush_iotlb_range(iova, size, granule, true, cookie);
179} 181}
180 182
181static const struct iommu_gather_ops msm_iommu_gather_ops = { 183static void __flush_iotlb_page(struct iommu_iotlb_gather *gather,
184 unsigned long iova, size_t granule, void *cookie)
185{
186 __flush_iotlb_range(iova, granule, granule, true, cookie);
187}
188
189static const struct iommu_flush_ops msm_iommu_flush_ops = {
182 .tlb_flush_all = __flush_iotlb, 190 .tlb_flush_all = __flush_iotlb,
183 .tlb_add_flush = __flush_iotlb_range, 191 .tlb_flush_walk = __flush_iotlb_walk,
184 .tlb_sync = __flush_iotlb_sync, 192 .tlb_flush_leaf = __flush_iotlb_leaf,
193 .tlb_add_page = __flush_iotlb_page,
185}; 194};
186 195
187static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end) 196static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end)
@@ -345,7 +354,7 @@ static int msm_iommu_domain_config(struct msm_priv *priv)
345 .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap, 354 .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap,
346 .ias = 32, 355 .ias = 32,
347 .oas = 32, 356 .oas = 32,
348 .tlb = &msm_iommu_gather_ops, 357 .tlb = &msm_iommu_flush_ops,
349 .iommu_dev = priv->dev, 358 .iommu_dev = priv->dev,
350 }; 359 };
351 360
@@ -509,13 +518,13 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova,
509} 518}
510 519
511static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, 520static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
512 size_t len) 521 size_t len, struct iommu_iotlb_gather *gather)
513{ 522{
514 struct msm_priv *priv = to_msm_priv(domain); 523 struct msm_priv *priv = to_msm_priv(domain);
515 unsigned long flags; 524 unsigned long flags;
516 525
517 spin_lock_irqsave(&priv->pgtlock, flags); 526 spin_lock_irqsave(&priv->pgtlock, flags);
518 len = priv->iop->unmap(priv->iop, iova, len); 527 len = priv->iop->unmap(priv->iop, iova, len, gather);
519 spin_unlock_irqrestore(&priv->pgtlock, flags); 528 spin_unlock_irqrestore(&priv->pgtlock, flags);
520 529
521 return len; 530 return len;
@@ -691,6 +700,13 @@ static struct iommu_ops msm_iommu_ops = {
691 .detach_dev = msm_iommu_detach_dev, 700 .detach_dev = msm_iommu_detach_dev,
692 .map = msm_iommu_map, 701 .map = msm_iommu_map,
693 .unmap = msm_iommu_unmap, 702 .unmap = msm_iommu_unmap,
703 /*
704 * Nothing is needed here, the barrier to guarantee
705 * completion of the tlb sync operation is implicitly
706 * taken care when the iommu client does a writel before
707 * kick starting the other master.
708 */
709 .iotlb_sync = NULL,
694 .iova_to_phys = msm_iommu_iova_to_phys, 710 .iova_to_phys = msm_iommu_iova_to_phys,
695 .add_device = msm_iommu_add_device, 711 .add_device = msm_iommu_add_device,
696 .remove_device = msm_iommu_remove_device, 712 .remove_device = msm_iommu_remove_device,
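
The msm_iommu.c conversion above (and the mtk_iommu.c and qcom_iommu.c ones below) all follow the same shape: the old tlb_add_flush + tlb_sync pair becomes tlb_flush_walk/tlb_flush_leaf wrappers that invalidate and then synchronise immediately, while the new tlb_add_page hook only queues the per-page invalidation and leaves the sync to ->iotlb_sync(). A minimal sketch of that pattern for a hypothetical driver; the foo_* names stand in for a driver's existing primitives and are not part of this patch:

#include <linux/io-pgtable.h>

/* Hypothetical driver primitives (cf. __flush_iotlb_range() in msm_iommu.c):
 * queue a range invalidation, and wait for outstanding invalidations. */
void foo_tlb_flush_all(void *cookie);
void foo_inv_range_nosync(unsigned long iova, size_t size, size_t granule,
                          bool leaf, void *cookie);
void foo_sync(void *cookie);

static void foo_tlb_flush_walk(unsigned long iova, size_t size,
                               size_t granule, void *cookie)
{
        foo_inv_range_nosync(iova, size, granule, false, cookie);
        foo_sync(cookie);
}

static void foo_tlb_flush_leaf(unsigned long iova, size_t size,
                               size_t granule, void *cookie)
{
        foo_inv_range_nosync(iova, size, granule, true, cookie);
        foo_sync(cookie);
}

static void foo_tlb_add_page(struct iommu_iotlb_gather *gather,
                             unsigned long iova, size_t granule, void *cookie)
{
        /* No sync here: the core defers it until ->iotlb_sync(). */
        foo_inv_range_nosync(iova, granule, granule, true, cookie);
}

static const struct iommu_flush_ops foo_flush_ops = {
        .tlb_flush_all  = foo_tlb_flush_all,
        .tlb_flush_walk = foo_tlb_flush_walk,
        .tlb_flush_leaf = foo_tlb_flush_leaf,
        .tlb_add_page   = foo_tlb_add_page,
};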
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 82e4be4dfdaf..0827d51936fa 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -188,10 +188,32 @@ static void mtk_iommu_tlb_sync(void *cookie)
188 } 188 }
189} 189}
190 190
191static const struct iommu_gather_ops mtk_iommu_gather_ops = { 191static void mtk_iommu_tlb_flush_walk(unsigned long iova, size_t size,
192 size_t granule, void *cookie)
193{
194 mtk_iommu_tlb_add_flush_nosync(iova, size, granule, false, cookie);
195 mtk_iommu_tlb_sync(cookie);
196}
197
198static void mtk_iommu_tlb_flush_leaf(unsigned long iova, size_t size,
199 size_t granule, void *cookie)
200{
201 mtk_iommu_tlb_add_flush_nosync(iova, size, granule, true, cookie);
202 mtk_iommu_tlb_sync(cookie);
203}
204
205static void mtk_iommu_tlb_flush_page_nosync(struct iommu_iotlb_gather *gather,
206 unsigned long iova, size_t granule,
207 void *cookie)
208{
209 mtk_iommu_tlb_add_flush_nosync(iova, granule, granule, true, cookie);
210}
211
212static const struct iommu_flush_ops mtk_iommu_flush_ops = {
192 .tlb_flush_all = mtk_iommu_tlb_flush_all, 213 .tlb_flush_all = mtk_iommu_tlb_flush_all,
193 .tlb_add_flush = mtk_iommu_tlb_add_flush_nosync, 214 .tlb_flush_walk = mtk_iommu_tlb_flush_walk,
194 .tlb_sync = mtk_iommu_tlb_sync, 215 .tlb_flush_leaf = mtk_iommu_tlb_flush_leaf,
216 .tlb_add_page = mtk_iommu_tlb_flush_page_nosync,
195}; 217};
196 218
197static irqreturn_t mtk_iommu_isr(int irq, void *dev_id) 219static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
@@ -267,7 +289,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom)
267 .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap, 289 .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap,
268 .ias = 32, 290 .ias = 32,
269 .oas = 32, 291 .oas = 32,
270 .tlb = &mtk_iommu_gather_ops, 292 .tlb = &mtk_iommu_flush_ops,
271 .iommu_dev = data->dev, 293 .iommu_dev = data->dev,
272 }; 294 };
273 295
@@ -371,20 +393,27 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
371} 393}
372 394
373static size_t mtk_iommu_unmap(struct iommu_domain *domain, 395static size_t mtk_iommu_unmap(struct iommu_domain *domain,
374 unsigned long iova, size_t size) 396 unsigned long iova, size_t size,
397 struct iommu_iotlb_gather *gather)
375{ 398{
376 struct mtk_iommu_domain *dom = to_mtk_domain(domain); 399 struct mtk_iommu_domain *dom = to_mtk_domain(domain);
377 unsigned long flags; 400 unsigned long flags;
378 size_t unmapsz; 401 size_t unmapsz;
379 402
380 spin_lock_irqsave(&dom->pgtlock, flags); 403 spin_lock_irqsave(&dom->pgtlock, flags);
381 unmapsz = dom->iop->unmap(dom->iop, iova, size); 404 unmapsz = dom->iop->unmap(dom->iop, iova, size, gather);
382 spin_unlock_irqrestore(&dom->pgtlock, flags); 405 spin_unlock_irqrestore(&dom->pgtlock, flags);
383 406
384 return unmapsz; 407 return unmapsz;
385} 408}
386 409
387static void mtk_iommu_iotlb_sync(struct iommu_domain *domain) 410static void mtk_iommu_flush_iotlb_all(struct iommu_domain *domain)
411{
412 mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data());
413}
414
415static void mtk_iommu_iotlb_sync(struct iommu_domain *domain,
416 struct iommu_iotlb_gather *gather)
388{ 417{
389 mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data()); 418 mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data());
390} 419}
@@ -490,7 +519,7 @@ static const struct iommu_ops mtk_iommu_ops = {
490 .detach_dev = mtk_iommu_detach_device, 519 .detach_dev = mtk_iommu_detach_device,
491 .map = mtk_iommu_map, 520 .map = mtk_iommu_map,
492 .unmap = mtk_iommu_unmap, 521 .unmap = mtk_iommu_unmap,
493 .flush_iotlb_all = mtk_iommu_iotlb_sync, 522 .flush_iotlb_all = mtk_iommu_flush_iotlb_all,
494 .iotlb_sync = mtk_iommu_iotlb_sync, 523 .iotlb_sync = mtk_iommu_iotlb_sync,
495 .iova_to_phys = mtk_iommu_iova_to_phys, 524 .iova_to_phys = mtk_iommu_iova_to_phys,
496 .add_device = mtk_iommu_add_device, 525 .add_device = mtk_iommu_add_device,
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index abeeac488372..7b92ddd5d9fd 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -324,7 +324,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
324} 324}
325 325
326static size_t mtk_iommu_unmap(struct iommu_domain *domain, 326static size_t mtk_iommu_unmap(struct iommu_domain *domain,
327 unsigned long iova, size_t size) 327 unsigned long iova, size_t size,
328 struct iommu_iotlb_gather *gather)
328{ 329{
329 struct mtk_iommu_domain *dom = to_mtk_domain(domain); 330 struct mtk_iommu_domain *dom = to_mtk_domain(domain);
330 unsigned long flags; 331 unsigned long flags;
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index dfb961d8c21b..8039bc5ee425 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1149,7 +1149,7 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
1149} 1149}
1150 1150
1151static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, 1151static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
1152 size_t size) 1152 size_t size, struct iommu_iotlb_gather *gather)
1153{ 1153{
1154 struct omap_iommu_domain *omap_domain = to_omap_domain(domain); 1154 struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
1155 struct device *dev = omap_domain->dev; 1155 struct device *dev = omap_domain->dev;
diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
index 34d0b9783b3e..fd33cf5981d7 100644
--- a/drivers/iommu/qcom_iommu.c
+++ b/drivers/iommu/qcom_iommu.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/atomic.h> 9#include <linux/atomic.h>
10#include <linux/bitfield.h>
10#include <linux/clk.h> 11#include <linux/clk.h>
11#include <linux/delay.h> 12#include <linux/delay.h>
12#include <linux/dma-iommu.h> 13#include <linux/dma-iommu.h>
@@ -32,7 +33,7 @@
32#include <linux/slab.h> 33#include <linux/slab.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34 35
35#include "arm-smmu-regs.h" 36#include "arm-smmu.h"
36 37
37#define SMMU_INTR_SEL_NS 0x2000 38#define SMMU_INTR_SEL_NS 0x2000
38 39
@@ -155,7 +156,7 @@ static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size,
155 struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]); 156 struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
156 size_t s = size; 157 size_t s = size;
157 158
158 iova &= ~12UL; 159 iova = (iova >> 12) << 12;
159 iova |= ctx->asid; 160 iova |= ctx->asid;
160 do { 161 do {
161 iommu_writel(ctx, reg, iova); 162 iommu_writel(ctx, reg, iova);
@@ -164,10 +165,32 @@ static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size,
164 } 165 }
165} 166}
166 167
167static const struct iommu_gather_ops qcom_gather_ops = { 168static void qcom_iommu_tlb_flush_walk(unsigned long iova, size_t size,
169 size_t granule, void *cookie)
170{
171 qcom_iommu_tlb_inv_range_nosync(iova, size, granule, false, cookie);
172 qcom_iommu_tlb_sync(cookie);
173}
174
175static void qcom_iommu_tlb_flush_leaf(unsigned long iova, size_t size,
176 size_t granule, void *cookie)
177{
178 qcom_iommu_tlb_inv_range_nosync(iova, size, granule, true, cookie);
179 qcom_iommu_tlb_sync(cookie);
180}
181
182static void qcom_iommu_tlb_add_page(struct iommu_iotlb_gather *gather,
183 unsigned long iova, size_t granule,
184 void *cookie)
185{
186 qcom_iommu_tlb_inv_range_nosync(iova, granule, granule, true, cookie);
187}
188
189static const struct iommu_flush_ops qcom_flush_ops = {
168 .tlb_flush_all = qcom_iommu_tlb_inv_context, 190 .tlb_flush_all = qcom_iommu_tlb_inv_context,
169 .tlb_add_flush = qcom_iommu_tlb_inv_range_nosync, 191 .tlb_flush_walk = qcom_iommu_tlb_flush_walk,
170 .tlb_sync = qcom_iommu_tlb_sync, 192 .tlb_flush_leaf = qcom_iommu_tlb_flush_leaf,
193 .tlb_add_page = qcom_iommu_tlb_add_page,
171}; 194};
172 195
173static irqreturn_t qcom_iommu_fault(int irq, void *dev) 196static irqreturn_t qcom_iommu_fault(int irq, void *dev)
@@ -215,7 +238,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
215 .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap, 238 .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap,
216 .ias = 32, 239 .ias = 32,
217 .oas = 40, 240 .oas = 40,
218 .tlb = &qcom_gather_ops, 241 .tlb = &qcom_flush_ops,
219 .iommu_dev = qcom_iommu->dev, 242 .iommu_dev = qcom_iommu->dev,
220 }; 243 };
221 244
@@ -247,16 +270,16 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
247 /* TTBRs */ 270 /* TTBRs */
248 iommu_writeq(ctx, ARM_SMMU_CB_TTBR0, 271 iommu_writeq(ctx, ARM_SMMU_CB_TTBR0,
249 pgtbl_cfg.arm_lpae_s1_cfg.ttbr[0] | 272 pgtbl_cfg.arm_lpae_s1_cfg.ttbr[0] |
250 ((u64)ctx->asid << TTBRn_ASID_SHIFT)); 273 FIELD_PREP(TTBRn_ASID, ctx->asid));
251 iommu_writeq(ctx, ARM_SMMU_CB_TTBR1, 274 iommu_writeq(ctx, ARM_SMMU_CB_TTBR1,
252 pgtbl_cfg.arm_lpae_s1_cfg.ttbr[1] | 275 pgtbl_cfg.arm_lpae_s1_cfg.ttbr[1] |
253 ((u64)ctx->asid << TTBRn_ASID_SHIFT)); 276 FIELD_PREP(TTBRn_ASID, ctx->asid));
254 277
255 /* TTBCR */ 278 /* TCR */
256 iommu_writel(ctx, ARM_SMMU_CB_TTBCR2, 279 iommu_writel(ctx, ARM_SMMU_CB_TCR2,
257 (pgtbl_cfg.arm_lpae_s1_cfg.tcr >> 32) | 280 (pgtbl_cfg.arm_lpae_s1_cfg.tcr >> 32) |
258 TTBCR2_SEP_UPSTREAM); 281 FIELD_PREP(TCR2_SEP, TCR2_SEP_UPSTREAM));
259 iommu_writel(ctx, ARM_SMMU_CB_TTBCR, 282 iommu_writel(ctx, ARM_SMMU_CB_TCR,
260 pgtbl_cfg.arm_lpae_s1_cfg.tcr); 283 pgtbl_cfg.arm_lpae_s1_cfg.tcr);
261 284
262 /* MAIRs (stage-1 only) */ 285 /* MAIRs (stage-1 only) */
@@ -417,7 +440,7 @@ static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova,
417} 440}
418 441
419static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, 442static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
420 size_t size) 443 size_t size, struct iommu_iotlb_gather *gather)
421{ 444{
422 size_t ret; 445 size_t ret;
423 unsigned long flags; 446 unsigned long flags;
@@ -434,14 +457,14 @@ static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
434 */ 457 */
435 pm_runtime_get_sync(qcom_domain->iommu->dev); 458 pm_runtime_get_sync(qcom_domain->iommu->dev);
436 spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags); 459 spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
437 ret = ops->unmap(ops, iova, size); 460 ret = ops->unmap(ops, iova, size, gather);
438 spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags); 461 spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
439 pm_runtime_put_sync(qcom_domain->iommu->dev); 462 pm_runtime_put_sync(qcom_domain->iommu->dev);
440 463
441 return ret; 464 return ret;
442} 465}
443 466
444static void qcom_iommu_iotlb_sync(struct iommu_domain *domain) 467static void qcom_iommu_flush_iotlb_all(struct iommu_domain *domain)
445{ 468{
446 struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); 469 struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
447 struct io_pgtable *pgtable = container_of(qcom_domain->pgtbl_ops, 470 struct io_pgtable *pgtable = container_of(qcom_domain->pgtbl_ops,
@@ -454,6 +477,12 @@ static void qcom_iommu_iotlb_sync(struct iommu_domain *domain)
454 pm_runtime_put_sync(qcom_domain->iommu->dev); 477 pm_runtime_put_sync(qcom_domain->iommu->dev);
455} 478}
456 479
480static void qcom_iommu_iotlb_sync(struct iommu_domain *domain,
481 struct iommu_iotlb_gather *gather)
482{
483 qcom_iommu_flush_iotlb_all(domain);
484}
485
457static phys_addr_t qcom_iommu_iova_to_phys(struct iommu_domain *domain, 486static phys_addr_t qcom_iommu_iova_to_phys(struct iommu_domain *domain,
458 dma_addr_t iova) 487 dma_addr_t iova)
459{ 488{
@@ -581,7 +610,7 @@ static const struct iommu_ops qcom_iommu_ops = {
581 .detach_dev = qcom_iommu_detach_dev, 610 .detach_dev = qcom_iommu_detach_dev,
582 .map = qcom_iommu_map, 611 .map = qcom_iommu_map,
583 .unmap = qcom_iommu_unmap, 612 .unmap = qcom_iommu_unmap,
584 .flush_iotlb_all = qcom_iommu_iotlb_sync, 613 .flush_iotlb_all = qcom_iommu_flush_iotlb_all,
585 .iotlb_sync = qcom_iommu_iotlb_sync, 614 .iotlb_sync = qcom_iommu_iotlb_sync,
586 .iova_to_phys = qcom_iommu_iova_to_phys, 615 .iova_to_phys = qcom_iommu_iova_to_phys,
587 .add_device = qcom_iommu_add_device, 616 .add_device = qcom_iommu_add_device,
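
Besides the TLB-op conversion, the qcom_iommu.c hunk above moves the context-bank programming from open-coded shifts (TTBRn_ASID_SHIFT, TTBCR2_SEP_UPSTREAM) to FIELD_PREP() against field masks now shared via arm-smmu.h, which is why <linux/bitfield.h> is newly included. A small sketch of why the two forms produce the same value; the 63:48 ASID position is assumed for illustration only, the authoritative definition lives in arm-smmu.h:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

/* Assumed field layout for illustration only: ASID in TTBRn[63:48]. */
#define EXAMPLE_TTBRn_ASID      GENMASK_ULL(63, 48)

static u64 example_ttbr(u64 ttbr, u16 asid)
{
        /*
         * FIELD_PREP(mask, val) shifts val up to the least-significant
         * set bit of mask, so with a [63:48] mask this yields the same
         * value as the old ((u64)asid << TTBRn_ASID_SHIFT) with SHIFT == 48.
         */
        return ttbr | FIELD_PREP(EXAMPLE_TTBRn_ASID, asid);
}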
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index dc26d74d79c2..26290f310f90 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -794,7 +794,7 @@ static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
794} 794}
795 795
796static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova, 796static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
797 size_t size) 797 size_t size, struct iommu_iotlb_gather *gather)
798{ 798{
799 struct rk_iommu_domain *rk_domain = to_rk_domain(domain); 799 struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
800 unsigned long flags; 800 unsigned long flags;
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 22d4db302c1c..3b0b18e23187 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -314,7 +314,8 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
314} 314}
315 315
316static size_t s390_iommu_unmap(struct iommu_domain *domain, 316static size_t s390_iommu_unmap(struct iommu_domain *domain,
317 unsigned long iova, size_t size) 317 unsigned long iova, size_t size,
318 struct iommu_iotlb_gather *gather)
318{ 319{
319 struct s390_domain *s390_domain = to_s390_domain(domain); 320 struct s390_domain *s390_domain = to_s390_domain(domain);
320 int flags = ZPCI_PTE_INVALID; 321 int flags = ZPCI_PTE_INVALID;
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 6d40bc1b38bf..3924f7c05544 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -207,7 +207,7 @@ static inline int __gart_iommu_unmap(struct gart_device *gart,
207} 207}
208 208
209static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, 209static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
210 size_t bytes) 210 size_t bytes, struct iommu_iotlb_gather *gather)
211{ 211{
212 struct gart_device *gart = gart_handle; 212 struct gart_device *gart = gart_handle;
213 int err; 213 int err;
@@ -273,11 +273,17 @@ static int gart_iommu_of_xlate(struct device *dev,
273 return 0; 273 return 0;
274} 274}
275 275
276static void gart_iommu_sync(struct iommu_domain *domain) 276static void gart_iommu_sync_map(struct iommu_domain *domain)
277{ 277{
278 FLUSH_GART_REGS(gart_handle); 278 FLUSH_GART_REGS(gart_handle);
279} 279}
280 280
281static void gart_iommu_sync(struct iommu_domain *domain,
282 struct iommu_iotlb_gather *gather)
283{
284 gart_iommu_sync_map(domain);
285}
286
281static const struct iommu_ops gart_iommu_ops = { 287static const struct iommu_ops gart_iommu_ops = {
282 .capable = gart_iommu_capable, 288 .capable = gart_iommu_capable,
283 .domain_alloc = gart_iommu_domain_alloc, 289 .domain_alloc = gart_iommu_domain_alloc,
@@ -292,7 +298,7 @@ static const struct iommu_ops gart_iommu_ops = {
292 .iova_to_phys = gart_iommu_iova_to_phys, 298 .iova_to_phys = gart_iommu_iova_to_phys,
293 .pgsize_bitmap = GART_IOMMU_PGSIZES, 299 .pgsize_bitmap = GART_IOMMU_PGSIZES,
294 .of_xlate = gart_iommu_of_xlate, 300 .of_xlate = gart_iommu_of_xlate,
295 .iotlb_sync_map = gart_iommu_sync, 301 .iotlb_sync_map = gart_iommu_sync_map,
296 .iotlb_sync = gart_iommu_sync, 302 .iotlb_sync = gart_iommu_sync,
297}; 303};
298 304
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index c4a652b227f8..7293fc3f796d 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -680,7 +680,7 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
680} 680}
681 681
682static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, 682static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
683 size_t size) 683 size_t size, struct iommu_iotlb_gather *gather)
684{ 684{
685 struct tegra_smmu_as *as = to_smmu_as(domain); 685 struct tegra_smmu_as *as = to_smmu_as(domain);
686 dma_addr_t pte_dma; 686 dma_addr_t pte_dma;
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 80a740df0737..3ea9d7682999 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -751,7 +751,7 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova,
751} 751}
752 752
753static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova, 753static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova,
754 size_t size) 754 size_t size, struct iommu_iotlb_gather *gather)
755{ 755{
756 int ret = 0; 756 int ret = 0;
757 size_t unmapped; 757 size_t unmapped;
@@ -797,7 +797,8 @@ static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
797 return paddr; 797 return paddr;
798} 798}
799 799
800static void viommu_iotlb_sync(struct iommu_domain *domain) 800static void viommu_iotlb_sync(struct iommu_domain *domain,
801 struct iommu_iotlb_gather *gather)
801{ 802{
802 struct viommu_domain *vdomain = to_viommu_domain(domain); 803 struct viommu_domain *vdomain = to_viommu_domain(domain);
803 804
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 054391f30fa8..ad830abe1021 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -650,12 +650,13 @@ unpin_exit:
650} 650}
651 651
652static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, 652static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
653 struct list_head *regions) 653 struct list_head *regions,
654 struct iommu_iotlb_gather *iotlb_gather)
654{ 655{
655 long unlocked = 0; 656 long unlocked = 0;
656 struct vfio_regions *entry, *next; 657 struct vfio_regions *entry, *next;
657 658
658 iommu_tlb_sync(domain->domain); 659 iommu_tlb_sync(domain->domain, iotlb_gather);
659 660
660 list_for_each_entry_safe(entry, next, regions, list) { 661 list_for_each_entry_safe(entry, next, regions, list) {
661 unlocked += vfio_unpin_pages_remote(dma, 662 unlocked += vfio_unpin_pages_remote(dma,
@@ -685,18 +686,19 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain,
685 struct vfio_dma *dma, dma_addr_t *iova, 686 struct vfio_dma *dma, dma_addr_t *iova,
686 size_t len, phys_addr_t phys, long *unlocked, 687 size_t len, phys_addr_t phys, long *unlocked,
687 struct list_head *unmapped_list, 688 struct list_head *unmapped_list,
688 int *unmapped_cnt) 689 int *unmapped_cnt,
690 struct iommu_iotlb_gather *iotlb_gather)
689{ 691{
690 size_t unmapped = 0; 692 size_t unmapped = 0;
691 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL); 693 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
692 694
693 if (entry) { 695 if (entry) {
694 unmapped = iommu_unmap_fast(domain->domain, *iova, len); 696 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
697 iotlb_gather);
695 698
696 if (!unmapped) { 699 if (!unmapped) {
697 kfree(entry); 700 kfree(entry);
698 } else { 701 } else {
699 iommu_tlb_range_add(domain->domain, *iova, unmapped);
700 entry->iova = *iova; 702 entry->iova = *iova;
701 entry->phys = phys; 703 entry->phys = phys;
702 entry->len = unmapped; 704 entry->len = unmapped;
@@ -712,8 +714,8 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain,
712 * or in case of errors. 714 * or in case of errors.
713 */ 715 */
714 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) { 716 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
715 *unlocked += vfio_sync_unpin(dma, domain, 717 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
716 unmapped_list); 718 iotlb_gather);
717 *unmapped_cnt = 0; 719 *unmapped_cnt = 0;
718 } 720 }
719 721
@@ -744,6 +746,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
744 dma_addr_t iova = dma->iova, end = dma->iova + dma->size; 746 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
745 struct vfio_domain *domain, *d; 747 struct vfio_domain *domain, *d;
746 LIST_HEAD(unmapped_region_list); 748 LIST_HEAD(unmapped_region_list);
749 struct iommu_iotlb_gather iotlb_gather;
747 int unmapped_region_cnt = 0; 750 int unmapped_region_cnt = 0;
748 long unlocked = 0; 751 long unlocked = 0;
749 752
@@ -768,6 +771,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
768 cond_resched(); 771 cond_resched();
769 } 772 }
770 773
774 iommu_iotlb_gather_init(&iotlb_gather);
771 while (iova < end) { 775 while (iova < end) {
772 size_t unmapped, len; 776 size_t unmapped, len;
773 phys_addr_t phys, next; 777 phys_addr_t phys, next;
@@ -796,7 +800,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
796 */ 800 */
797 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys, 801 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
798 &unlocked, &unmapped_region_list, 802 &unlocked, &unmapped_region_list,
799 &unmapped_region_cnt); 803 &unmapped_region_cnt,
804 &iotlb_gather);
800 if (!unmapped) { 805 if (!unmapped) {
801 unmapped = unmap_unpin_slow(domain, dma, &iova, len, 806 unmapped = unmap_unpin_slow(domain, dma, &iova, len,
802 phys, &unlocked); 807 phys, &unlocked);
@@ -807,8 +812,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
807 812
808 dma->iommu_mapped = false; 813 dma->iommu_mapped = false;
809 814
810 if (unmapped_region_cnt) 815 if (unmapped_region_cnt) {
811 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list); 816 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
817 &iotlb_gather);
818 }
812 819
813 if (do_accounting) { 820 if (do_accounting) {
814 vfio_lock_acct(dma, -unlocked, true); 821 vfio_lock_acct(dma, -unlocked, true);
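
The vfio_iommu_type1.c changes above illustrate the intended calling convention for the new API from an IOMMU consumer: initialise one iommu_iotlb_gather on the stack, pass it to every iommu_unmap_fast() call in the batch, and hand the same gather to iommu_tlb_sync() once the batch is complete (vfio additionally re-syncs every VFIO_IOMMU_TLB_SYNC_MAX entries to bound how much stays pinned). A condensed sketch of that flow with vfio's unpinning and accounting stripped out:

#include <linux/iommu.h>

/* Unmap [iova, iova + size) in pgsize steps with a single deferred sync. */
static size_t example_unmap_batched(struct iommu_domain *domain,
                                    unsigned long iova, size_t size,
                                    size_t pgsize)
{
        struct iommu_iotlb_gather gather;
        size_t unmapped = 0;

        iommu_iotlb_gather_init(&gather);

        while (unmapped < size) {
                size_t len = iommu_unmap_fast(domain, iova + unmapped,
                                              pgsize, &gather);

                if (!len)
                        break;
                unmapped += len;
        }

        /* Issue whatever invalidations the driver queued in 'gather'. */
        iommu_tlb_sync(domain, &gather);

        return unmapped;
}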
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index b5a450a3bb47..6b1b8be3ebec 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -1,7 +1,9 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __IO_PGTABLE_H 2#ifndef __IO_PGTABLE_H
3#define __IO_PGTABLE_H 3#define __IO_PGTABLE_H
4
4#include <linux/bitops.h> 5#include <linux/bitops.h>
6#include <linux/iommu.h>
5 7
6/* 8/*
7 * Public API for use by IOMMU drivers 9 * Public API for use by IOMMU drivers
@@ -17,22 +19,31 @@ enum io_pgtable_fmt {
17}; 19};
18 20
19/** 21/**
20 * struct iommu_gather_ops - IOMMU callbacks for TLB and page table management. 22 * struct iommu_flush_ops - IOMMU callbacks for TLB and page table management.
21 * 23 *
22 * @tlb_flush_all: Synchronously invalidate the entire TLB context. 24 * @tlb_flush_all: Synchronously invalidate the entire TLB context.
23 * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range. 25 * @tlb_flush_walk: Synchronously invalidate all intermediate TLB state
24 * @tlb_sync: Ensure any queued TLB invalidation has taken effect, and 26 * (sometimes referred to as the "walk cache") for a virtual
25 * any corresponding page table updates are visible to the 27 * address range.
26 * IOMMU. 28 * @tlb_flush_leaf: Synchronously invalidate all leaf TLB state for a virtual
29 * address range.
30 * @tlb_add_page: Optional callback to queue up leaf TLB invalidation for a
31 * single page. IOMMUs that cannot batch TLB invalidation
32 * operations efficiently will typically issue them here, but
33 * others may decide to update the iommu_iotlb_gather structure
34 * and defer the invalidation until iommu_tlb_sync() instead.
27 * 35 *
28 * Note that these can all be called in atomic context and must therefore 36 * Note that these can all be called in atomic context and must therefore
29 * not block. 37 * not block.
30 */ 38 */
31struct iommu_gather_ops { 39struct iommu_flush_ops {
32 void (*tlb_flush_all)(void *cookie); 40 void (*tlb_flush_all)(void *cookie);
33 void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule, 41 void (*tlb_flush_walk)(unsigned long iova, size_t size, size_t granule,
34 bool leaf, void *cookie); 42 void *cookie);
35 void (*tlb_sync)(void *cookie); 43 void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule,
44 void *cookie);
45 void (*tlb_add_page)(struct iommu_iotlb_gather *gather,
46 unsigned long iova, size_t granule, void *cookie);
36}; 47};
37 48
38/** 49/**
@@ -84,7 +95,7 @@ struct io_pgtable_cfg {
84 unsigned int ias; 95 unsigned int ias;
85 unsigned int oas; 96 unsigned int oas;
86 bool coherent_walk; 97 bool coherent_walk;
87 const struct iommu_gather_ops *tlb; 98 const struct iommu_flush_ops *tlb;
88 struct device *iommu_dev; 99 struct device *iommu_dev;
89 100
90 /* Low-level data specific to the table format */ 101 /* Low-level data specific to the table format */
@@ -128,7 +139,7 @@ struct io_pgtable_ops {
128 int (*map)(struct io_pgtable_ops *ops, unsigned long iova, 139 int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
129 phys_addr_t paddr, size_t size, int prot); 140 phys_addr_t paddr, size_t size, int prot);
130 size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, 141 size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
131 size_t size); 142 size_t size, struct iommu_iotlb_gather *gather);
132 phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, 143 phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
133 unsigned long iova); 144 unsigned long iova);
134}; 145};
@@ -184,15 +195,27 @@ static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
184 iop->cfg.tlb->tlb_flush_all(iop->cookie); 195 iop->cfg.tlb->tlb_flush_all(iop->cookie);
185} 196}
186 197
187static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop, 198static inline void
188 unsigned long iova, size_t size, size_t granule, bool leaf) 199io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova,
200 size_t size, size_t granule)
201{
202 iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie);
203}
204
205static inline void
206io_pgtable_tlb_flush_leaf(struct io_pgtable *iop, unsigned long iova,
207 size_t size, size_t granule)
189{ 208{
190 iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie); 209 iop->cfg.tlb->tlb_flush_leaf(iova, size, granule, iop->cookie);
191} 210}
192 211
193static inline void io_pgtable_tlb_sync(struct io_pgtable *iop) 212static inline void
213io_pgtable_tlb_add_page(struct io_pgtable *iop,
214 struct iommu_iotlb_gather *gather, unsigned long iova,

215 size_t granule)
194{ 216{
195 iop->cfg.tlb->tlb_sync(iop->cookie); 217 if (iop->cfg.tlb->tlb_add_page)
218 iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
196} 219}
197 220
198/** 221/**
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fdc355ccc570..64ebaff33455 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -192,6 +192,23 @@ struct iommu_sva_ops {
192#ifdef CONFIG_IOMMU_API 192#ifdef CONFIG_IOMMU_API
193 193
194/** 194/**
195 * struct iommu_iotlb_gather - Range information for a pending IOTLB flush
196 *
197 * @start: IOVA representing the start of the range to be flushed
198 * @end: IOVA representing the end of the range to be flushed (exclusive)
199 * @pgsize: The interval at which to perform the flush
200 *
201 * This structure is intended to be updated by multiple calls to the
202 * ->unmap() function in struct iommu_ops before eventually being passed
203 * into ->iotlb_sync().
204 */
205struct iommu_iotlb_gather {
206 unsigned long start;
207 unsigned long end;
208 size_t pgsize;
209};
210
211/**
195 * struct iommu_ops - iommu ops and capabilities 212 * struct iommu_ops - iommu ops and capabilities
196 * @capable: check capability 213 * @capable: check capability
197 * @domain_alloc: allocate iommu domain 214 * @domain_alloc: allocate iommu domain
@@ -201,7 +218,6 @@ struct iommu_sva_ops {
201 * @map: map a physically contiguous memory region to an iommu domain 218 * @map: map a physically contiguous memory region to an iommu domain
202 * @unmap: unmap a physically contiguous memory region from an iommu domain 219 * @unmap: unmap a physically contiguous memory region from an iommu domain
203 * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain 220 * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
204 * @iotlb_range_add: Add a given iova range to the flush queue for this domain
205 * @iotlb_sync_map: Sync mappings created recently using @map to the hardware 221 * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
206 * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush 222 * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
207 * queue 223 * queue
@@ -242,12 +258,11 @@ struct iommu_ops {
242 int (*map)(struct iommu_domain *domain, unsigned long iova, 258 int (*map)(struct iommu_domain *domain, unsigned long iova,
243 phys_addr_t paddr, size_t size, int prot); 259 phys_addr_t paddr, size_t size, int prot);
244 size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, 260 size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
245 size_t size); 261 size_t size, struct iommu_iotlb_gather *iotlb_gather);
246 void (*flush_iotlb_all)(struct iommu_domain *domain); 262 void (*flush_iotlb_all)(struct iommu_domain *domain);
247 void (*iotlb_range_add)(struct iommu_domain *domain,
248 unsigned long iova, size_t size);
249 void (*iotlb_sync_map)(struct iommu_domain *domain); 263 void (*iotlb_sync_map)(struct iommu_domain *domain);
250 void (*iotlb_sync)(struct iommu_domain *domain); 264 void (*iotlb_sync)(struct iommu_domain *domain,
265 struct iommu_iotlb_gather *iotlb_gather);
251 phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); 266 phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova);
252 int (*add_device)(struct device *dev); 267 int (*add_device)(struct device *dev);
253 void (*remove_device)(struct device *dev); 268 void (*remove_device)(struct device *dev);
@@ -378,6 +393,13 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
378 return (struct iommu_device *)dev_get_drvdata(dev); 393 return (struct iommu_device *)dev_get_drvdata(dev);
379} 394}
380 395
396static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
397{
398 *gather = (struct iommu_iotlb_gather) {
399 .start = ULONG_MAX,
400 };
401}
402
381#define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ 403#define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */
382#define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */ 404#define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */
383#define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */ 405#define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */
@@ -402,7 +424,8 @@ extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
402extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, 424extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova,
403 size_t size); 425 size_t size);
404extern size_t iommu_unmap_fast(struct iommu_domain *domain, 426extern size_t iommu_unmap_fast(struct iommu_domain *domain,
405 unsigned long iova, size_t size); 427 unsigned long iova, size_t size,
428 struct iommu_iotlb_gather *iotlb_gather);
406extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, 429extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
407 struct scatterlist *sg,unsigned int nents, int prot); 430 struct scatterlist *sg,unsigned int nents, int prot);
408extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); 431extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova);
@@ -476,17 +499,38 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain)
476 domain->ops->flush_iotlb_all(domain); 499 domain->ops->flush_iotlb_all(domain);
477} 500}
478 501
479static inline void iommu_tlb_range_add(struct iommu_domain *domain, 502static inline void iommu_tlb_sync(struct iommu_domain *domain,
480 unsigned long iova, size_t size) 503 struct iommu_iotlb_gather *iotlb_gather)
481{ 504{
482 if (domain->ops->iotlb_range_add) 505 if (domain->ops->iotlb_sync)
483 domain->ops->iotlb_range_add(domain, iova, size); 506 domain->ops->iotlb_sync(domain, iotlb_gather);
507
508 iommu_iotlb_gather_init(iotlb_gather);
484} 509}
485 510
486static inline void iommu_tlb_sync(struct iommu_domain *domain) 511static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
512 struct iommu_iotlb_gather *gather,
513 unsigned long iova, size_t size)
487{ 514{
488 if (domain->ops->iotlb_sync) 515 unsigned long start = iova, end = start + size;
489 domain->ops->iotlb_sync(domain); 516
517 /*
518 * If the new page is disjoint from the current range or is mapped at
519 * a different granularity, then sync the TLB so that the gather
520 * structure can be rewritten.
521 */
522 if (gather->pgsize != size ||
523 end < gather->start || start > gather->end) {
524 if (gather->pgsize)
525 iommu_tlb_sync(domain, gather);
526 gather->pgsize = size;
527 }
528
529 if (gather->end < end)
530 gather->end = end;
531
532 if (gather->start > start)
533 gather->start = start;
490} 534}
491 535
492/* PCI device grouping function */ 536/* PCI device grouping function */
@@ -567,6 +611,7 @@ struct iommu_group {};
567struct iommu_fwspec {}; 611struct iommu_fwspec {};
568struct iommu_device {}; 612struct iommu_device {};
569struct iommu_fault_param {}; 613struct iommu_fault_param {};
614struct iommu_iotlb_gather {};
570 615
571static inline bool iommu_present(struct bus_type *bus) 616static inline bool iommu_present(struct bus_type *bus)
572{ 617{
@@ -621,7 +666,8 @@ static inline size_t iommu_unmap(struct iommu_domain *domain,
621} 666}
622 667
623static inline size_t iommu_unmap_fast(struct iommu_domain *domain, 668static inline size_t iommu_unmap_fast(struct iommu_domain *domain,
624 unsigned long iova, int gfp_order) 669 unsigned long iova, int gfp_order,
670 struct iommu_iotlb_gather *iotlb_gather)
625{ 671{
626 return 0; 672 return 0;
627} 673}
@@ -637,12 +683,8 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain)
637{ 683{
638} 684}
639 685
640static inline void iommu_tlb_range_add(struct iommu_domain *domain, 686static inline void iommu_tlb_sync(struct iommu_domain *domain,
641 unsigned long iova, size_t size) 687 struct iommu_iotlb_gather *iotlb_gather)
642{
643}
644
645static inline void iommu_tlb_sync(struct iommu_domain *domain)
646{ 688{
647} 689}
648 690
@@ -827,6 +869,16 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
827 return NULL; 869 return NULL;
828} 870}
829 871
872static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
873{
874}
875
876static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
877 struct iommu_iotlb_gather *gather,
878 unsigned long iova, size_t size)
879{
880}
881
830static inline void iommu_device_unregister(struct iommu_device *iommu) 882static inline void iommu_device_unregister(struct iommu_device *iommu)
831{ 883{
832} 884}
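
Finally, the iommu.h hunk adds the gather bookkeeping itself: iommu_iotlb_gather_add_page() keeps extending the pending [start, end) window as long as the page size stays the same and the new page is adjacent to or overlaps the accumulated range, and otherwise syncs immediately, so a driver's ->iotlb_sync() only ever sees one homogeneous range. A sketch of how a driver-side ->unmap()/->iotlb_sync() pair might sit on top of it; the example_hw_* hooks are placeholders, not an existing driver's API:

#include <linux/iommu.h>

/* Placeholder hardware hooks assumed for the sketch. */
void example_hw_inv_range(unsigned long iova, size_t size);
void example_hw_sync(void);

/* Wired up as .unmap and .iotlb_sync in the driver's iommu_ops. */
static size_t example_unmap(struct iommu_domain *domain, unsigned long iova,
                            size_t size, struct iommu_iotlb_gather *gather)
{
        /* ... remove the mapping from the page tables here ... */

        /*
         * Record the range rather than invalidating immediately; the core
         * may call ->unmap() many times before issuing a single sync.
         */
        iommu_iotlb_gather_add_page(domain, gather, iova, size);
        return size;
}

static void example_iotlb_sync(struct iommu_domain *domain,
                               struct iommu_iotlb_gather *gather)
{
        if (gather->pgsize)
                example_hw_inv_range(gather->start,
                                     gather->end - gather->start);
        example_hw_sync();
}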