Diffstat (limited to 'drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c')
-rw-r--r-- | drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c | 371
1 file changed, 270 insertions(+), 101 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c
index cd7feb1b25f6..fc419bb8eab7 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c
@@ -23,35 +23,42 @@ | |||
23 | /* | 23 | /* |
24 | * GK20A does not have dedicated video memory, and to accurately represent this | 24 | * GK20A does not have dedicated video memory, and to accurately represent this |
25 | * fact Nouveau will not create a RAM device for it. Therefore its instmem | 25 | * fact Nouveau will not create a RAM device for it. Therefore its instmem |
26 | * implementation must be done directly on top of system memory, while providing | 26 | * implementation must be done directly on top of system memory, while |
27 | * coherent read and write operations. | 27 | * preserving coherency for read and write operations. |
28 | * | 28 | * |
29 | * Instmem can be allocated through two means: | 29 | * Instmem can be allocated through two means: |
30 | * 1) If an IOMMU mapping has been probed, the IOMMU API is used to make memory | 30 | * 1) If an IOMMU unit has been probed, the IOMMU API is used to make memory |
31 | * pages contiguous to the GPU. This is the preferred way. | 31 | * pages contiguous to the GPU. This is the preferred way. |
32 | * 2) If no IOMMU mapping is probed, the DMA API is used to allocate physically | 32 | * 2) If no IOMMU unit is probed, the DMA API is used to allocate physically |
33 | * contiguous memory. | 33 | * contiguous memory. |
34 | * | 34 | * |
35 | * In both cases CPU reads and writes are performed using PRAMIN (i.e. using the | 35 | * In both cases CPU reads and writes are performed by creating a write-combined |
36 | * GPU path) to ensure these operations are coherent for the GPU. This allows us | 36 | * mapping. The GPU L2 cache must thus be flushed/invalidated when required. To |
37 | * to use more "relaxed" allocation parameters when using the DMA API, since we | 37 | * be conservative we do this every time we acquire or release an instobj, but |
38 | * never need a kernel mapping. | 38 | * ideally L2 management should be handled at a higher level. |
39 | * | ||
40 | * To improve performance, CPU mappings are not removed upon instobj release. | ||
41 | * Instead they are placed into an LRU list to be recycled when the mapped space | ||
42 | * goes beyond a certain threshold. At the moment this limit is 1MB. | ||
39 | */ | 43 | */ |
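
The comment above summarises the new access model: an instobj is CPU-mapped write-combined on acquire (after flushing the GPU L2), accessed through rd32/wr32, and on release the mapping is kept on an LRU rather than torn down, after a write barrier plus L2 invalidate. A minimal caller-side sketch, assuming the usual NVKM wrappers (nvkm_kmap()/nvkm_wo32()/nvkm_done()) are what end up invoking the acquire/wr32/release hooks implemented further down:

    #include <core/memory.h>

    /* Illustrative sketch only: assumes nvkm_kmap()/nvkm_done() call the
     * acquire()/release() hooks below, which is the usual NVKM pattern. */
    static void
    example_clear_instobj(struct nvkm_memory *memory)
    {
            u64 i;

            nvkm_kmap(memory);      /* acquire(): LTC flush + WC CPU mapping */
            for (i = 0; i < nvkm_memory_size(memory); i += 4)
                    nvkm_wo32(memory, i, 0x00000000);
            nvkm_done(memory);      /* release(): wmb() + LTC invalidate,
                                     * mapping is parked on the LRU */
    }
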
40 | #define gk20a_instmem(p) container_of((p), struct gk20a_instmem, base) | ||
41 | #include "priv.h" | 44 | #include "priv.h" |
42 | 45 | ||
43 | #include <core/memory.h> | 46 | #include <core/memory.h> |
44 | #include <core/mm.h> | 47 | #include <core/mm.h> |
45 | #include <core/tegra.h> | 48 | #include <core/tegra.h> |
46 | #include <subdev/fb.h> | 49 | #include <subdev/fb.h> |
47 | 50 | #include <subdev/ltc.h> | |
48 | #define gk20a_instobj(p) container_of((p), struct gk20a_instobj, memory) | ||
49 | 51 | ||
50 | struct gk20a_instobj { | 52 | struct gk20a_instobj { |
51 | struct nvkm_memory memory; | 53 | struct nvkm_memory memory; |
52 | struct gk20a_instmem *imem; | ||
53 | struct nvkm_mem mem; | 54 | struct nvkm_mem mem; |
55 | struct gk20a_instmem *imem; | ||
56 | |||
57 | /* CPU mapping */ | ||
58 | u32 *vaddr; | ||
59 | struct list_head vaddr_node; | ||
54 | }; | 60 | }; |
61 | #define gk20a_instobj(p) container_of((p), struct gk20a_instobj, memory) | ||
55 | 62 | ||
56 | /* | 63 | /* |
57 | * Used for objects allocated using the DMA API | 64 | * Used for objects allocated using the DMA API |
@@ -59,10 +66,12 @@ struct gk20a_instobj { | |||
59 | struct gk20a_instobj_dma { | 66 | struct gk20a_instobj_dma { |
60 | struct gk20a_instobj base; | 67 | struct gk20a_instobj base; |
61 | 68 | ||
62 | void *cpuaddr; | 69 | u32 *cpuaddr; |
63 | dma_addr_t handle; | 70 | dma_addr_t handle; |
64 | struct nvkm_mm_node r; | 71 | struct nvkm_mm_node r; |
65 | }; | 72 | }; |
73 | #define gk20a_instobj_dma(p) \ | ||
74 | container_of(gk20a_instobj(p), struct gk20a_instobj_dma, base) | ||
66 | 75 | ||
67 | /* | 76 | /* |
68 | * Used for objects flattened using the IOMMU API | 77 | * Used for objects flattened using the IOMMU API |
@@ -70,25 +79,38 @@ struct gk20a_instobj_dma { | |||
70 | struct gk20a_instobj_iommu { | 79 | struct gk20a_instobj_iommu { |
71 | struct gk20a_instobj base; | 80 | struct gk20a_instobj base; |
72 | 81 | ||
73 | /* array of base.mem->size pages */ | 82 | /* will point to the higher half of pages */ |
83 | dma_addr_t *dma_addrs; | ||
84 | /* array of base.mem->size pages (+ dma_addr_ts) */ | ||
74 | struct page *pages[]; | 85 | struct page *pages[]; |
75 | }; | 86 | }; |
87 | #define gk20a_instobj_iommu(p) \ | ||
88 | container_of(gk20a_instobj(p), struct gk20a_instobj_iommu, base) | ||
76 | 89 | ||
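
The two comments above describe a single-allocation layout: pages[] is a flexible array sized when the object is created, and dma_addrs is pointed just past it ("the higher half"), so one kzalloc() covers the object, its page pointers and their DMA addresses (see the constructor further down). A stand-alone sketch of the same trick, with hypothetical names:

    #include <linux/dma-mapping.h>
    #include <linux/slab.h>

    struct example_obj {
            dma_addr_t *dma_addrs;  /* points into the tail of the same allocation */
            struct page *pages[];   /* flexible array: npages entries */
    };

    static struct example_obj *
    example_obj_alloc(unsigned int npages)
    {
            struct example_obj *obj;

            obj = kzalloc(sizeof(*obj) +
                          (sizeof(obj->pages[0]) + sizeof(*obj->dma_addrs)) * npages,
                          GFP_KERNEL);
            if (!obj)
                    return NULL;
            /* dma_addrs lives in the "higher half", right after pages[npages] */
            obj->dma_addrs = (void *)(obj->pages + npages);
            return obj;
    }
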
77 | struct gk20a_instmem { | 90 | struct gk20a_instmem { |
78 | struct nvkm_instmem base; | 91 | struct nvkm_instmem base; |
79 | unsigned long lock_flags; | 92 | |
93 | /* protects vaddr_* and gk20a_instobj::vaddr* */ | ||
80 | spinlock_t lock; | 94 | spinlock_t lock; |
81 | u64 addr; | 95 | |
96 | /* CPU mappings LRU */ | ||
97 | unsigned int vaddr_use; | ||
98 | unsigned int vaddr_max; | ||
99 | struct list_head vaddr_lru; | ||
82 | 100 | ||
83 | /* Only used if an IOMMU is present */ | 101 |
84 | struct mutex *mm_mutex; | 102 | struct mutex *mm_mutex; |
85 | struct nvkm_mm *mm; | 103 | struct nvkm_mm *mm; |
86 | struct iommu_domain *domain; | 104 | struct iommu_domain *domain; |
87 | unsigned long iommu_pgshift; | 105 | unsigned long iommu_pgshift; |
106 | u16 iommu_bit; | ||
88 | 107 | ||
89 | /* Only used by DMA API */ | 108 | /* Only used by DMA API */ |
90 | struct dma_attrs attrs; | 109 | struct dma_attrs attrs; |
110 | |||
111 | void __iomem * (*cpu_map)(struct nvkm_memory *); | ||
91 | }; | 112 | }; |
113 | #define gk20a_instmem(p) container_of((p), struct gk20a_instmem, base) | ||
92 | 114 | ||
93 | static enum nvkm_memory_target | 115 | static enum nvkm_memory_target |
94 | gk20a_instobj_target(struct nvkm_memory *memory) | 116 | gk20a_instobj_target(struct nvkm_memory *memory) |
@@ -100,7 +122,6 @@ static u64 | |||
100 | gk20a_instobj_addr(struct nvkm_memory *memory) | 122 | gk20a_instobj_addr(struct nvkm_memory *memory) |
101 | { | 123 | { |
102 | return gk20a_instobj(memory)->mem.offset; | 124 | return gk20a_instobj(memory)->mem.offset; |
103 | |||
104 | } | 125 | } |
105 | 126 | ||
106 | static u64 | 127 | static u64 |
@@ -110,107 +131,217 @@ gk20a_instobj_size(struct nvkm_memory *memory) | |||
110 | } | 131 | } |
111 | 132 | ||
112 | static void __iomem * | 133 | static void __iomem * |
134 | gk20a_instobj_cpu_map_dma(struct nvkm_memory *memory) | ||
135 | { | ||
136 | struct gk20a_instobj_dma *node = gk20a_instobj_dma(memory); | ||
137 | struct device *dev = node->base.imem->base.subdev.device->dev; | ||
138 | int npages = nvkm_memory_size(memory) >> 12; | ||
139 | struct page *pages[npages]; | ||
140 | int i; | ||
141 | |||
142 | /* phys_to_page does not exist on all platforms... */ | ||
143 | pages[0] = pfn_to_page(dma_to_phys(dev, node->handle) >> PAGE_SHIFT); | ||
144 | for (i = 1; i < npages; i++) | ||
145 | pages[i] = pages[0] + i; | ||
146 | |||
147 | return vmap(pages, npages, VM_MAP, pgprot_writecombine(PAGE_KERNEL)); | ||
148 | } | ||
149 | |||
150 | static void __iomem * | ||
151 | gk20a_instobj_cpu_map_iommu(struct nvkm_memory *memory) | ||
152 | { | ||
153 | struct gk20a_instobj_iommu *node = gk20a_instobj_iommu(memory); | ||
154 | int npages = nvkm_memory_size(memory) >> 12; | ||
155 | |||
156 | return vmap(node->pages, npages, VM_MAP, | ||
157 | pgprot_writecombine(PAGE_KERNEL)); | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Must be called while holding gk20a_instmem_lock | ||
162 | */ | ||
163 | static void | ||
164 | gk20a_instmem_vaddr_gc(struct gk20a_instmem *imem, const u64 size) | ||
165 | { | ||
166 | while (imem->vaddr_use + size > imem->vaddr_max) { | ||
167 | struct gk20a_instobj *obj; | ||
168 | |||
169 | /* no candidate that can be unmapped, abort... */ | ||
170 | if (list_empty(&imem->vaddr_lru)) | ||
171 | break; | ||
172 | |||
173 | obj = list_first_entry(&imem->vaddr_lru, struct gk20a_instobj, | ||
174 | vaddr_node); | ||
175 | list_del(&obj->vaddr_node); | ||
176 | vunmap(obj->vaddr); | ||
177 | obj->vaddr = NULL; | ||
178 | imem->vaddr_use -= nvkm_memory_size(&obj->memory); | ||
179 | nvkm_debug(&imem->base.subdev, "(GC) vaddr used: %x/%x\n", | ||
180 | imem->vaddr_use, imem->vaddr_max); | ||
181 | |||
182 | } | ||
183 | } | ||
184 | |||
185 | static void __iomem * | ||
113 | gk20a_instobj_acquire(struct nvkm_memory *memory) | 186 | gk20a_instobj_acquire(struct nvkm_memory *memory) |
114 | { | 187 | { |
115 | struct gk20a_instmem *imem = gk20a_instobj(memory)->imem; | 188 | struct gk20a_instobj *node = gk20a_instobj(memory); |
189 | struct gk20a_instmem *imem = node->imem; | ||
190 | struct nvkm_ltc *ltc = imem->base.subdev.device->ltc; | ||
191 | const u64 size = nvkm_memory_size(memory); | ||
116 | unsigned long flags; | 192 | unsigned long flags; |
193 | |||
194 | nvkm_ltc_flush(ltc); | ||
195 | |||
117 | spin_lock_irqsave(&imem->lock, flags); | 196 | spin_lock_irqsave(&imem->lock, flags); |
118 | imem->lock_flags = flags; | 197 | |
119 | return NULL; | 198 | if (node->vaddr) { |
199 | /* remove us from the LRU list since we cannot be unmapped */ | ||
200 | list_del(&node->vaddr_node); | ||
201 | |||
202 | goto out; | ||
203 | } | ||
204 | |||
205 | /* try to free some address space if we reached the limit */ | ||
206 | gk20a_instmem_vaddr_gc(imem, size); | ||
207 | |||
208 | node->vaddr = imem->cpu_map(memory); | ||
209 | |||
210 | if (!node->vaddr) { | ||
211 | nvkm_error(&imem->base.subdev, "cannot map instobj - " | ||
212 | "this is not going to end well...\n"); | ||
213 | goto out; | ||
214 | } | ||
215 | |||
216 | imem->vaddr_use += size; | ||
217 | nvkm_debug(&imem->base.subdev, "vaddr used: %x/%x\n", | ||
218 | imem->vaddr_use, imem->vaddr_max); | ||
219 | |||
220 | out: | ||
221 | spin_unlock_irqrestore(&imem->lock, flags); | ||
222 | |||
223 | return node->vaddr; | ||
120 | } | 224 | } |
121 | 225 | ||
122 | static void | 226 | static void |
123 | gk20a_instobj_release(struct nvkm_memory *memory) | 227 | gk20a_instobj_release(struct nvkm_memory *memory) |
124 | { | 228 | { |
125 | struct gk20a_instmem *imem = gk20a_instobj(memory)->imem; | 229 | struct gk20a_instobj *node = gk20a_instobj(memory); |
126 | spin_unlock_irqrestore(&imem->lock, imem->lock_flags); | 230 | struct gk20a_instmem *imem = node->imem; |
127 | } | 231 | struct nvkm_ltc *ltc = imem->base.subdev.device->ltc; |
232 | unsigned long flags; | ||
128 | 233 | ||
129 | /* | 234 | spin_lock_irqsave(&imem->lock, flags); |
130 | * Use PRAMIN to read/write data and avoid coherency issues. | 235 | |
131 | * PRAMIN uses the GPU path and ensures data will always be coherent. | 236 | /* add ourselves to the LRU list so our CPU mapping can be freed */ |
132 | * | 237 | list_add_tail(&node->vaddr_node, &imem->vaddr_lru); |
133 | * A dynamic mapping based solution would be desirable in the future, but | 238 | |
134 | * the issue remains of how to maintain coherency efficiently. On ARM it is | 239 | spin_unlock_irqrestore(&imem->lock, flags); |
135 | * not easy (if possible at all?) to create uncached temporary mappings. | 240 | |
136 | */ | 241 | wmb(); |
242 | nvkm_ltc_invalidate(ltc); | ||
243 | } | ||
137 | 244 | ||
138 | static u32 | 245 | static u32 |
139 | gk20a_instobj_rd32(struct nvkm_memory *memory, u64 offset) | 246 | gk20a_instobj_rd32(struct nvkm_memory *memory, u64 offset) |
140 | { | 247 | { |
141 | struct gk20a_instobj *node = gk20a_instobj(memory); | 248 | struct gk20a_instobj *node = gk20a_instobj(memory); |
142 | struct gk20a_instmem *imem = node->imem; | 249 | |
143 | struct nvkm_device *device = imem->base.subdev.device; | 250 | return node->vaddr[offset / 4]; |
144 | u64 base = (node->mem.offset + offset) & 0xffffff00000ULL; | ||
145 | u64 addr = (node->mem.offset + offset) & 0x000000fffffULL; | ||
146 | u32 data; | ||
147 | |||
148 | if (unlikely(imem->addr != base)) { | ||
149 | nvkm_wr32(device, 0x001700, base >> 16); | ||
150 | imem->addr = base; | ||
151 | } | ||
152 | data = nvkm_rd32(device, 0x700000 + addr); | ||
153 | return data; | ||
154 | } | 251 | } |
155 | 252 | ||
156 | static void | 253 | static void |
157 | gk20a_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) | 254 | gk20a_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) |
158 | { | 255 | { |
159 | struct gk20a_instobj *node = gk20a_instobj(memory); | 256 | struct gk20a_instobj *node = gk20a_instobj(memory); |
160 | struct gk20a_instmem *imem = node->imem; | ||
161 | struct nvkm_device *device = imem->base.subdev.device; | ||
162 | u64 base = (node->mem.offset + offset) & 0xffffff00000ULL; | ||
163 | u64 addr = (node->mem.offset + offset) & 0x000000fffffULL; | ||
164 | 257 | ||
165 | if (unlikely(imem->addr != base)) { | 258 | node->vaddr[offset / 4] = data; |
166 | nvkm_wr32(device, 0x001700, base >> 16); | ||
167 | imem->addr = base; | ||
168 | } | ||
169 | nvkm_wr32(device, 0x700000 + addr, data); | ||
170 | } | 259 | } |
171 | 260 | ||
172 | static void | 261 | static void |
173 | gk20a_instobj_map(struct nvkm_memory *memory, struct nvkm_vma *vma, u64 offset) | 262 | gk20a_instobj_map(struct nvkm_memory *memory, struct nvkm_vma *vma, u64 offset) |
174 | { | 263 | { |
175 | struct gk20a_instobj *node = gk20a_instobj(memory); | 264 | struct gk20a_instobj *node = gk20a_instobj(memory); |
265 | |||
176 | nvkm_vm_map_at(vma, offset, &node->mem); | 266 | nvkm_vm_map_at(vma, offset, &node->mem); |
177 | } | 267 | } |
178 | 268 | ||
269 | /* | ||
270 | * Clear the CPU mapping of an instobj if it exists | ||
271 | */ | ||
179 | static void | 272 | static void |
180 | gk20a_instobj_dtor_dma(struct gk20a_instobj *_node) | 273 | gk20a_instobj_dtor(struct gk20a_instobj *node) |
274 | { | ||
275 | struct gk20a_instmem *imem = node->imem; | ||
276 | struct gk20a_instobj *obj; | ||
277 | unsigned long flags; | ||
278 | |||
279 | spin_lock_irqsave(&imem->lock, flags); | ||
280 | |||
281 | if (!node->vaddr) | ||
282 | goto out; | ||
283 | |||
284 | list_for_each_entry(obj, &imem->vaddr_lru, vaddr_node) { | ||
285 | if (obj == node) { | ||
286 | list_del(&obj->vaddr_node); | ||
287 | break; | ||
288 | } | ||
289 | } | ||
290 | vunmap(node->vaddr); | ||
291 | node->vaddr = NULL; | ||
292 | imem->vaddr_use -= nvkm_memory_size(&node->memory); | ||
293 | nvkm_debug(&imem->base.subdev, "vaddr used: %x/%x\n", | ||
294 | imem->vaddr_use, imem->vaddr_max); | ||
295 | |||
296 | out: | ||
297 | spin_unlock_irqrestore(&imem->lock, flags); | ||
298 | } | ||
299 | |||
300 | static void * | ||
301 | gk20a_instobj_dtor_dma(struct nvkm_memory *memory) | ||
181 | { | 302 | { |
182 | struct gk20a_instobj_dma *node = (void *)_node; | 303 | struct gk20a_instobj_dma *node = gk20a_instobj_dma(memory); |
183 | struct gk20a_instmem *imem = _node->imem; | 304 | struct gk20a_instmem *imem = node->base.imem; |
184 | struct device *dev = imem->base.subdev.device->dev; | 305 | struct device *dev = imem->base.subdev.device->dev; |
185 | 306 | ||
307 | gk20a_instobj_dtor(&node->base); | ||
308 | |||
186 | if (unlikely(!node->cpuaddr)) | 309 | if (unlikely(!node->cpuaddr)) |
187 | return; | 310 | goto out; |
188 | 311 | ||
189 | dma_free_attrs(dev, _node->mem.size << PAGE_SHIFT, node->cpuaddr, | 312 | dma_free_attrs(dev, node->base.mem.size << PAGE_SHIFT, node->cpuaddr, |
190 | node->handle, &imem->attrs); | 313 | node->handle, &imem->attrs); |
314 | |||
315 | out: | ||
316 | return node; | ||
191 | } | 317 | } |
192 | 318 | ||
193 | static void | 319 | static void * |
194 | gk20a_instobj_dtor_iommu(struct gk20a_instobj *_node) | 320 | gk20a_instobj_dtor_iommu(struct nvkm_memory *memory) |
195 | { | 321 | { |
196 | struct gk20a_instobj_iommu *node = (void *)_node; | 322 | struct gk20a_instobj_iommu *node = gk20a_instobj_iommu(memory); |
197 | struct gk20a_instmem *imem = _node->imem; | 323 | struct gk20a_instmem *imem = node->base.imem; |
324 | struct device *dev = imem->base.subdev.device->dev; | ||
198 | struct nvkm_mm_node *r; | 325 | struct nvkm_mm_node *r; |
199 | int i; | 326 | int i; |
200 | 327 | ||
201 | if (unlikely(list_empty(&_node->mem.regions))) | 328 | gk20a_instobj_dtor(&node->base); |
202 | return; | ||
203 | 329 | ||
204 | r = list_first_entry(&_node->mem.regions, struct nvkm_mm_node, | 330 | if (unlikely(list_empty(&node->base.mem.regions))) |
331 | goto out; | ||
332 | |||
333 | r = list_first_entry(&node->base.mem.regions, struct nvkm_mm_node, | ||
205 | rl_entry); | 334 | rl_entry); |
206 | 335 | ||
207 | /* clear bit 34 to unmap pages */ | 336 | /* clear IOMMU bit to unmap pages */ |
208 | r->offset &= ~BIT(34 - imem->iommu_pgshift); | 337 | r->offset &= ~BIT(imem->iommu_bit - imem->iommu_pgshift); |
209 | 338 | ||
210 | /* Unmap pages from GPU address space and free them */ | 339 | /* Unmap pages from GPU address space and free them */ |
211 | for (i = 0; i < _node->mem.size; i++) { | 340 | for (i = 0; i < node->base.mem.size; i++) { |
212 | iommu_unmap(imem->domain, | 341 | iommu_unmap(imem->domain, |
213 | (r->offset + i) << imem->iommu_pgshift, PAGE_SIZE); | 342 | (r->offset + i) << imem->iommu_pgshift, PAGE_SIZE); |
343 | dma_unmap_page(dev, node->dma_addrs[i], PAGE_SIZE, | ||
344 | DMA_BIDIRECTIONAL); | ||
214 | __free_page(node->pages[i]); | 345 | __free_page(node->pages[i]); |
215 | } | 346 | } |
216 | 347 | ||
@@ -218,25 +349,27 @@ gk20a_instobj_dtor_iommu(struct gk20a_instobj *_node) | |||
218 | mutex_lock(imem->mm_mutex); | 349 | mutex_lock(imem->mm_mutex); |
219 | nvkm_mm_free(imem->mm, &r); | 350 | nvkm_mm_free(imem->mm, &r); |
220 | mutex_unlock(imem->mm_mutex); | 351 | mutex_unlock(imem->mm_mutex); |
221 | } | ||
222 | |||
223 | static void * | ||
224 | gk20a_instobj_dtor(struct nvkm_memory *memory) | ||
225 | { | ||
226 | struct gk20a_instobj *node = gk20a_instobj(memory); | ||
227 | struct gk20a_instmem *imem = node->imem; | ||
228 | |||
229 | if (imem->domain) | ||
230 | gk20a_instobj_dtor_iommu(node); | ||
231 | else | ||
232 | gk20a_instobj_dtor_dma(node); | ||
233 | 352 | ||
353 | out: | ||
234 | return node; | 354 | return node; |
235 | } | 355 | } |
236 | 356 | ||
237 | static const struct nvkm_memory_func | 357 | static const struct nvkm_memory_func |
238 | gk20a_instobj_func = { | 358 | gk20a_instobj_func_dma = { |
239 | .dtor = gk20a_instobj_dtor, | 359 | .dtor = gk20a_instobj_dtor_dma, |
360 | .target = gk20a_instobj_target, | ||
361 | .addr = gk20a_instobj_addr, | ||
362 | .size = gk20a_instobj_size, | ||
363 | .acquire = gk20a_instobj_acquire, | ||
364 | .release = gk20a_instobj_release, | ||
365 | .rd32 = gk20a_instobj_rd32, | ||
366 | .wr32 = gk20a_instobj_wr32, | ||
367 | .map = gk20a_instobj_map, | ||
368 | }; | ||
369 | |||
370 | static const struct nvkm_memory_func | ||
371 | gk20a_instobj_func_iommu = { | ||
372 | .dtor = gk20a_instobj_dtor_iommu, | ||
240 | .target = gk20a_instobj_target, | 373 | .target = gk20a_instobj_target, |
241 | .addr = gk20a_instobj_addr, | 374 | .addr = gk20a_instobj_addr, |
242 | .size = gk20a_instobj_size, | 375 | .size = gk20a_instobj_size, |
@@ -259,6 +392,8 @@ gk20a_instobj_ctor_dma(struct gk20a_instmem *imem, u32 npages, u32 align, | |||
259 | return -ENOMEM; | 392 | return -ENOMEM; |
260 | *_node = &node->base; | 393 | *_node = &node->base; |
261 | 394 | ||
395 | nvkm_memory_ctor(&gk20a_instobj_func_dma, &node->base.memory); | ||
396 | |||
262 | node->cpuaddr = dma_alloc_attrs(dev, npages << PAGE_SHIFT, | 397 | node->cpuaddr = dma_alloc_attrs(dev, npages << PAGE_SHIFT, |
263 | &node->handle, GFP_KERNEL, | 398 | &node->handle, GFP_KERNEL, |
264 | &imem->attrs); | 399 | &imem->attrs); |
@@ -292,24 +427,40 @@ gk20a_instobj_ctor_iommu(struct gk20a_instmem *imem, u32 npages, u32 align, | |||
292 | { | 427 | { |
293 | struct gk20a_instobj_iommu *node; | 428 | struct gk20a_instobj_iommu *node; |
294 | struct nvkm_subdev *subdev = &imem->base.subdev; | 429 | struct nvkm_subdev *subdev = &imem->base.subdev; |
430 | struct device *dev = subdev->device->dev; | ||
295 | struct nvkm_mm_node *r; | 431 | struct nvkm_mm_node *r; |
296 | int ret; | 432 | int ret; |
297 | int i; | 433 | int i; |
298 | 434 | ||
299 | if (!(node = kzalloc(sizeof(*node) + | 435 | /* |
300 | sizeof( node->pages[0]) * npages, GFP_KERNEL))) | 436 | * despite their variable size, instmem allocations are small enough |
437 | * (< 1 page) to be handled by kzalloc | ||
438 | */ | ||
439 | if (!(node = kzalloc(sizeof(*node) + ((sizeof(node->pages[0]) + | ||
440 | sizeof(*node->dma_addrs)) * npages), GFP_KERNEL))) | ||
301 | return -ENOMEM; | 441 | return -ENOMEM; |
302 | *_node = &node->base; | 442 | *_node = &node->base; |
443 | node->dma_addrs = (void *)(node->pages + npages); | ||
444 | |||
445 | nvkm_memory_ctor(&gk20a_instobj_func_iommu, &node->base.memory); | ||
303 | 446 | ||
304 | /* Allocate backing memory */ | 447 | /* Allocate backing memory */ |
305 | for (i = 0; i < npages; i++) { | 448 | for (i = 0; i < npages; i++) { |
306 | struct page *p = alloc_page(GFP_KERNEL); | 449 | struct page *p = alloc_page(GFP_KERNEL); |
450 | dma_addr_t dma_adr; | ||
307 | 451 | ||
308 | if (p == NULL) { | 452 | if (p == NULL) { |
309 | ret = -ENOMEM; | 453 | ret = -ENOMEM; |
310 | goto free_pages; | 454 | goto free_pages; |
311 | } | 455 | } |
312 | node->pages[i] = p; | 456 | node->pages[i] = p; |
457 | dma_adr = dma_map_page(dev, p, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); | ||
458 | if (dma_mapping_error(dev, dma_adr)) { | ||
459 | nvkm_error(subdev, "DMA mapping error!\n"); | ||
460 | ret = -ENOMEM; | ||
461 | goto free_pages; | ||
462 | } | ||
463 | node->dma_addrs[i] = dma_adr; | ||
313 | } | 464 | } |
314 | 465 | ||
315 | mutex_lock(imem->mm_mutex); | 466 | mutex_lock(imem->mm_mutex); |
@@ -318,16 +469,15 @@ gk20a_instobj_ctor_iommu(struct gk20a_instmem *imem, u32 npages, u32 align, | |||
318 | align >> imem->iommu_pgshift, &r); | 469 | align >> imem->iommu_pgshift, &r); |
319 | mutex_unlock(imem->mm_mutex); | 470 | mutex_unlock(imem->mm_mutex); |
320 | if (ret) { | 471 | if (ret) { |
321 | nvkm_error(subdev, "virtual space is full!\n"); | 472 | nvkm_error(subdev, "IOMMU space is full!\n"); |
322 | goto free_pages; | 473 | goto free_pages; |
323 | } | 474 | } |
324 | 475 | ||
325 | /* Map into GPU address space */ | 476 | /* Map into GPU address space */ |
326 | for (i = 0; i < npages; i++) { | 477 | for (i = 0; i < npages; i++) { |
327 | struct page *p = node->pages[i]; | ||
328 | u32 offset = (r->offset + i) << imem->iommu_pgshift; | 478 | u32 offset = (r->offset + i) << imem->iommu_pgshift; |
329 | 479 | ||
330 | ret = iommu_map(imem->domain, offset, page_to_phys(p), | 480 | ret = iommu_map(imem->domain, offset, node->dma_addrs[i], |
331 | PAGE_SIZE, IOMMU_READ | IOMMU_WRITE); | 481 | PAGE_SIZE, IOMMU_READ | IOMMU_WRITE); |
332 | if (ret < 0) { | 482 | if (ret < 0) { |
333 | nvkm_error(subdev, "IOMMU mapping failure: %d\n", ret); | 483 | nvkm_error(subdev, "IOMMU mapping failure: %d\n", ret); |
@@ -340,8 +490,8 @@ gk20a_instobj_ctor_iommu(struct gk20a_instmem *imem, u32 npages, u32 align, | |||
340 | } | 490 | } |
341 | } | 491 | } |
342 | 492 | ||
343 | /* Bit 34 tells that an address is to be resolved through the IOMMU */ | 493 | /* IOMMU bit tells that an address is to be resolved through the IOMMU */ |
344 | r->offset |= BIT(34 - imem->iommu_pgshift); | 494 | r->offset |= BIT(imem->iommu_bit - imem->iommu_pgshift); |
345 | 495 | ||
346 | node->base.mem.offset = ((u64)r->offset) << imem->iommu_pgshift; | 496 | node->base.mem.offset = ((u64)r->offset) << imem->iommu_pgshift; |
347 | 497 | ||
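
Since nvkm_mm_alloc() hands back offsets in IOMMU-page units, the flag is applied as BIT(iommu_bit - iommu_pgshift) and only becomes the advertised address bit (bit 34 on GK20A, per the old comment) once the offset is shifted back into a byte address on the following line. A small worked sketch, assuming GK20A's bit 34 and a 4 KiB IOMMU page (shift of 12):

    #include <linux/bitops.h>

    /* Hypothetical helper for illustration only. */
    static u64
    example_iommu_tag(u64 page_offset, unsigned int iommu_bit, unsigned int pgshift)
    {
            /* Tag while the offset is still in IOMMU-page units... */
            page_offset |= BIT_ULL(iommu_bit - pgshift);
            /* ...so the flag lands on 'iommu_bit' once converted to bytes,
             * e.g. 0x1234 with bit 34 and a 12-bit shift -> 0x401234000. */
            return page_offset << pgshift;
    }
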
@@ -356,8 +506,13 @@ release_area: | |||
356 | mutex_unlock(imem->mm_mutex); | 506 | mutex_unlock(imem->mm_mutex); |
357 | 507 | ||
358 | free_pages: | 508 | free_pages: |
359 | for (i = 0; i < npages && node->pages[i] != NULL; i++) | 509 | for (i = 0; i < npages && node->pages[i] != NULL; i++) { |
510 | dma_addr_t dma_addr = node->dma_addrs[i]; | ||
511 | if (dma_addr) | ||
512 | dma_unmap_page(dev, dma_addr, PAGE_SIZE, | ||
513 | DMA_BIDIRECTIONAL); | ||
360 | __free_page(node->pages[i]); | 514 | __free_page(node->pages[i]); |
515 | } | ||
361 | 516 | ||
362 | return ret; | 517 | return ret; |
363 | } | 518 | } |
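
The constructor now establishes a streaming DMA mapping for every backing page with dma_map_page(), checks it with dma_mapping_error(), and the destructor and error path pair each mapping with dma_unmap_page() before freeing the page. The canonical shape of that pattern as an isolated sketch (helper name invented):

    #include <linux/dma-mapping.h>
    #include <linux/gfp.h>

    /* Illustrative only: allocate and map one page for device access. */
    static int
    example_map_one(struct device *dev, struct page **pagep, dma_addr_t *addrp)
    {
            struct page *p = alloc_page(GFP_KERNEL);
            dma_addr_t addr;

            if (!p)
                    return -ENOMEM;

            addr = dma_map_page(dev, p, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
            if (dma_mapping_error(dev, addr)) {
                    __free_page(p);
                    return -ENOMEM;
            }

            *pagep = p;
            *addrp = addr;
            return 0;
            /* Teardown mirrors this: dma_unmap_page(dev, addr, PAGE_SIZE,
             * DMA_BIDIRECTIONAL) before __free_page(p). */
    }
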
@@ -367,8 +522,8 @@ gk20a_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, | |||
367 | struct nvkm_memory **pmemory) | 522 | struct nvkm_memory **pmemory) |
368 | { | 523 | { |
369 | struct gk20a_instmem *imem = gk20a_instmem(base); | 524 | struct gk20a_instmem *imem = gk20a_instmem(base); |
370 | struct gk20a_instobj *node = NULL; | ||
371 | struct nvkm_subdev *subdev = &imem->base.subdev; | 525 | struct nvkm_subdev *subdev = &imem->base.subdev; |
526 | struct gk20a_instobj *node = NULL; | ||
372 | int ret; | 527 | int ret; |
373 | 528 | ||
374 | nvkm_debug(subdev, "%s (%s): size: %x align: %x\n", __func__, | 529 | nvkm_debug(subdev, "%s (%s): size: %x align: %x\n", __func__, |
@@ -388,7 +543,6 @@ gk20a_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, | |||
388 | if (ret) | 543 | if (ret) |
389 | return ret; | 544 | return ret; |
390 | 545 | ||
391 | nvkm_memory_ctor(&gk20a_instobj_func, &node->memory); | ||
392 | node->imem = imem; | 546 | node->imem = imem; |
393 | 547 | ||
394 | /* present memory for being mapped using small pages */ | 548 | /* present memory for being mapped using small pages */ |
@@ -402,15 +556,25 @@ gk20a_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, | |||
402 | return 0; | 556 | return 0; |
403 | } | 557 | } |
404 | 558 | ||
405 | static void | 559 | static void * |
406 | gk20a_instmem_fini(struct nvkm_instmem *base) | 560 | gk20a_instmem_dtor(struct nvkm_instmem *base) |
407 | { | 561 | { |
408 | gk20a_instmem(base)->addr = ~0ULL; | 562 | struct gk20a_instmem *imem = gk20a_instmem(base); |
563 | |||
564 | /* perform some sanity checks... */ | ||
565 | if (!list_empty(&imem->vaddr_lru)) | ||
566 | nvkm_warn(&base->subdev, "instobj LRU not empty!\n"); | ||
567 | |||
568 | if (imem->vaddr_use != 0) | ||
569 | nvkm_warn(&base->subdev, "instobj vmap area not empty! " | ||
570 | "0x%x bytes still mapped\n", imem->vaddr_use); | ||
571 | |||
572 | return imem; | ||
409 | } | 573 | } |
410 | 574 | ||
411 | static const struct nvkm_instmem_func | 575 | static const struct nvkm_instmem_func |
412 | gk20a_instmem = { | 576 | gk20a_instmem = { |
413 | .fini = gk20a_instmem_fini, | 577 | .dtor = gk20a_instmem_dtor, |
414 | .memory_new = gk20a_instobj_new, | 578 | .memory_new = gk20a_instobj_new, |
415 | .persistent = true, | 579 | .persistent = true, |
416 | .zero = false, | 580 | .zero = false, |
@@ -429,23 +593,28 @@ gk20a_instmem_new(struct nvkm_device *device, int index, | |||
429 | spin_lock_init(&imem->lock); | 593 | spin_lock_init(&imem->lock); |
430 | *pimem = &imem->base; | 594 | *pimem = &imem->base; |
431 | 595 | ||
596 | /* do not allow more than 1MB of CPU-mapped instmem */ | ||
597 | imem->vaddr_use = 0; | ||
598 | imem->vaddr_max = 0x100000; | ||
599 | INIT_LIST_HEAD(&imem->vaddr_lru); | ||
600 | |||
432 | if (tdev->iommu.domain) { | 601 | if (tdev->iommu.domain) { |
433 | imem->domain = tdev->iommu.domain; | 602 | imem->mm_mutex = &tdev->iommu.mutex; |
434 | imem->mm = &tdev->iommu.mm; | 603 | imem->mm = &tdev->iommu.mm; |
604 | imem->domain = tdev->iommu.domain; | ||
435 | imem->iommu_pgshift = tdev->iommu.pgshift; | 605 | imem->iommu_pgshift = tdev->iommu.pgshift; |
436 | imem->mm_mutex = &tdev->iommu.mutex; | 606 | imem->cpu_map = gk20a_instobj_cpu_map_iommu; |
607 | imem->iommu_bit = tdev->func->iommu_bit; | ||
437 | 608 | ||
438 | nvkm_info(&imem->base.subdev, "using IOMMU\n"); | 609 | nvkm_info(&imem->base.subdev, "using IOMMU\n"); |
439 | } else { | 610 | } else { |
440 | init_dma_attrs(&imem->attrs); | 611 | init_dma_attrs(&imem->attrs); |
441 | /* | 612 | /* We will access the memory through our own mapping */ |
442 | * We will access instmem through PRAMIN and thus do not need a | ||
443 | * consistent CPU pointer or kernel mapping | ||
444 | */ | ||
445 | dma_set_attr(DMA_ATTR_NON_CONSISTENT, &imem->attrs); | 613 | dma_set_attr(DMA_ATTR_NON_CONSISTENT, &imem->attrs); |
446 | dma_set_attr(DMA_ATTR_WEAK_ORDERING, &imem->attrs); | 614 | dma_set_attr(DMA_ATTR_WEAK_ORDERING, &imem->attrs); |
447 | dma_set_attr(DMA_ATTR_WRITE_COMBINE, &imem->attrs); | 615 | dma_set_attr(DMA_ATTR_WRITE_COMBINE, &imem->attrs); |
448 | dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &imem->attrs); | 616 | dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &imem->attrs); |
617 | imem->cpu_map = gk20a_instobj_cpu_map_dma; | ||
449 | 618 | ||
450 | nvkm_info(&imem->base.subdev, "using DMA API\n"); | 619 | nvkm_info(&imem->base.subdev, "using DMA API\n"); |
451 | } | 620 | } |
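
In the DMA-API path the attributes set above deliberately relax the allocation: with DMA_ATTR_NO_KERNEL_MAPPING, dma_alloc_attrs() may return an opaque cookie rather than a usable kernel virtual address, which is acceptable here because gk20a_instobj_cpu_map_dma() builds its own write-combined vmap() on demand. A minimal sketch using this era's struct dma_attrs interface (helper name and usage are illustrative):

    #include <linux/dma-attrs.h>
    #include <linux/dma-mapping.h>

    /* Illustrative only: allocate a device buffer with relaxed attributes,
     * matching the flags set in gk20a_instmem_new() above. */
    static void *
    example_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                  struct dma_attrs *attrs)
    {
            init_dma_attrs(attrs);
            dma_set_attr(DMA_ATTR_NON_CONSISTENT, attrs);
            dma_set_attr(DMA_ATTR_WEAK_ORDERING, attrs);
            dma_set_attr(DMA_ATTR_WRITE_COMBINE, attrs);
            dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs);

            /* With NO_KERNEL_MAPPING the return value may be an opaque cookie,
             * not a CPU pointer; only 'handle' is guaranteed to be usable. */
            return dma_alloc_attrs(dev, size, handle, GFP_KERNEL, attrs);
    }
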