author	Konsta Holtta <kholtta@nvidia.com>	2016-06-06 09:23:06 -0400
committer	Terje Bergstrom <tbergstrom@nvidia.com>	2016-06-13 10:42:26 -0400
commit	987de665838f6b4aceadf52f076b91da4cc633ca (patch)
tree	b265cf18cbd3cba69202674b0b5033ee28948234 /drivers
parent	15d241a8cb1d6cf25752f4c0f1e858bbcd34db3f (diff)
gpu: nvgpu: optimize mem_desc accessor loops
Instead of going via gk20a_mem_{wr,rd}32() on each iteration, do a direct
memcpy/memset with sysmem, and minimize the PRAMIN enter/exit overhead with
vidmem.

JIRA DNVGPU-23

Change-Id: I5437e35f8393a746777a40636c1e9b5d93ced1f6
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1159524
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
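For illustration only (not part of the commit): a minimal standalone C sketch of the window-splitting arithmetic that the new pramin_access_batched() helper performs for vidmem accesses. The helper name split_into_pramin_windows(), the printf stand-in for the register I/O, and the example offsets/sizes are hypothetical; in the patch the window base is re-derived via gk20a_pramin_enter() on every iteration rather than advanced directly as done here.

/*
 * Sketch: split a buffer access into chunks that never cross a 1 MB PRAMIN
 * window, mirroring the "until_end" arithmetic in pramin_access_batched().
 * The hardware enter/exit and register I/O are replaced by a printf so this
 * compiles and runs standalone.
 */
#include <stdio.h>
#include <stdint.h>

#define SZ_1M (1024u * 1024u)

static void split_into_pramin_windows(uint32_t byteoff, uint32_t size)
{
	while (size) {
		/* bytes left until the current 1 MB window ends */
		uint32_t until_end = SZ_1M - (byteoff & (SZ_1M - 1));
		/* one batched loop call covers at most one window */
		uint32_t n = size < until_end ? size : until_end;

		printf("access %u bytes at offset 0x%x (one batch)\n",
		       n, byteoff);

		byteoff += n;
		size -= n;
	}
}

int main(void)
{
	/* example: a 3 MB buffer starting 256 KB before a window boundary */
	split_into_pramin_windows(SZ_1M - 256 * 1024, 3 * SZ_1M);
	return 0;
}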
Diffstat (limited to 'drivers')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/mm_gk20a.c	151
-rw-r--r--	drivers/gpu/nvgpu/gk20a/mm_gk20a.h	4
2 files changed, 132 insertions, 23 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 60c1b7ea..14a3dbc6 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -150,19 +150,101 @@ u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset)
 	return gk20a_mem_rd32(g, mem, offset / sizeof(u32));
 }
 
+/*
+ * Batch innerloop for the function below once per each PRAMIN range (some
+ * 4B..1MB at a time). "start" reg goes as-is to gk20a_{readl,writel}.
+ */
+typedef void (*pramin_access_batch_fn)(struct gk20a *g, u32 start, u32 words,
+		u32 **arg);
+
+/*
+ * The PRAMIN range is 1 MB, must change base addr if a buffer crosses that.
+ * This same loop is used for read/write/memset. Offset and size in bytes.
+ * One call to "loop" is done per range, with "arg" supplied.
+ */
+static inline void pramin_access_batched(struct gk20a *g, struct mem_desc *mem,
+		u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg)
+{
+	offset /= sizeof(u32);
+
+	while (size) {
+		u32 byteoff = gk20a_pramin_enter(g, mem, offset);
+		u32 start_reg = pram_data032_r(byteoff / sizeof(u32));
+		u32 until_end = SZ_1M - (byteoff & (SZ_1M - 1));
+		u32 n = min(size, until_end);
+
+		loop(g, start_reg, n / sizeof(u32), arg);
+
+		/* read back to synchronize accesses */
+		gk20a_readl(g, start_reg);
+		gk20a_pramin_exit(g, mem);
+
+		offset += n / sizeof(u32);
+		size -= n;
+	}
+}
+
+static inline void pramin_access_batch_rd_n(struct gk20a *g, u32 start,
+		u32 words, u32 **arg)
+{
+	u32 r = start, *dest_u32 = *arg;
+
+	while (words--) {
+		*dest_u32++ = gk20a_readl(g, r);
+		r += sizeof(u32);
+	}
+
+	*arg = dest_u32;
+}
+
+static inline void pramin_access_batch_wr_n(struct gk20a *g, u32 start,
+		u32 words, u32 **arg)
+{
+	u32 r = start, *src_u32 = *arg;
+
+	while (words--) {
+		gk20a_writel(g, r, *src_u32++);
+		r += sizeof(u32);
+	}
+
+	*arg = src_u32;
+}
+
+static inline void pramin_access_batch_set(struct gk20a *g, u32 start,
+		u32 words, u32 **arg)
+{
+	u32 r = start, repeat = **arg;
+
+	while (words--) {
+		gk20a_writel(g, r, repeat);
+		r += sizeof(u32);
+	}
+}
+
 void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem,
 		u32 offset, void *dest, u32 size)
 {
-	u32 i;
-	u32 *dest_u32 = dest;
-
 	WARN_ON(offset & 3);
 	WARN_ON(size & 3);
-	offset /= sizeof(u32);
-	size /= sizeof(u32);
 
-	for (i = 0; i < size; i++)
-		dest_u32[i] = gk20a_mem_rd32(g, mem, offset + i);
+	if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
+		u8 *src = (u8 *)mem->cpu_va + offset;
+
+		WARN_ON(!mem->cpu_va);
+		memcpy(dest, src, size);
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+		if (size)
+			gk20a_dbg(gpu_dbg_mem, " %p = 0x%x ... [%d bytes]",
+					src, *dest, size);
+#endif
+	} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		u32 *dest_u32 = dest;
+
+		pramin_access_batched(g, mem, offset, size,
+				pramin_access_batch_rd_n, &dest_u32);
+	} else {
+		WARN_ON("Accessing unallocated mem_desc");
+	}
 }
 
 void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data)
@@ -195,30 +277,57 @@ void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data)
 void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
 		void *src, u32 size)
 {
-	u32 i;
-	u32 *src_u32 = src;
-
 	WARN_ON(offset & 3);
 	WARN_ON(size & 3);
-	offset /= sizeof(u32);
-	size /= sizeof(u32);
 
-	for (i = 0; i < size; i++)
-		gk20a_mem_wr32(g, mem, offset + i, src_u32[i]);
+	if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
+		u8 *dest = (u8 *)mem->cpu_va + offset;
+
+		WARN_ON(!mem->cpu_va);
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+		if (size)
+			gk20a_dbg(gpu_dbg_mem, " %p = 0x%x ... [%d bytes]",
+					dest, *src, size);
+#endif
+		memcpy(dest, src, size);
+	} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		u32 *src_u32 = src;
+
+		pramin_access_batched(g, mem, offset, size,
+				pramin_access_batch_wr_n, &src_u32);
+	} else {
+		WARN_ON("Accessing unallocated mem_desc");
+	}
 }
 
 void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
-		u32 value, u32 size)
+		u32 c, u32 size)
 {
-	u32 i;
-
 	WARN_ON(offset & 3);
 	WARN_ON(size & 3);
-	offset /= sizeof(u32);
-	size /= sizeof(u32);
+	WARN_ON(c & ~0xff);
+
+	c &= 0xff;
+
+	if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
+		u8 *dest = (u8 *)mem->cpu_va + offset;
+
+		WARN_ON(!mem->cpu_va);
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+		if (size)
+			gk20a_dbg(gpu_dbg_mem, " %p = 0x%x [times %d]",
+					dest, c, size);
+#endif
+		memset(dest, c, size);
+	} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		u32 repeat_value = c | (c << 8) | (c << 16) | (c << 24);
+		u32 *p = &repeat_value;
 
-	for (i = 0; i < size; i++)
-		gk20a_mem_wr32(g, mem, offset + i, value);
+		pramin_access_batched(g, mem, offset, size,
+				pramin_access_batch_set, &p);
+	} else {
+		WARN_ON("Accessing unallocated mem_desc");
+	}
 }
 
 /*
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index d1628b07..23420fef 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -458,9 +458,9 @@ void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data);
 /* memcpy from cpu, offset and size in bytes (32b-aligned) */
 void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
 		void *src, u32 size);
-/* size and offset in bytes (32b-aligned), filled with u32s */
+/* size and offset in bytes (32b-aligned), filled with the constant byte c */
 void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
-		u32 value, u32 size);
+		u32 c, u32 size);
 
 #if 0 /*related to addr bits above, concern below TBD on which is accurate */
 #define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\
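Also for illustration only (not part of the commit): how the vidmem branch of gk20a_memset() above expands the fill byte c into a 32-bit pattern before handing it to pramin_access_batch_set(). The repeat_byte() helper and the printed example are hypothetical; only the bit arithmetic mirrors the patch.

/*
 * Sketch: build the 32-bit fill word that the batched PRAMIN writer stores
 * once per word, matching "c | (c << 8) | (c << 16) | (c << 24)" above.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t repeat_byte(uint32_t c)
{
	c &= 0xff;		/* only a single byte value is meaningful */
	return c | (c << 8) | (c << 16) | (c << 24);
}

int main(void)
{
	/* example: 0x5a becomes 0x5a5a5a5a, written once per 32-bit word */
	printf("0x%08x\n", repeat_byte(0x5a));
	return 0;
}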