Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/gr_gk20a.c')
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 8656
1 file changed, 8656 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
new file mode 100644
index 00000000..fc008169
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -0,0 +1,8656 @@
1/*
2 * GK20A Graphics
3 *
4 * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <trace/events/gk20a.h>
26#include <uapi/linux/nvgpu.h>
27
28#include <nvgpu/dma.h>
29#include <nvgpu/kmem.h>
30#include <nvgpu/gmmu.h>
31#include <nvgpu/timers.h>
32#include <nvgpu/nvgpu_common.h>
33#include <nvgpu/log.h>
34#include <nvgpu/bsearch.h>
35#include <nvgpu/sort.h>
36#include <nvgpu/bug.h>
37#include <nvgpu/firmware.h>
38#include <nvgpu/enabled.h>
39#include <nvgpu/debug.h>
40#include <nvgpu/barrier.h>
41#include <nvgpu/mm.h>
42#include <nvgpu/ctxsw_trace.h>
43
44#include "gk20a.h"
45#include "gr_ctx_gk20a.h"
46#include "gr_pri_gk20a.h"
47#include "regops_gk20a.h"
48#include "dbg_gpu_gk20a.h"
49
50#include "common/linux/os_linux.h"
51
52#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
53#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
54#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
55#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
56#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
57#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
58#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
59#include <nvgpu/hw/gk20a/hw_pri_ringmaster_gk20a.h>
60#include <nvgpu/hw/gk20a/hw_pri_ringstation_sys_gk20a.h>
61#include <nvgpu/hw/gk20a/hw_pri_ringstation_gpc_gk20a.h>
62#include <nvgpu/hw/gk20a/hw_pri_ringstation_fbp_gk20a.h>
63#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
64#include <nvgpu/hw/gk20a/hw_ltc_gk20a.h>
65#include <nvgpu/hw/gk20a/hw_fb_gk20a.h>
66#include <nvgpu/hw/gk20a/hw_therm_gk20a.h>
67#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
68
69#define BLK_SIZE (256)
70#define NV_PMM_FBP_STRIDE 0x1000
71#define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200
72#define NV_PERF_PMMGPC_CHIPLET_OFFSET 0x1000
73#define NV_PERF_PMMGPCROUTER_STRIDE 0x0200
74#define NV_PCFG_BASE 0x00088000
75#define NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE 0x0020
76#define FE_PWR_MODE_TIMEOUT_MAX 2000
77#define FE_PWR_MODE_TIMEOUT_DEFAULT 10
78#define CTXSW_MEM_SCRUBBING_TIMEOUT_MAX 1000
79#define CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT 10
80#define FECS_ARB_CMD_TIMEOUT_MAX 40
81#define FECS_ARB_CMD_TIMEOUT_DEFAULT 2
82#define GFXP_WFI_TIMEOUT_COUNT_DEFAULT 100000
83
84static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
85
86/* global ctx buffer */
87static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
88static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
89static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
90 struct channel_gk20a *c);
91static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
92
93/* channel gr ctx buffer */
94static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
95 struct channel_gk20a *c,
96 u32 class, u32 padding);
97static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
98
99/* channel patch ctx buffer */
100static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
101 struct channel_gk20a *c);
102static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
103
104/* golden ctx image */
105static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
106 struct channel_gk20a *c);
107/* elcg init */
108static void gr_gk20a_enable_elcg(struct gk20a *g);
109
110int gr_gk20a_get_ctx_id(struct gk20a *g,
111 struct channel_gk20a *c,
112 u32 *ctx_id)
113{
114 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
115 struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem;
116
117 /* Channel gr_ctx buffer is gpu cacheable.
118 Flush and invalidate before cpu update. */
119 g->ops.mm.l2_flush(g, true);
120
121 if (nvgpu_mem_begin(g, mem))
122 return -ENOMEM;
123
124 *ctx_id = nvgpu_mem_rd(g, mem,
125 ctxsw_prog_main_image_context_id_o());
126 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, "ctx_id: 0x%x", *ctx_id);
127
128 nvgpu_mem_end(g, mem);
129
130 return 0;
131}
132
133void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
134{
135 unsigned int i;
136
137 nvgpu_err(g, "gr_fecs_os_r : %d",
138 gk20a_readl(g, gr_fecs_os_r()));
139 nvgpu_err(g, "gr_fecs_cpuctl_r : 0x%x",
140 gk20a_readl(g, gr_fecs_cpuctl_r()));
141 nvgpu_err(g, "gr_fecs_idlestate_r : 0x%x",
142 gk20a_readl(g, gr_fecs_idlestate_r()));
143 nvgpu_err(g, "gr_fecs_mailbox0_r : 0x%x",
144 gk20a_readl(g, gr_fecs_mailbox0_r()));
145 nvgpu_err(g, "gr_fecs_mailbox1_r : 0x%x",
146 gk20a_readl(g, gr_fecs_mailbox1_r()));
147 nvgpu_err(g, "gr_fecs_irqstat_r : 0x%x",
148 gk20a_readl(g, gr_fecs_irqstat_r()));
149 nvgpu_err(g, "gr_fecs_irqmode_r : 0x%x",
150 gk20a_readl(g, gr_fecs_irqmode_r()));
151 nvgpu_err(g, "gr_fecs_irqmask_r : 0x%x",
152 gk20a_readl(g, gr_fecs_irqmask_r()));
153 nvgpu_err(g, "gr_fecs_irqdest_r : 0x%x",
154 gk20a_readl(g, gr_fecs_irqdest_r()));
155 nvgpu_err(g, "gr_fecs_debug1_r : 0x%x",
156 gk20a_readl(g, gr_fecs_debug1_r()));
157 nvgpu_err(g, "gr_fecs_debuginfo_r : 0x%x",
158 gk20a_readl(g, gr_fecs_debuginfo_r()));
159
160 for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
161 nvgpu_err(g, "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
162 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
163
164 nvgpu_err(g, "gr_fecs_engctl_r : 0x%x",
165 gk20a_readl(g, gr_fecs_engctl_r()));
166 nvgpu_err(g, "gr_fecs_curctx_r : 0x%x",
167 gk20a_readl(g, gr_fecs_curctx_r()));
168 nvgpu_err(g, "gr_fecs_nxtctx_r : 0x%x",
169 gk20a_readl(g, gr_fecs_nxtctx_r()));
170
171 gk20a_writel(g, gr_fecs_icd_cmd_r(),
172 gr_fecs_icd_cmd_opc_rreg_f() |
173 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
174 nvgpu_err(g, "FECS_FALCON_REG_IMB : 0x%x",
175 gk20a_readl(g, gr_fecs_icd_rdata_r()));
176
177 gk20a_writel(g, gr_fecs_icd_cmd_r(),
178 gr_fecs_icd_cmd_opc_rreg_f() |
179 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
180 nvgpu_err(g, "FECS_FALCON_REG_DMB : 0x%x",
181 gk20a_readl(g, gr_fecs_icd_rdata_r()));
182
183 gk20a_writel(g, gr_fecs_icd_cmd_r(),
184 gr_fecs_icd_cmd_opc_rreg_f() |
185 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
186 nvgpu_err(g, "FECS_FALCON_REG_CSW : 0x%x",
187 gk20a_readl(g, gr_fecs_icd_rdata_r()));
188
189 gk20a_writel(g, gr_fecs_icd_cmd_r(),
190 gr_fecs_icd_cmd_opc_rreg_f() |
191 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
192 nvgpu_err(g, "FECS_FALCON_REG_CTX : 0x%x",
193 gk20a_readl(g, gr_fecs_icd_rdata_r()));
194
195 gk20a_writel(g, gr_fecs_icd_cmd_r(),
196 gr_fecs_icd_cmd_opc_rreg_f() |
197 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
198 nvgpu_err(g, "FECS_FALCON_REG_EXCI : 0x%x",
199 gk20a_readl(g, gr_fecs_icd_rdata_r()));
200
201 for (i = 0; i < 4; i++) {
202 gk20a_writel(g, gr_fecs_icd_cmd_r(),
203 gr_fecs_icd_cmd_opc_rreg_f() |
204 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
205 nvgpu_err(g, "FECS_FALCON_REG_PC : 0x%x",
206 gk20a_readl(g, gr_fecs_icd_rdata_r()));
207
208 gk20a_writel(g, gr_fecs_icd_cmd_r(),
209 gr_fecs_icd_cmd_opc_rreg_f() |
210 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
211 nvgpu_err(g, "FECS_FALCON_REG_SP : 0x%x",
212 gk20a_readl(g, gr_fecs_icd_rdata_r()));
213 }
214}
215
216static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
217{
218 u32 i, ucode_u32_size;
219 const u32 *ucode_u32_data;
220 u32 checksum;
221
222 gk20a_dbg_fn("");
223
224 gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
225 gr_gpccs_dmemc_blk_f(0) |
226 gr_gpccs_dmemc_aincw_f(1)));
227
228 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
229 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
230
231 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
232 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
233 checksum += ucode_u32_data[i];
234 }
235
236 gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
237 gr_fecs_dmemc_blk_f(0) |
238 gr_fecs_dmemc_aincw_f(1)));
239
240 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
241 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
242
243 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
244 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
245 checksum += ucode_u32_data[i];
246 }
247 gk20a_dbg_fn("done");
248}
249
250static void gr_gk20a_load_falcon_imem(struct gk20a *g)
251{
252 u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
253 const u32 *ucode_u32_data;
254 u32 tag, i, pad_start, pad_end;
255 u32 checksum;
256
257 gk20a_dbg_fn("");
258
259 cfg = gk20a_readl(g, gr_fecs_cfg_r());
260 fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
261
262 cfg = gk20a_readl(g, gr_gpc0_cfg_r());
263 gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
264
265 /* Use the broadcast address to access all of the GPCCS units. */
266 gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
267 gr_gpccs_imemc_blk_f(0) |
268 gr_gpccs_imemc_aincw_f(1)));
269
270 /* Setup the tags for the instruction memory. */
271 tag = 0;
272 gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
273
274 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
275 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
276
277 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
278 if (i && ((i % (256/sizeof(u32))) == 0)) {
279 tag++;
280 gk20a_writel(g, gr_gpccs_imemt_r(0),
281 gr_gpccs_imemt_tag_f(tag));
282 }
283 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
284 checksum += ucode_u32_data[i];
285 }
286
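	/* Zero-fill up to the next 256-byte IMEM boundary plus one further
	 * block (capped at the total IMEM size), bumping the tag at each
	 * 256-byte boundary. */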
287 pad_start = i*4;
288 pad_end = pad_start+(256-pad_start%256)+256;
289 for (i = pad_start;
290 (i < gpccs_imem_size * 256) && (i < pad_end);
291 i += 4) {
292 if (i && ((i % 256) == 0)) {
293 tag++;
294 gk20a_writel(g, gr_gpccs_imemt_r(0),
295 gr_gpccs_imemt_tag_f(tag));
296 }
297 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
298 }
299
300 gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
301 gr_fecs_imemc_blk_f(0) |
302 gr_fecs_imemc_aincw_f(1)));
303
304 /* Setup the tags for the instruction memory. */
305 tag = 0;
306 gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
307
308 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
309 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
310
311 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
312 if (i && ((i % (256/sizeof(u32))) == 0)) {
313 tag++;
314 gk20a_writel(g, gr_fecs_imemt_r(0),
315 gr_fecs_imemt_tag_f(tag));
316 }
317 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
318 checksum += ucode_u32_data[i];
319 }
320
321 pad_start = i*4;
322 pad_end = pad_start+(256-pad_start%256)+256;
323 for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
324 if (i && ((i % 256) == 0)) {
325 tag++;
326 gk20a_writel(g, gr_fecs_imemt_r(0),
327 gr_fecs_imemt_tag_f(tag));
328 }
329 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
330 }
331}
332
333int gr_gk20a_wait_idle(struct gk20a *g, unsigned long duration_ms,
334 u32 expect_delay)
335{
336 u32 delay = expect_delay;
337 bool gr_enabled;
338 bool ctxsw_active;
339 bool gr_busy;
340 u32 gr_engine_id;
341 u32 engine_status;
342 bool ctx_status_invalid;
343 struct nvgpu_timeout timeout;
344
345 gk20a_dbg_fn("");
346
347 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
348
349 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
350
351 do {
352 /* fmodel: host gets fifo_engine_status(gr) from gr
353 only when gr_status is read */
354 gk20a_readl(g, gr_status_r());
355
356 gr_enabled = gk20a_readl(g, mc_enable_r()) &
357 mc_enable_pgraph_enabled_f();
358
359 engine_status = gk20a_readl(g,
360 fifo_engine_status_r(gr_engine_id));
361
362 ctxsw_active = engine_status &
363 fifo_engine_status_ctxsw_in_progress_f();
364
365 ctx_status_invalid =
366 (fifo_engine_status_ctx_status_v(engine_status) ==
367 fifo_engine_status_ctx_status_invalid_v());
368
369 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
370 gr_engine_status_value_busy_f();
371
372 if (!gr_enabled || ctx_status_invalid
373 || (!gr_busy && !ctxsw_active)) {
374 gk20a_dbg_fn("done");
375 return 0;
376 }
377
378 nvgpu_usleep_range(delay, delay * 2);
379 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
380
381 } while (!nvgpu_timeout_expired(&timeout));
382
383 nvgpu_err(g,
384 "timeout, ctxsw busy : %d, gr busy : %d",
385 ctxsw_active, gr_busy);
386
387 return -EAGAIN;
388}
389
390int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long duration_ms,
391 u32 expect_delay)
392{
393 u32 val;
394 u32 delay = expect_delay;
395 struct nvgpu_timeout timeout;
396
397 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL))
398 return 0;
399
400 gk20a_dbg_fn("");
401
402 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
403
404 do {
405 val = gk20a_readl(g, gr_status_r());
406
407 if (!gr_status_fe_method_lower_v(val)) {
408 gk20a_dbg_fn("done");
409 return 0;
410 }
411
412 nvgpu_usleep_range(delay, delay * 2);
413 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
414 } while (!nvgpu_timeout_expired(&timeout));
415
416 nvgpu_err(g,
417 "timeout, fe busy : %x", val);
418
419 return -EAGAIN;
420}
421
422int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
423 u32 *mailbox_ret, u32 opc_success,
424 u32 mailbox_ok, u32 opc_fail,
425 u32 mailbox_fail, bool sleepduringwait)
426{
427 struct nvgpu_timeout timeout;
428 u32 delay = GR_FECS_POLL_INTERVAL;
429 u32 check = WAIT_UCODE_LOOP;
430 u32 reg;
431
432 gk20a_dbg_fn("");
433
434 if (sleepduringwait)
435 delay = GR_IDLE_CHECK_DEFAULT;
436
437 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
438 NVGPU_TIMER_CPU_TIMER);
439
440 while (check == WAIT_UCODE_LOOP) {
441 if (nvgpu_timeout_expired(&timeout))
442 check = WAIT_UCODE_TIMEOUT;
443
444 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
445
446 if (mailbox_ret)
447 *mailbox_ret = reg;
448
449 switch (opc_success) {
450 case GR_IS_UCODE_OP_EQUAL:
451 if (reg == mailbox_ok)
452 check = WAIT_UCODE_OK;
453 break;
454 case GR_IS_UCODE_OP_NOT_EQUAL:
455 if (reg != mailbox_ok)
456 check = WAIT_UCODE_OK;
457 break;
458 case GR_IS_UCODE_OP_AND:
459 if (reg & mailbox_ok)
460 check = WAIT_UCODE_OK;
461 break;
462 case GR_IS_UCODE_OP_LESSER:
463 if (reg < mailbox_ok)
464 check = WAIT_UCODE_OK;
465 break;
466 case GR_IS_UCODE_OP_LESSER_EQUAL:
467 if (reg <= mailbox_ok)
468 check = WAIT_UCODE_OK;
469 break;
470 case GR_IS_UCODE_OP_SKIP:
471 /* do no success check */
472 break;
473 default:
474 nvgpu_err(g,
475 "invalid success opcode 0x%x", opc_success);
476
477 check = WAIT_UCODE_ERROR;
478 break;
479 }
480
481 switch (opc_fail) {
482 case GR_IS_UCODE_OP_EQUAL:
483 if (reg == mailbox_fail)
484 check = WAIT_UCODE_ERROR;
485 break;
486 case GR_IS_UCODE_OP_NOT_EQUAL:
487 if (reg != mailbox_fail)
488 check = WAIT_UCODE_ERROR;
489 break;
490 case GR_IS_UCODE_OP_AND:
491 if (reg & mailbox_fail)
492 check = WAIT_UCODE_ERROR;
493 break;
494 case GR_IS_UCODE_OP_LESSER:
495 if (reg < mailbox_fail)
496 check = WAIT_UCODE_ERROR;
497 break;
498 case GR_IS_UCODE_OP_LESSER_EQUAL:
499 if (reg <= mailbox_fail)
500 check = WAIT_UCODE_ERROR;
501 break;
502 case GR_IS_UCODE_OP_SKIP:
503 /* do no check on fail */
504 break;
505 default:
506 nvgpu_err(g,
507 "invalid fail opcode 0x%x", opc_fail);
508 check = WAIT_UCODE_ERROR;
509 break;
510 }
511
512 if (sleepduringwait) {
513 nvgpu_usleep_range(delay, delay * 2);
514 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
515 } else
516 nvgpu_udelay(delay);
517 }
518
519 if (check == WAIT_UCODE_TIMEOUT) {
520 nvgpu_err(g,
521 "timeout waiting on ucode response");
522 gk20a_fecs_dump_falcon_stats(g);
523 gk20a_gr_debug_dump(g);
524 return -1;
525 } else if (check == WAIT_UCODE_ERROR) {
526 nvgpu_err(g,
527 "ucode method failed on mailbox=%d value=0x%08x",
528 mailbox_id, reg);
529 gk20a_fecs_dump_falcon_stats(g);
530 return -1;
531 }
532
533 gk20a_dbg_fn("done");
534 return 0;
535}
536
537/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
538 * We should replace most, if not all, fecs method calls to this instead. */
539int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
540 struct fecs_method_op_gk20a op,
541 bool sleepduringwait)
542{
543 struct gr_gk20a *gr = &g->gr;
544 int ret;
545
546 nvgpu_mutex_acquire(&gr->fecs_mutex);
547
548 if (op.mailbox.id != 0)
549 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
550 op.mailbox.data);
551
552 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
553 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
554
555 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
556 gk20a_writel(g, gr_fecs_method_push_r(),
557 gr_fecs_method_push_adr_f(op.method.addr));
558
559 /* op.mailbox.id == 4 cases require waiting for completion on
560 * mailbox 0, so redirect the wait below to mailbox.id == 0 */
561 if (op.mailbox.id == 4)
562 op.mailbox.id = 0;
563
564 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
565 op.cond.ok, op.mailbox.ok,
566 op.cond.fail, op.mailbox.fail,
567 sleepduringwait);
568
569 nvgpu_mutex_release(&gr->fecs_mutex);
570
571 return ret;
572}
573
574/* Sideband mailbox writes are done a bit differently */
575int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g,
576 struct fecs_method_op_gk20a op)
577{
578 struct gr_gk20a *gr = &g->gr;
579 int ret;
580
581 nvgpu_mutex_acquire(&gr->fecs_mutex);
582
583 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(op.mailbox.id),
584 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
585
586 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
587 gk20a_writel(g, gr_fecs_method_push_r(),
588 gr_fecs_method_push_adr_f(op.method.addr));
589
590 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
591 op.cond.ok, op.mailbox.ok,
592 op.cond.fail, op.mailbox.fail,
593 false);
594
595 nvgpu_mutex_release(&gr->fecs_mutex);
596
597 return ret;
598}
599
600static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
601{
602 return gr_gk20a_submit_fecs_method_op(g,
603 (struct fecs_method_op_gk20a) {
604 .method.addr = fecs_method,
605 .method.data = ~0,
606 .mailbox = { .id = 1, /*sideband?*/
607 .data = ~0, .clr = ~0, .ret = ret,
608 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
609 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
610 .cond.ok = GR_IS_UCODE_OP_EQUAL,
611 .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
612}
613
614/* Stop processing (stall) context switches at FECS.
615 * The caller must hold the dbg_sessions_lock, else if multiple stop methods
616 * are sent to the ucode in sequence, it can get into an undefined state. */
617int gr_gk20a_disable_ctxsw(struct gk20a *g)
618{
619 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
620 return gr_gk20a_ctrl_ctxsw(g,
621 gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
622}
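/*
 * Illustrative call pattern (a sketch; it assumes the dbg_sessions_lock
 * named in the comment above is the nvgpu_mutex on struct gk20a):
 *
 *	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 *	err = gr_gk20a_disable_ctxsw(g);
 *	... read or modify context state while ctxsw is stalled ...
 *	err = gr_gk20a_enable_ctxsw(g);
 *	nvgpu_mutex_release(&g->dbg_sessions_lock);
 */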
623
624/* Start processing (continue) context switches at FECS */
625int gr_gk20a_enable_ctxsw(struct gk20a *g)
626{
627 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
628 return gr_gk20a_ctrl_ctxsw(g,
629 gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
630}
631
632int gr_gk20a_halt_pipe(struct gk20a *g)
633{
634 return gr_gk20a_submit_fecs_method_op(g,
635 (struct fecs_method_op_gk20a) {
636 .method.addr =
637 gr_fecs_method_push_adr_halt_pipeline_v(),
638 .method.data = ~0,
639 .mailbox = { .id = 1, /*sideband?*/
640 .data = ~0, .clr = ~0, .ret = NULL,
641 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
642 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
643 .cond.ok = GR_IS_UCODE_OP_EQUAL,
644 .cond.fail = GR_IS_UCODE_OP_EQUAL }, false);
645}
646
647
648int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
649{
650 u32 addr_lo;
651 u32 addr_hi;
652
653 gk20a_dbg_fn("");
654
655 addr_lo = u64_lo32(gpu_va) >> 12;
656 addr_hi = u64_hi32(gpu_va);
657
658 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(),
659 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
660 ram_in_gr_wfi_ptr_lo_f(addr_lo));
661
662 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(),
663 ram_in_gr_wfi_ptr_hi_f(addr_hi));
664
665 return 0;
666}
667
668/*
669 * Context state can be written directly, or "patched" at times. So that code
670 * can be used in either situation it is written using a series of
671 * _ctx_patch_write(..., patch) statements. However any necessary map overhead
672 * should be minimized; thus, bundle the sequence of these writes together, and
673 * set them up and close with _ctx_patch_write_begin/_ctx_patch_write_end.
674 */
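/*
 * A minimal usage sketch of that sequence (mirroring
 * gr_gk20a_commit_global_ctx_buffers() further below):
 *
 *	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
 *	if (err)
 *		return err;
 *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, true);
 *	...
 *	gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
 */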
675
676int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
677 struct channel_ctx_gk20a *ch_ctx,
678 bool update_patch_count)
679{
680 int err = 0;
681
682 err = nvgpu_mem_begin(g, &ch_ctx->patch_ctx.mem);
683 if (err)
684 return err;
685
686 if (update_patch_count) {
687 /* reset patch count if ucode has already processed it */
688 ch_ctx->patch_ctx.data_count = nvgpu_mem_rd(g,
689 &ch_ctx->gr_ctx->mem,
690 ctxsw_prog_main_image_patch_count_o());
691 nvgpu_log(g, gpu_dbg_info, "patch count reset to %d",
692 ch_ctx->patch_ctx.data_count);
693 }
694 return 0;
695}
696
697void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
698 struct channel_ctx_gk20a *ch_ctx,
699 bool update_patch_count)
700{
701 nvgpu_mem_end(g, &ch_ctx->patch_ctx.mem);
702
703 /* Write context count to context image if it is mapped */
704 if (update_patch_count) {
705 nvgpu_mem_wr(g, &ch_ctx->gr_ctx->mem,
706 ctxsw_prog_main_image_patch_count_o(),
707 ch_ctx->patch_ctx.data_count);
708 nvgpu_log(g, gpu_dbg_info, "write patch count %d",
709 ch_ctx->patch_ctx.data_count);
710 }
711}
712
713void gr_gk20a_ctx_patch_write(struct gk20a *g,
714 struct channel_ctx_gk20a *ch_ctx,
715 u32 addr, u32 data, bool patch)
716{
717 if (patch) {
718 u32 patch_slot = ch_ctx->patch_ctx.data_count *
719 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
720 if (patch_slot > (PATCH_CTX_ENTRIES_FROM_SIZE(
721 ch_ctx->patch_ctx.mem.size) -
722 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY)) {
723 nvgpu_err(g, "failed to access patch_slot %d",
724 patch_slot);
725 return;
726 }
727 nvgpu_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot, addr);
728 nvgpu_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot + 1, data);
729 ch_ctx->patch_ctx.data_count++;
730 nvgpu_log(g, gpu_dbg_info,
731 "patch addr = 0x%x data = 0x%x data_count %d",
732 addr, data, ch_ctx->patch_ctx.data_count);
733 } else {
734 gk20a_writel(g, addr, data);
735 }
736}
737
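/* Pack the instance block base address (shifted by ram_in_base_shift), the
 * aperture target and the valid bit into the gr_fecs_current_ctx format. */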
738static u32 fecs_current_ctx_data(struct gk20a *g, struct nvgpu_mem *inst_block)
739{
740 u32 ptr = u64_lo32(nvgpu_inst_block_addr(g, inst_block)
741 >> ram_in_base_shift_v());
742 u32 aperture = nvgpu_aperture_mask(g, inst_block,
743 gr_fecs_current_ctx_target_sys_mem_ncoh_f(),
744 gr_fecs_current_ctx_target_vid_mem_f());
745
746 return gr_fecs_current_ctx_ptr_f(ptr) | aperture |
747 gr_fecs_current_ctx_valid_f(1);
748}
749
750static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
751 struct channel_gk20a *c)
752{
753 u32 inst_base_ptr = u64_lo32(nvgpu_inst_block_addr(g, &c->inst_block)
754 >> ram_in_base_shift_v());
755 u32 data = fecs_current_ctx_data(g, &c->inst_block);
756 u32 ret;
757
758 gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
759 c->chid, inst_base_ptr);
760
761 ret = gr_gk20a_submit_fecs_method_op(g,
762 (struct fecs_method_op_gk20a) {
763 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
764 .method.data = data,
765 .mailbox = { .id = 0, .data = 0,
766 .clr = 0x30,
767 .ret = NULL,
768 .ok = 0x10,
769 .fail = 0x20, },
770 .cond.ok = GR_IS_UCODE_OP_AND,
771 .cond.fail = GR_IS_UCODE_OP_AND}, true);
772 if (ret)
773 nvgpu_err(g,
774 "bind channel instance failed");
775
776 return ret;
777}
778
779void gr_gk20a_write_zcull_ptr(struct gk20a *g,
780 struct nvgpu_mem *mem, u64 gpu_va)
781{
782 u32 va = u64_lo32(gpu_va >> 8);
783
784 nvgpu_mem_wr(g, mem,
785 ctxsw_prog_main_image_zcull_ptr_o(), va);
786}
787
788void gr_gk20a_write_pm_ptr(struct gk20a *g,
789 struct nvgpu_mem *mem, u64 gpu_va)
790{
791 u32 va = u64_lo32(gpu_va >> 8);
792
793 nvgpu_mem_wr(g, mem,
794 ctxsw_prog_main_image_pm_ptr_o(), va);
795}
796
797static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c)
798{
799 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
800 struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem;
801 struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header;
802 struct nvgpu_mem *ctxheader = &ctx->mem;
803 int ret = 0;
804
805 gk20a_dbg_fn("");
806
807 if (nvgpu_mem_begin(g, mem))
808 return -ENOMEM;
809
810 if (nvgpu_mem_begin(g, ctxheader)) {
811 ret = -ENOMEM;
812 goto clean_up_mem;
813 }
814
815 if (ch_ctx->zcull_ctx.gpu_va == 0 &&
816 ch_ctx->zcull_ctx.ctx_sw_mode ==
817 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
818 ret = -EINVAL;
819 goto clean_up;
820 }
821
822 ret = gk20a_disable_channel_tsg(g, c);
823 if (ret) {
824 nvgpu_err(g, "failed to disable channel/TSG");
825 goto clean_up;
826 }
827 ret = gk20a_fifo_preempt(g, c);
828 if (ret) {
829 gk20a_enable_channel_tsg(g, c);
830 nvgpu_err(g, "failed to preempt channel/TSG");
831 goto clean_up;
832 }
833
834 nvgpu_mem_wr(g, mem,
835 ctxsw_prog_main_image_zcull_o(),
836 ch_ctx->zcull_ctx.ctx_sw_mode);
837
838 if (ctxheader->gpu_va)
839 g->ops.gr.write_zcull_ptr(g, ctxheader,
840 ch_ctx->zcull_ctx.gpu_va);
841 else
842 g->ops.gr.write_zcull_ptr(g, mem, ch_ctx->zcull_ctx.gpu_va);
843
844 gk20a_enable_channel_tsg(g, c);
845
846clean_up:
847 nvgpu_mem_end(g, ctxheader);
848clean_up_mem:
849 nvgpu_mem_end(g, mem);
850
851 return ret;
852}
853
854u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc)
855{
856 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
857 u32 gpc_offset = gpc_stride * gpc;
858
859 return gpc_offset;
860}
861
862u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc)
863{
864 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
865 GPU_LIT_TPC_IN_GPC_STRIDE);
866 u32 tpc_offset = tpc_in_gpc_stride * tpc;
867
868 return tpc_offset;
869}
870
871static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
872 struct channel_gk20a *c, bool patch)
873{
874 struct gr_gk20a *gr = &g->gr;
875 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
876 u64 addr;
877 u32 size;
878
879 gk20a_dbg_fn("");
880 if (patch) {
881 int err;
882 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
883 if (err)
884 return err;
885 }
886
887 /* global pagepool buffer */
888 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
889 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
890 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
891 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
892
893 size = gr->global_ctx_buffer[PAGEPOOL].mem.size /
894 gr_scc_pagepool_total_pages_byte_granularity_v();
895
896 if (size == g->ops.gr.pagepool_default_size(g))
897 size = gr_scc_pagepool_total_pages_hwmax_v();
898
899 gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
900 addr, size);
901
902 g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
903
904 /* global bundle cb */
905 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
906 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
907 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
908 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
909
910 size = gr->bundle_cb_default_size;
911
912 gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
913 addr, size);
914
915 g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
916
917 /* global attrib cb */
918 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
919 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
920 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
921 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
922
923 gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
924 g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
925 g->ops.gr.commit_global_cb_manager(g, c, patch);
926
927 if (patch)
928 gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
929
930 return 0;
931}
932
933int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c)
934{
935 struct gr_gk20a *gr = &g->gr;
936 struct channel_ctx_gk20a *ch_ctx = NULL;
937 u32 gpm_pd_cfg;
938 u32 pd_ab_dist_cfg0;
939 u32 ds_debug;
940 u32 mpc_vtg_debug;
941 u32 pe_vaf;
942 u32 pe_vsc_vpc;
943
944 gk20a_dbg_fn("");
945
946 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
947 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
948 ds_debug = gk20a_readl(g, gr_ds_debug_r());
949 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
950
951 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
952 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
953 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
954
955 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
956 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
957 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
958 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
959 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
960 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
961
962 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
963 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false);
964 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false);
965 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
966 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, false);
967 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
968 } else {
969 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
970 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
971 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
972 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
973
974 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
975 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
976 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, false);
977 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
978 }
979
980 return 0;
981}
982
983/*
984 * Return map tiles count for given index
985 * Return 0 if index is out-of-bounds
986 */
987static u32 gr_gk20a_get_map_tile_count(struct gr_gk20a *gr, u32 index)
988{
989 if (index >= gr->map_tile_count)
990 return 0;
991
992 return gr->map_tiles[index];
993}
994
995int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
996{
997 u32 norm_entries, norm_shift;
998 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
999 u32 map0, map1, map2, map3, map4, map5;
1000
1001 if (!gr->map_tiles)
1002 return -1;
1003
1004 gk20a_dbg_fn("");
1005
1006 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1007 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1008 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1009
1010 map0 = gr_crstr_gpc_map0_tile0_f(gr_gk20a_get_map_tile_count(gr, 0)) |
1011 gr_crstr_gpc_map0_tile1_f(gr_gk20a_get_map_tile_count(gr, 1)) |
1012 gr_crstr_gpc_map0_tile2_f(gr_gk20a_get_map_tile_count(gr, 2)) |
1013 gr_crstr_gpc_map0_tile3_f(gr_gk20a_get_map_tile_count(gr, 3)) |
1014 gr_crstr_gpc_map0_tile4_f(gr_gk20a_get_map_tile_count(gr, 4)) |
1015 gr_crstr_gpc_map0_tile5_f(gr_gk20a_get_map_tile_count(gr, 5));
1016
1017 map1 = gr_crstr_gpc_map1_tile6_f(gr_gk20a_get_map_tile_count(gr, 6)) |
1018 gr_crstr_gpc_map1_tile7_f(gr_gk20a_get_map_tile_count(gr, 7)) |
1019 gr_crstr_gpc_map1_tile8_f(gr_gk20a_get_map_tile_count(gr, 8)) |
1020 gr_crstr_gpc_map1_tile9_f(gr_gk20a_get_map_tile_count(gr, 9)) |
1021 gr_crstr_gpc_map1_tile10_f(gr_gk20a_get_map_tile_count(gr, 10)) |
1022 gr_crstr_gpc_map1_tile11_f(gr_gk20a_get_map_tile_count(gr, 11));
1023
1024 map2 = gr_crstr_gpc_map2_tile12_f(gr_gk20a_get_map_tile_count(gr, 12)) |
1025 gr_crstr_gpc_map2_tile13_f(gr_gk20a_get_map_tile_count(gr, 13)) |
1026 gr_crstr_gpc_map2_tile14_f(gr_gk20a_get_map_tile_count(gr, 14)) |
1027 gr_crstr_gpc_map2_tile15_f(gr_gk20a_get_map_tile_count(gr, 15)) |
1028 gr_crstr_gpc_map2_tile16_f(gr_gk20a_get_map_tile_count(gr, 16)) |
1029 gr_crstr_gpc_map2_tile17_f(gr_gk20a_get_map_tile_count(gr, 17));
1030
1031 map3 = gr_crstr_gpc_map3_tile18_f(gr_gk20a_get_map_tile_count(gr, 18)) |
1032 gr_crstr_gpc_map3_tile19_f(gr_gk20a_get_map_tile_count(gr, 19)) |
1033 gr_crstr_gpc_map3_tile20_f(gr_gk20a_get_map_tile_count(gr, 20)) |
1034 gr_crstr_gpc_map3_tile21_f(gr_gk20a_get_map_tile_count(gr, 21)) |
1035 gr_crstr_gpc_map3_tile22_f(gr_gk20a_get_map_tile_count(gr, 22)) |
1036 gr_crstr_gpc_map3_tile23_f(gr_gk20a_get_map_tile_count(gr, 23));
1037
1038 map4 = gr_crstr_gpc_map4_tile24_f(gr_gk20a_get_map_tile_count(gr, 24)) |
1039 gr_crstr_gpc_map4_tile25_f(gr_gk20a_get_map_tile_count(gr, 25)) |
1040 gr_crstr_gpc_map4_tile26_f(gr_gk20a_get_map_tile_count(gr, 26)) |
1041 gr_crstr_gpc_map4_tile27_f(gr_gk20a_get_map_tile_count(gr, 27)) |
1042 gr_crstr_gpc_map4_tile28_f(gr_gk20a_get_map_tile_count(gr, 28)) |
1043 gr_crstr_gpc_map4_tile29_f(gr_gk20a_get_map_tile_count(gr, 29));
1044
1045 map5 = gr_crstr_gpc_map5_tile30_f(gr_gk20a_get_map_tile_count(gr, 30)) |
1046 gr_crstr_gpc_map5_tile31_f(gr_gk20a_get_map_tile_count(gr, 31)) |
1047 gr_crstr_gpc_map5_tile32_f(0) |
1048 gr_crstr_gpc_map5_tile33_f(0) |
1049 gr_crstr_gpc_map5_tile34_f(0) |
1050 gr_crstr_gpc_map5_tile35_f(0);
1051
1052 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1053 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1054 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1055 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1056 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1057 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1058
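	/* Pick norm_shift so that norm_entries = tpc_count << norm_shift is at
	 * least 16; the (1 << n) % norm_entries coefficients computed afterwards
	 * feed the wwdx map table config registers. */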
1059 switch (gr->tpc_count) {
1060 case 1:
1061 norm_shift = 4;
1062 break;
1063 case 2:
1064 case 3:
1065 norm_shift = 3;
1066 break;
1067 case 4:
1068 case 5:
1069 case 6:
1070 case 7:
1071 norm_shift = 2;
1072 break;
1073 case 8:
1074 case 9:
1075 case 10:
1076 case 11:
1077 case 12:
1078 case 13:
1079 case 14:
1080 case 15:
1081 norm_shift = 1;
1082 break;
1083 default:
1084 norm_shift = 0;
1085 break;
1086 }
1087
1088 norm_entries = gr->tpc_count << norm_shift;
1089 coeff5_mod = (1 << 5) % norm_entries;
1090 coeff6_mod = (1 << 6) % norm_entries;
1091 coeff7_mod = (1 << 7) % norm_entries;
1092 coeff8_mod = (1 << 8) % norm_entries;
1093 coeff9_mod = (1 << 9) % norm_entries;
1094 coeff10_mod = (1 << 10) % norm_entries;
1095 coeff11_mod = (1 << 11) % norm_entries;
1096
1097 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1098 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1099 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1100 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1101 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1102 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1103
1104 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1105 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1106 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1107 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1108 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1109 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1110 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1111
1112 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1113 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1114 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1115 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1116 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1117 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1118
1119 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1120 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1121 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1122
1123 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1124 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1125 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1126 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1127 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1128 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1129
1130 return 0;
1131}
1132
1133static inline u32 count_bits(u32 mask)
1134{
1135 u32 temp = mask;
1136 u32 count;
1137 for (count = 0; temp != 0; count++)
1138 temp &= temp - 1;
1139
1140 return count;
1141}
1142
1143void gr_gk20a_init_sm_id_table(struct gk20a *g)
1144{
1145 u32 gpc, tpc;
1146 u32 sm_id = 0;
1147
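	/* Walk the TPC index in the outer loop so consecutive sm_id values are
	 * spread round-robin across GPCs instead of packed per GPC. */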
1148 for (tpc = 0; tpc < g->gr.max_tpc_per_gpc_count; tpc++) {
1149 for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
1150
1151 if (tpc < g->gr.gpc_tpc_count[gpc]) {
1152 g->gr.sm_to_cluster[sm_id].tpc_index = tpc;
1153 g->gr.sm_to_cluster[sm_id].gpc_index = gpc;
1154 g->gr.sm_to_cluster[sm_id].sm_index = 0;
1155 g->gr.sm_to_cluster[sm_id].global_tpc_index =
1156 sm_id;
1157 sm_id++;
1158 }
1159 }
1160 }
1161 g->gr.no_of_sm = sm_id;
1162}
1163
1164/*
1165 * Return number of TPCs in a GPC
1166 * Return 0 if GPC index is invalid i.e. GPC is disabled
1167 */
1168u32 gr_gk20a_get_tpc_count(struct gr_gk20a *gr, u32 gpc_index)
1169{
1170 if (gpc_index >= gr->gpc_count)
1171 return 0;
1172
1173 return gr->gpc_tpc_count[gpc_index];
1174}
1175
1176int gr_gk20a_init_fs_state(struct gk20a *g)
1177{
1178 struct gr_gk20a *gr = &g->gr;
1179 u32 tpc_index, gpc_index;
1180 u32 sm_id = 0, gpc_id = 0;
1181 u32 tpc_per_gpc;
1182 u32 fuse_tpc_mask;
1183 u32 reg_index;
1184
1185 gk20a_dbg_fn("");
1186
1187 if (g->ops.gr.init_sm_id_table) {
1188 g->ops.gr.init_sm_id_table(g);
1189 /* Is table empty? */
1190 if (g->gr.no_of_sm == 0)
1191 return -EINVAL;
1192 }
1193
1194 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
1195 tpc_index = g->gr.sm_to_cluster[sm_id].tpc_index;
1196 gpc_index = g->gr.sm_to_cluster[sm_id].gpc_index;
1197
1198 g->ops.gr.program_sm_id_numbering(g, gpc_index, tpc_index, sm_id);
1199
1200 if (g->ops.gr.program_active_tpc_counts)
1201 g->ops.gr.program_active_tpc_counts(g, gpc_index);
1202 }
1203
1204 for (reg_index = 0, gpc_id = 0;
1205 reg_index < gr_pd_num_tpc_per_gpc__size_1_v();
1206 reg_index++, gpc_id += 8) {
1207
1208 tpc_per_gpc =
1209 gr_pd_num_tpc_per_gpc_count0_f(gr_gk20a_get_tpc_count(gr, gpc_id + 0)) |
1210 gr_pd_num_tpc_per_gpc_count1_f(gr_gk20a_get_tpc_count(gr, gpc_id + 1)) |
1211 gr_pd_num_tpc_per_gpc_count2_f(gr_gk20a_get_tpc_count(gr, gpc_id + 2)) |
1212 gr_pd_num_tpc_per_gpc_count3_f(gr_gk20a_get_tpc_count(gr, gpc_id + 3)) |
1213 gr_pd_num_tpc_per_gpc_count4_f(gr_gk20a_get_tpc_count(gr, gpc_id + 4)) |
1214 gr_pd_num_tpc_per_gpc_count5_f(gr_gk20a_get_tpc_count(gr, gpc_id + 5)) |
1215 gr_pd_num_tpc_per_gpc_count6_f(gr_gk20a_get_tpc_count(gr, gpc_id + 6)) |
1216 gr_pd_num_tpc_per_gpc_count7_f(gr_gk20a_get_tpc_count(gr, gpc_id + 7));
1217
1218 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1219 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1220 }
1221
1222 /* gr__setup_pd_mapping stubbed for gk20a */
1223 g->ops.gr.setup_rop_mapping(g, gr);
1224 if (g->ops.gr.setup_alpha_beta_tables)
1225 g->ops.gr.setup_alpha_beta_tables(g, gr);
1226
1227 for (gpc_index = 0;
1228 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1229 gpc_index += 4) {
1230
1231 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1232 gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1233 gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1234 gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1235 gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1236 }
1237
1238 fuse_tpc_mask = g->ops.gr.get_gpc_tpc_mask(g, 0);
1239 if (g->tpc_fs_mask_user &&
1240 fuse_tpc_mask == (0x1U << gr->max_tpc_count) - 1U) {
1241 u32 val = g->tpc_fs_mask_user;
1242 val &= (0x1U << gr->max_tpc_count) - 1U;
1243 gk20a_writel(g, gr_cwd_fs_r(),
1244 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1245 gr_cwd_fs_num_tpcs_f(hweight32(val)));
1246 } else {
1247 gk20a_writel(g, gr_cwd_fs_r(),
1248 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1249 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1250 }
1251
1252 gk20a_writel(g, gr_bes_zrop_settings_r(),
1253 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1254 gk20a_writel(g, gr_bes_crop_settings_r(),
1255 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1256
1257 return 0;
1258}
1259
1260static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1261{
1262 struct gk20a *g = c->g;
1263 int ret;
1264
1265 gk20a_dbg_fn("");
1266
1267 ret = gr_gk20a_submit_fecs_method_op(g,
1268 (struct fecs_method_op_gk20a) {
1269 .method.addr = save_type,
1270 .method.data = fecs_current_ctx_data(g, &c->inst_block),
1271 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1272 .ok = 1, .fail = 2,
1273 },
1274 .cond.ok = GR_IS_UCODE_OP_AND,
1275 .cond.fail = GR_IS_UCODE_OP_AND,
1276 }, true);
1277
1278 if (ret)
1279 nvgpu_err(g, "save context image failed");
1280
1281 return ret;
1282}
1283
1284static u32 gk20a_init_sw_bundle(struct gk20a *g)
1285{
1286 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1287 u32 last_bundle_data = 0;
1288 u32 err = 0;
1289 unsigned int i;
1290
1291 /* disable fe_go_idle */
1292 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1293 gr_fe_go_idle_timeout_count_disabled_f());
1294 /* enable pipe mode override */
1295 gk20a_writel(g, gr_pipe_bundle_config_r(),
1296 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1297
1298 /* load bundle init */
1299 for (i = 0; i < sw_bundle_init->count; i++) {
1300 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1301 gk20a_writel(g, gr_pipe_bundle_data_r(),
1302 sw_bundle_init->l[i].value);
1303 last_bundle_data = sw_bundle_init->l[i].value;
1304 }
1305
1306 gk20a_writel(g, gr_pipe_bundle_address_r(),
1307 sw_bundle_init->l[i].addr);
1308
1309 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1310 GR_GO_IDLE_BUNDLE) {
1311 err = gr_gk20a_wait_idle(g,
1312 gk20a_get_gr_idle_timeout(g),
1313 GR_IDLE_CHECK_DEFAULT);
1314 if (err)
1315 goto error;
1316 }
1317
1318 err = gr_gk20a_wait_fe_idle(g, gk20a_get_gr_idle_timeout(g),
1319 GR_IDLE_CHECK_DEFAULT);
1320 if (err)
1321 goto error;
1322 }
1323
1324 if (!err && g->ops.gr.init_sw_veid_bundle) {
1325 err = g->ops.gr.init_sw_veid_bundle(g);
1326 if (err)
1327 goto error;
1328 }
1329
1330 /* disable pipe mode override */
1331 gk20a_writel(g, gr_pipe_bundle_config_r(),
1332 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1333
1334 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1335 GR_IDLE_CHECK_DEFAULT);
1336
1337 /* restore fe_go_idle */
1338 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1339 gr_fe_go_idle_timeout_count_prod_f());
1340
1341 return err;
1342
1343error:
1344 /* in case of error skip waiting for GR idle - just restore state */
1345 gk20a_writel(g, gr_pipe_bundle_config_r(),
1346 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1347
1348 /* restore fe_go_idle */
1349 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1350 gr_fe_go_idle_timeout_count_prod_f());
1351
1352 return err;
1353}
1354
1355/* init global golden image from a fresh gr_ctx in channel ctx.
1356 save a copy in local_golden_image in ctx_vars */
1357static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1358 struct channel_gk20a *c)
1359{
1360 struct gr_gk20a *gr = &g->gr;
1361 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1362 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1363 u32 ctx_header_words;
1364 u32 i;
1365 u32 data;
1366 struct nvgpu_mem *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
1367 struct nvgpu_mem *gr_mem = &ch_ctx->gr_ctx->mem;
1368 u32 err = 0;
1369 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
1370 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
1371 u32 last_method_data = 0;
1372
1373 gk20a_dbg_fn("");
1374
1375 /* golden ctx is global to all channels. Although only the first
1376 channel initializes the golden image, the driver needs to prevent multiple
1377 channels from initializing the golden ctx at the same time */
1378 nvgpu_mutex_acquire(&gr->ctx_mutex);
1379
1380 if (gr->ctx_vars.golden_image_initialized) {
1381 goto clean_up;
1382 }
1383 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1384 struct nvgpu_timeout timeout;
1385
1386 nvgpu_timeout_init(g, &timeout,
1387 FE_PWR_MODE_TIMEOUT_MAX /
1388 FE_PWR_MODE_TIMEOUT_DEFAULT,
1389 NVGPU_TIMER_RETRY_TIMER);
1390 gk20a_writel(g, gr_fe_pwr_mode_r(),
1391 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_force_on_f());
1392 do {
1393 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1394 if (req == gr_fe_pwr_mode_req_done_v())
1395 break;
1396 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1397 } while (!nvgpu_timeout_expired_msg(&timeout,
1398 "timeout forcing FE on"));
1399 }
1400
1401
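	/* Pulse a context reset through FECS: assert SYS/GPC/BE context reset
	 * (with halts and engine resets left disabled), then deassert it, reading
	 * the register back and delaying briefly after each write. */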
1402 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1403 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1404 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1405 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1406 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1407 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1408 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1409 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
1410 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
1411 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
1412 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1413 nvgpu_udelay(10);
1414
1415 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1416 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1417 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1418 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1419 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1420 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1421 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1422 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
1423 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
1424 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
1425 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1426 nvgpu_udelay(10);
1427
1428 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1429 struct nvgpu_timeout timeout;
1430
1431 nvgpu_timeout_init(g, &timeout,
1432 FE_PWR_MODE_TIMEOUT_MAX /
1433 FE_PWR_MODE_TIMEOUT_DEFAULT,
1434 NVGPU_TIMER_RETRY_TIMER);
1435 gk20a_writel(g, gr_fe_pwr_mode_r(),
1436 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_auto_f());
1437
1438 do {
1439 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1440 if (req == gr_fe_pwr_mode_req_done_v())
1441 break;
1442 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1443 } while (!nvgpu_timeout_expired_msg(&timeout,
1444 "timeout setting FE power to auto"));
1445 }
1446
1447 /* clear scc ram */
1448 gk20a_writel(g, gr_scc_init_r(),
1449 gr_scc_init_ram_trigger_f());
1450
1451 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1452 if (err)
1453 goto clean_up;
1454
1455 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1456 GR_IDLE_CHECK_DEFAULT);
1457
1458 /* load ctx init */
1459 for (i = 0; i < sw_ctx_load->count; i++)
1460 gk20a_writel(g, sw_ctx_load->l[i].addr,
1461 sw_ctx_load->l[i].value);
1462
1463 if (g->ops.gr.init_preemption_state)
1464 g->ops.gr.init_preemption_state(g);
1465
1466 if (g->ops.clock_gating.blcg_gr_load_gating_prod)
1467 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
1468
1469 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1470 GR_IDLE_CHECK_DEFAULT);
1471 if (err)
1472 goto clean_up;
1473
1474 /* disable fe_go_idle */
1475 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1476 gr_fe_go_idle_timeout_count_disabled_f());
1477
1478 err = gr_gk20a_commit_global_ctx_buffers(g, c, false);
1479 if (err)
1480 goto clean_up;
1481
1482 /* override a few ctx state registers */
1483 g->ops.gr.commit_global_timeslice(g, c);
1484
1485 /* floorsweep anything left */
1486 err = g->ops.gr.init_fs_state(g);
1487 if (err)
1488 goto clean_up;
1489
1490 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1491 GR_IDLE_CHECK_DEFAULT);
1492 if (err)
1493 goto restore_fe_go_idle;
1494
1495 err = gk20a_init_sw_bundle(g);
1496 if (err)
1497 goto clean_up;
1498
1499restore_fe_go_idle:
1500 /* restore fe_go_idle */
1501 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1502 gr_fe_go_idle_timeout_count_prod_f());
1503
1504 if (err || gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1505 GR_IDLE_CHECK_DEFAULT))
1506 goto clean_up;
1507
1508 /* load method init */
1509 if (sw_method_init->count) {
1510 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1511 sw_method_init->l[0].value);
1512 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1513 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1514 sw_method_init->l[0].addr);
1515 last_method_data = sw_method_init->l[0].value;
1516 }
1517 for (i = 1; i < sw_method_init->count; i++) {
1518 if (sw_method_init->l[i].value != last_method_data) {
1519 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1520 sw_method_init->l[i].value);
1521 last_method_data = sw_method_init->l[i].value;
1522 }
1523 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1524 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1525 sw_method_init->l[i].addr);
1526 }
1527
1528 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1529 GR_IDLE_CHECK_DEFAULT);
1530 if (err)
1531 goto clean_up;
1532
1533 nvgpu_kfree(g, gr->sm_error_states);
1534
1535 /* we need to allocate this after g->ops.gr.init_fs_state() since
1536 * we initialize gr->no_of_sm in this function
1537 */
1538 gr->sm_error_states = nvgpu_kzalloc(g,
1539 sizeof(struct nvgpu_gr_sm_error_state)
1540 * gr->no_of_sm);
1541 if (!gr->sm_error_states) {
1542 err = -ENOMEM;
1543 goto restore_fe_go_idle;
1544 }
1545
1546 if (nvgpu_mem_begin(g, gold_mem))
1547 goto clean_up;
1548
1549 if (nvgpu_mem_begin(g, gr_mem))
1550 goto clean_up;
1551
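	/* ctx_header_bytes rounded up and converted to 32-bit words; that many
	 * words of the fresh gr ctx are then copied into the golden image. */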
1552 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1553 ctx_header_words >>= 2;
1554
1555 g->ops.mm.l2_flush(g, true);
1556
1557 for (i = 0; i < ctx_header_words; i++) {
1558 data = nvgpu_mem_rd32(g, gr_mem, i);
1559 nvgpu_mem_wr32(g, gold_mem, i, data);
1560 }
1561 nvgpu_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(),
1562 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1563
1564 g->ops.gr.write_zcull_ptr(g, gold_mem, 0);
1565
1566 err = g->ops.gr.commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1567 if (err)
1568 goto clean_up;
1569
1570 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1571
1572
1573
1574 if (gr->ctx_vars.local_golden_image == NULL) {
1575
1576 gr->ctx_vars.local_golden_image =
1577 nvgpu_vzalloc(g, gr->ctx_vars.golden_image_size);
1578
1579 if (gr->ctx_vars.local_golden_image == NULL) {
1580 err = -ENOMEM;
1581 goto clean_up;
1582 }
1583 nvgpu_mem_rd_n(g, gold_mem, 0,
1584 gr->ctx_vars.local_golden_image,
1585 gr->ctx_vars.golden_image_size);
1586
1587 }
1588
1589 err = g->ops.gr.commit_inst(c, gr_mem->gpu_va);
1590 if (err)
1591 goto clean_up;
1592
1593 gr->ctx_vars.golden_image_initialized = true;
1594
1595 gk20a_writel(g, gr_fecs_current_ctx_r(),
1596 gr_fecs_current_ctx_valid_false_f());
1597
1598clean_up:
1599 if (err)
1600 nvgpu_err(g, "fail");
1601 else
1602 gk20a_dbg_fn("done");
1603
1604 nvgpu_mem_end(g, gold_mem);
1605 nvgpu_mem_end(g, gr_mem);
1606
1607 nvgpu_mutex_release(&gr->ctx_mutex);
1608 return err;
1609}
1610
1611int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1612 struct channel_gk20a *c,
1613 bool enable_smpc_ctxsw)
1614{
1615 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1616 struct nvgpu_mem *mem;
1617 u32 data;
1618 int ret;
1619
1620 gk20a_dbg_fn("");
1621
1622 if (!ch_ctx->gr_ctx) {
1623 nvgpu_err(g, "no graphics context allocated");
1624 return -EFAULT;
1625 }
1626
1627 mem = &ch_ctx->gr_ctx->mem;
1628
1629 ret = gk20a_disable_channel_tsg(g, c);
1630 if (ret) {
1631 nvgpu_err(g, "failed to disable channel/TSG");
1632 goto out;
1633 }
1634 ret = gk20a_fifo_preempt(g, c);
1635 if (ret) {
1636 gk20a_enable_channel_tsg(g, c);
1637 nvgpu_err(g, "failed to preempt channel/TSG");
1638 goto out;
1639 }
1640
1641 /* Channel gr_ctx buffer is gpu cacheable.
1642 Flush and invalidate before cpu update. */
1643 g->ops.mm.l2_flush(g, true);
1644
1645 if (nvgpu_mem_begin(g, mem)) {
1646 ret = -ENOMEM;
1647 goto out;
1648 }
1649
1650 data = nvgpu_mem_rd(g, mem,
1651 ctxsw_prog_main_image_pm_o());
1652
1653 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1654 data |= enable_smpc_ctxsw ?
1655 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1656 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1657
1658 nvgpu_mem_wr(g, mem,
1659 ctxsw_prog_main_image_pm_o(), data);
1660
1661 nvgpu_mem_end(g, mem);
1662out:
1663 gk20a_enable_channel_tsg(g, c);
1664 return ret;
1665}
1666
1667int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1668 struct channel_gk20a *c,
1669 bool enable_hwpm_ctxsw)
1670{
1671 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1672 struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx;
1673 struct nvgpu_mem *gr_mem;
1674 u32 data;
1675 u64 virt_addr;
1676 struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header;
1677 struct nvgpu_mem *ctxheader = &ctx->mem;
1678 int ret;
1679
1680 gk20a_dbg_fn("");
1681
1682 if (!ch_ctx->gr_ctx) {
1683 nvgpu_err(g, "no graphics context allocated");
1684 return -EFAULT;
1685 }
1686
1687 gr_mem = &ch_ctx->gr_ctx->mem;
1688
1689 if (enable_hwpm_ctxsw) {
1690 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f())
1691 return 0;
1692 } else {
1693 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f())
1694 return 0;
1695 }
1696
1697 ret = gk20a_disable_channel_tsg(g, c);
1698 if (ret) {
1699 nvgpu_err(g, "failed to disable channel/TSG");
1700 return ret;
1701 }
1702
1703 ret = gk20a_fifo_preempt(g, c);
1704 if (ret) {
1705 gk20a_enable_channel_tsg(g, c);
1706 nvgpu_err(g, "failed to preempt channel/TSG");
1707 return ret;
1708 }
1709
1710 /* Channel gr_ctx buffer is gpu cacheable.
1711 Flush and invalidate before cpu update. */
1712 g->ops.mm.l2_flush(g, true);
1713
1714 if (enable_hwpm_ctxsw) {
1715 /* Allocate buffer if necessary */
1716 if (pm_ctx->mem.gpu_va == 0) {
1717 ret = nvgpu_dma_alloc_flags_sys(g,
1718 NVGPU_DMA_NO_KERNEL_MAPPING,
1719 g->gr.ctx_vars.pm_ctxsw_image_size,
1720 &pm_ctx->mem);
1721 if (ret) {
1722 c->g->ops.fifo.enable_channel(c);
1723 nvgpu_err(g,
1724 "failed to allocate pm ctxt buffer");
1725 return ret;
1726 }
1727
1728 pm_ctx->mem.gpu_va = nvgpu_gmmu_map(c->vm,
1729 &pm_ctx->mem,
1730 pm_ctx->mem.size,
1731 NVGPU_AS_MAP_BUFFER_FLAGS_CACHEABLE,
1732 gk20a_mem_flag_none, true,
1733 pm_ctx->mem.aperture);
1734 if (!pm_ctx->mem.gpu_va) {
1735 nvgpu_err(g,
1736 "failed to map pm ctxt buffer");
1737 nvgpu_dma_free(g, &pm_ctx->mem);
1738 c->g->ops.fifo.enable_channel(c);
1739 return -ENOMEM;
1740 }
1741 }
1742
1743 /* Now clear the buffer */
1744 if (nvgpu_mem_begin(g, &pm_ctx->mem)) {
1745 ret = -ENOMEM;
1746 goto cleanup_pm_buf;
1747 }
1748
1749 nvgpu_memset(g, &pm_ctx->mem, 0, 0, pm_ctx->mem.size);
1750
1751 nvgpu_mem_end(g, &pm_ctx->mem);
1752 }
1753
1754 if (nvgpu_mem_begin(g, gr_mem)) {
1755 ret = -ENOMEM;
1756 goto cleanup_pm_buf;
1757 }
1758
1759 if (nvgpu_mem_begin(g, ctxheader)) {
1760 ret = -ENOMEM;
1761 goto clean_up_mem;
1762 }
1763
1764 data = nvgpu_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o());
1765 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1766
1767 if (enable_hwpm_ctxsw) {
1768 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();
1769
1770 virt_addr = pm_ctx->mem.gpu_va;
1771 } else {
1772 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1773 virt_addr = 0;
1774 }
1775
1776 data |= pm_ctx->pm_mode;
1777
1778 nvgpu_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data);
1779
1780 if (ctxheader->gpu_va)
1781 g->ops.gr.write_pm_ptr(g, ctxheader, virt_addr);
1782 else
1783 g->ops.gr.write_pm_ptr(g, gr_mem, virt_addr);
1784
1785 nvgpu_mem_end(g, ctxheader);
1786 nvgpu_mem_end(g, gr_mem);
1787
1788 /* enable channel */
1789 gk20a_enable_channel_tsg(g, c);
1790
1791 return 0;
1792clean_up_mem:
1793 nvgpu_mem_end(g, gr_mem);
1794cleanup_pm_buf:
1795 nvgpu_gmmu_unmap(c->vm, &pm_ctx->mem, pm_ctx->mem.gpu_va);
1796 nvgpu_dma_free(g, &pm_ctx->mem);
1797 memset(&pm_ctx->mem, 0, sizeof(struct nvgpu_mem));
1798
1799 gk20a_enable_channel_tsg(g, c);
1800 return ret;
1801}
1802
1803void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
1804 struct nvgpu_mem *mem)
1805{
1806 nvgpu_mem_wr(g, mem,
1807 ctxsw_prog_main_image_num_save_ops_o(), 0);
1808 nvgpu_mem_wr(g, mem,
1809 ctxsw_prog_main_image_num_restore_ops_o(), 0);
1810}
1811
1812/* load a saved copy of the golden image into the channel gr_ctx */
1813int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1814 struct channel_gk20a *c)
1815{
1816 struct gr_gk20a *gr = &g->gr;
1817 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1818 u32 virt_addr_lo;
1819 u32 virt_addr_hi;
1820 u64 virt_addr = 0;
1821 u32 v, data;
1822 int ret = 0;
1823 struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem;
1824 struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header;
1825 struct nvgpu_mem *ctxheader = &ctx->mem;
1826
1827 gk20a_dbg_fn("");
1828
1829 if (gr->ctx_vars.local_golden_image == NULL)
1830 return -1;
1831
1832	/* Channel gr_ctx buffer is GPU cacheable.
1833	 * Flush and invalidate before CPU update. */
1834 g->ops.mm.l2_flush(g, true);
1835
1836 if (nvgpu_mem_begin(g, mem))
1837 return -ENOMEM;
1838
1839 if (nvgpu_mem_begin(g, ctxheader)) {
1840 ret = -ENOMEM;
1841 goto clean_up_mem;
1842 }
1843
1844 nvgpu_mem_wr_n(g, mem, 0,
1845 gr->ctx_vars.local_golden_image,
1846 gr->ctx_vars.golden_image_size);
1847
1848 if (g->ops.gr.init_ctxsw_hdr_data)
1849 g->ops.gr.init_ctxsw_hdr_data(g, mem);
1850
1851 if (g->ops.gr.enable_cde_in_fecs && c->cde)
1852 g->ops.gr.enable_cde_in_fecs(g, mem);
1853
1854 /* set priv access map */
1855 virt_addr_lo =
1856 u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1857 virt_addr_hi =
1858 u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1859
1860 if (g->allow_all)
1861 data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f();
1862 else
1863 data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();
1864
1865 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(),
1866 data);
1867
1868 if (ctxheader->gpu_va) {
1869 nvgpu_mem_wr(g, ctxheader,
1870 ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
1871 virt_addr_lo);
1872 nvgpu_mem_wr(g, ctxheader,
1873 ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
1874 virt_addr_hi);
1875 } else {
1876 nvgpu_mem_wr(g, mem,
1877 ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
1878 virt_addr_lo);
1879 nvgpu_mem_wr(g, mem,
1880 ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
1881 virt_addr_hi);
1882 }
1883 /* disable verif features */
1884 v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o());
1885 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1886 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1887 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v);
1888
1889 if (g->ops.gr.update_ctxsw_preemption_mode)
1890 g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, mem);
1891
1892 if (g->ops.gr.update_boosted_ctx)
1893 g->ops.gr.update_boosted_ctx(g, mem, ch_ctx->gr_ctx);
1894
1895 virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va);
1896 virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va);
1897
1898 nvgpu_log(g, gpu_dbg_info, "write patch count = %d",
1899 ch_ctx->patch_ctx.data_count);
1900 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(),
1901 ch_ctx->patch_ctx.data_count);
1902
1903 if (ctxheader->gpu_va) {
1904 nvgpu_mem_wr(g, ctxheader,
1905 ctxsw_prog_main_image_patch_adr_lo_o(),
1906 virt_addr_lo);
1907 nvgpu_mem_wr(g, ctxheader,
1908 ctxsw_prog_main_image_patch_adr_hi_o(),
1909 virt_addr_hi);
1910 } else {
1911 nvgpu_mem_wr(g, mem,
1912 ctxsw_prog_main_image_patch_adr_lo_o(),
1913 virt_addr_lo);
1914 nvgpu_mem_wr(g, mem,
1915 ctxsw_prog_main_image_patch_adr_hi_o(),
1916 virt_addr_hi);
1917 }
1918
1919 /* Update main header region of the context buffer with the info needed
1920 * for PM context switching, including mode and possibly a pointer to
1921 * the PM backing store.
1922 */
1923 if (ch_ctx->pm_ctx.pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) {
1924 if (ch_ctx->pm_ctx.mem.gpu_va == 0) {
1925 nvgpu_err(g,
1926 "context switched pm with no pm buffer!");
1927 nvgpu_mem_end(g, mem);
1928 return -EFAULT;
1929 }
1930
1931 virt_addr = ch_ctx->pm_ctx.mem.gpu_va;
1932 } else
1933 virt_addr = 0;
1934
1935 data = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_pm_o());
1936 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1937 data |= ch_ctx->pm_ctx.pm_mode;
1938
1939 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data);
1940
1941 if (ctxheader->gpu_va)
1942 g->ops.gr.write_pm_ptr(g, ctxheader, virt_addr);
1943 else
1944 g->ops.gr.write_pm_ptr(g, mem, virt_addr);
1945
1946
1947 nvgpu_mem_end(g, ctxheader);
1948clean_up_mem:
1949 nvgpu_mem_end(g, mem);
1950
1951 return ret;
1952}
1953
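/*
 * Kick off the already-loaded FECS and GPCCS ucode: clear the FECS mailbox,
 * drop the "require ctx" condition on both falcon DMA controllers and start
 * both falcon CPUs.
 */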
1954static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1955{
1956 gk20a_dbg_fn("");
1957
1958 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1959 gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1960
1961 gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1962 gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1963
1964 gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1965 gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1966
1967 gk20a_dbg_fn("done");
1968}
1969
1970static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1971{
1972 struct mm_gk20a *mm = &g->mm;
1973 struct vm_gk20a *vm = mm->pmu.vm;
1974 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1975 int err;
1976
1977 err = g->ops.mm.alloc_inst_block(g, &ucode_info->inst_blk_desc);
1978 if (err)
1979 return err;
1980
1981 g->ops.mm.init_inst_block(&ucode_info->inst_blk_desc, vm, 0);
1982
1983 /* Map ucode surface to GMMU */
1984 ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm,
1985 &ucode_info->surface_desc,
1986 ucode_info->surface_desc.size,
1987 0, /* flags */
1988 gk20a_mem_flag_read_only,
1989 false,
1990 ucode_info->surface_desc.aperture);
1991 if (!ucode_info->surface_desc.gpu_va) {
1992 nvgpu_err(g, "failed to update gmmu ptes");
1993 return -ENOMEM;
1994 }
1995
1996 return 0;
1997}
1998
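/*
 * Record one ucode segment (boot, code or data) at the current offset in the
 * ucode surface and advance the offset, keeping segments aligned to 256-byte
 * (BLK_SIZE) boundaries.
 */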
1999static void gr_gk20a_init_ctxsw_ucode_segment(
2000 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
2001{
2002 p_seg->offset = *offset;
2003 p_seg->size = size;
2004 *offset = ALIGN(*offset + size, BLK_SIZE);
2005}
2006
2007static void gr_gk20a_init_ctxsw_ucode_segments(
2008 struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
2009 struct gk20a_ctxsw_bootloader_desc *bootdesc,
2010 u32 code_size, u32 data_size)
2011{
2012 u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
2013 segments->boot_entry = bootdesc->entry_point;
2014 segments->boot_imem_offset = bootdesc->imem_offset;
2015 gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
2016 gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
2017 gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
2018}
2019
2020static int gr_gk20a_copy_ctxsw_ucode_segments(
2021 struct gk20a *g,
2022 struct nvgpu_mem *dst,
2023 struct gk20a_ctxsw_ucode_segments *segments,
2024 u32 *bootimage,
2025 u32 *code, u32 *data)
2026{
2027 unsigned int i;
2028
2029 nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage,
2030 segments->boot.size);
2031 nvgpu_mem_wr_n(g, dst, segments->code.offset, code,
2032 segments->code.size);
2033 nvgpu_mem_wr_n(g, dst, segments->data.offset, data,
2034 segments->data.size);
2035
2036 /* compute a "checksum" for the boot binary to detect its version */
2037 segments->boot_signature = 0;
2038 for (i = 0; i < segments->boot.size / sizeof(u32); i++)
2039 segments->boot_signature += bootimage[i];
2040
2041 return 0;
2042}
2043
2044int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2045{
2046 struct mm_gk20a *mm = &g->mm;
2047 struct vm_gk20a *vm = mm->pmu.vm;
2048 struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
2049 struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
2050 struct nvgpu_firmware *fecs_fw;
2051 struct nvgpu_firmware *gpccs_fw;
2052 u32 *fecs_boot_image;
2053 u32 *gpccs_boot_image;
2054 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2055 u32 ucode_size;
2056 int err = 0;
2057
2058 fecs_fw = nvgpu_request_firmware(g, GK20A_FECS_UCODE_IMAGE, 0);
2059 if (!fecs_fw) {
2060 nvgpu_err(g, "failed to load fecs ucode!!");
2061 return -ENOENT;
2062 }
2063
2064 fecs_boot_desc = (void *)fecs_fw->data;
2065 fecs_boot_image = (void *)(fecs_fw->data +
2066 sizeof(struct gk20a_ctxsw_bootloader_desc));
2067
2068 gpccs_fw = nvgpu_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE, 0);
2069 if (!gpccs_fw) {
2070 nvgpu_release_firmware(g, fecs_fw);
2071 nvgpu_err(g, "failed to load gpccs ucode!!");
2072 return -ENOENT;
2073 }
2074
2075 gpccs_boot_desc = (void *)gpccs_fw->data;
2076 gpccs_boot_image = (void *)(gpccs_fw->data +
2077 sizeof(struct gk20a_ctxsw_bootloader_desc));
2078
2079 ucode_size = 0;
2080 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
2081 fecs_boot_desc,
2082 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
2083 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
2084 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
2085 gpccs_boot_desc,
2086 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
2087 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
2088
2089 err = nvgpu_dma_alloc_sys(g, ucode_size, &ucode_info->surface_desc);
2090 if (err)
2091 goto clean_up;
2092
2093 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2094 &ucode_info->fecs,
2095 fecs_boot_image,
2096 g->gr.ctx_vars.ucode.fecs.inst.l,
2097 g->gr.ctx_vars.ucode.fecs.data.l);
2098
2099 nvgpu_release_firmware(g, fecs_fw);
2100 fecs_fw = NULL;
2101
2102 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2103 &ucode_info->gpccs,
2104 gpccs_boot_image,
2105 g->gr.ctx_vars.ucode.gpccs.inst.l,
2106 g->gr.ctx_vars.ucode.gpccs.data.l);
2107
2108 nvgpu_release_firmware(g, gpccs_fw);
2109 gpccs_fw = NULL;
2110
2111 err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
2112 if (err)
2113 goto clean_up;
2114
2115 return 0;
2116
2117clean_up:
2118 if (ucode_info->surface_desc.gpu_va)
2119 nvgpu_gmmu_unmap(vm, &ucode_info->surface_desc,
2120 ucode_info->surface_desc.gpu_va);
2121 nvgpu_dma_free(g, &ucode_info->surface_desc);
2122
2123 nvgpu_release_firmware(g, gpccs_fw);
2124 gpccs_fw = NULL;
2125 nvgpu_release_firmware(g, fecs_fw);
2126 fecs_fw = NULL;
2127
2128 return err;
2129}
2130
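/*
 * Bind the ctxsw ucode instance block to FECS through the FECS arbiter: wait
 * for the arbiter to go idle, program the new/current context pointers with
 * the instance block address and aperture, and issue arbiter commands,
 * polling for completion after each one.
 */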
2131void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
2132{
2133 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2134 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2135 u64 inst_ptr;
2136 u32 val;
2137
2138 while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2139 gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
2140 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2141 retries--;
2142 }
2143 if (!retries) {
2144 nvgpu_err(g,
2145 "arbiter idle timeout, status: %08x",
2146 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2147 }
2148
2149 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2150
2151 inst_ptr = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc);
2152 gk20a_writel(g, gr_fecs_new_ctx_r(),
2153 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2154 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2155 gr_fecs_new_ctx_target_sys_mem_ncoh_f(),
2156 gr_fecs_new_ctx_target_vid_mem_f()) |
2157 gr_fecs_new_ctx_valid_m());
2158
2159 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2160 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2161 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2162 gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(),
2163 gr_fecs_arb_ctx_ptr_target_vid_mem_f()));
2164
2165 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2166
2167 /* Wait for arbiter command to complete */
2168 retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2169 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2170 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2171 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2172 retries--;
2173 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2174 }
2175 if (!retries)
2176 nvgpu_err(g, "arbiter complete timeout");
2177
2178 gk20a_writel(g, gr_fecs_current_ctx_r(),
2179 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2180 gr_fecs_current_ctx_target_m() |
2181 gr_fecs_current_ctx_valid_m());
2182 /* Send command to arbiter to flush */
2183 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2184
2185 retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2186	val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2187 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2188 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2189 retries--;
2190 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2191 }
2192 if (!retries)
2193 nvgpu_err(g, "arbiter complete timeout");
2194}
2195
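/*
 * Write the falcon bootloader header into falcon DMEM through dmem port 0.
 * The header layout (and hence the number of words written) depends on the
 * boot signature of the ucode image; code/data addresses are programmed in
 * 256-byte units (address >> 8).
 */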
2196void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
2197 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2198{
2199 u32 addr_code32;
2200 u32 addr_data32;
2201
2202 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2203 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2204
2205 /*
2206 * Copy falcon bootloader header into dmem at offset 0.
2207 * Configure dmem port 0 for auto-incrementing writes starting at dmem
2208 * offset 0.
2209 */
2210 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2211 gr_fecs_dmemc_offs_f(0) |
2212 gr_fecs_dmemc_blk_f(0) |
2213 gr_fecs_dmemc_aincw_f(1));
2214
2215 /* Write out the actual data */
2216 switch (segments->boot_signature) {
2217 case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED:
2218 case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
2219 case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
2220 case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
2221 case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
2222 case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
2223 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2224 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2225 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2226 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2227 /* fallthrough */
2228 case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
2229 case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
2230 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
2231 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
2232 case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
2233 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2234 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2235 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2236 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2237 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
2238 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2239 addr_code32);
2240 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2241 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2242 segments->code.size);
2243 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2244 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2245 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2246 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2247 addr_data32);
2248 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2249 segments->data.size);
2250 break;
2251 case FALCON_UCODE_SIG_T12X_FECS_OLDER:
2252 case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
2253 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2254 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2255 addr_code32);
2256 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2257 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2258 segments->code.size);
2259 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2260 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2261 addr_data32);
2262 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2263 segments->data.size);
2264 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2265 addr_code32);
2266 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2267 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2268 break;
2269 default:
2270 nvgpu_err(g,
2271 "unknown falcon ucode boot signature 0x%08x"
2272 " with reg_offset 0x%08x",
2273 segments->boot_signature, reg_offset);
2274 BUG();
2275 }
2276}
2277
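/*
 * DMA the falcon boot image from the ucode surface into falcon IMEM in
 * 256-byte blocks and program the falcon boot vector. The block count is the
 * boot segment size rounded up to a 256-byte multiple.
 */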
2278void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
2279 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2280{
2281 u32 addr_load32;
2282 u32 blocks;
2283 u32 b;
2284 u32 dst;
2285
2286 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2287 blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2288
2289 /*
2290 * Set the base FB address for the DMA transfer. Subtract off the 256
2291 * byte IMEM block offset such that the relative FB and IMEM offsets
2292 * match, allowing the IMEM tags to be properly created.
2293 */
2294
2295 dst = segments->boot_imem_offset;
2296 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2297 (addr_load32 - (dst >> 8)));
2298
2299 for (b = 0; b < blocks; b++) {
2300 /* Setup destination IMEM offset */
2301 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2302 dst + (b << 8));
2303
2304 /* Setup source offset (relative to BASE) */
2305 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2306 dst + (b << 8));
2307
2308 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2309 gr_fecs_dmatrfcmd_imem_f(0x01) |
2310 gr_fecs_dmatrfcmd_write_f(0x00) |
2311 gr_fecs_dmatrfcmd_size_f(0x06) |
2312 gr_fecs_dmatrfcmd_ctxdma_f(0));
2313 }
2314
2315 /* Specify the falcon boot vector */
2316 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2317 gr_fecs_bootvec_vec_f(segments->boot_entry));
2318}
2319
2320static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2321{
2322 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2323 u64 addr_base = ucode_info->surface_desc.gpu_va;
2324
2325 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2326
2327 gr_gk20a_load_falcon_bind_instblk(g);
2328
2329 g->ops.gr.falcon_load_ucode(g, addr_base,
2330 &g->ctxsw_ucode_info.fecs, 0);
2331
2332 g->ops.gr.falcon_load_ucode(g, addr_base,
2333 &g->ctxsw_ucode_info.gpccs,
2334 gr_gpcs_gpccs_falcon_hwcfg_r() -
2335 gr_fecs_falcon_hwcfg_r());
2336}
2337
2338int gr_gk20a_load_ctxsw_ucode(struct gk20a *g)
2339{
2340 int err;
2341
2342 gk20a_dbg_fn("");
2343
2344 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
2345 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2346 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2347 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2348 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2349 }
2350
2351 /*
2352 * In case bootloader is not supported, revert to the old way of
2353 * loading gr ucode, without the faster bootstrap routine.
2354 */
2355 if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
2356 gr_gk20a_load_falcon_dmem(g);
2357 gr_gk20a_load_falcon_imem(g);
2358 gr_gk20a_start_falcon_ucode(g);
2359 } else {
2360 if (!g->gr.skip_ucode_init) {
2361 err = gr_gk20a_init_ctxsw_ucode(g);
2362
2363 if (err)
2364 return err;
2365 }
2366 gr_gk20a_load_falcon_with_bootloader(g);
2367 g->gr.skip_ucode_init = true;
2368 }
2369 gk20a_dbg_fn("done");
2370 return 0;
2371}
2372
2373static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g)
2374{
2375 u32 ret;
2376
2377 gk20a_dbg_fn("");
2378
2379 ret = gr_gk20a_ctx_wait_ucode(g, 0, NULL,
2380 GR_IS_UCODE_OP_EQUAL,
2381 eUcodeHandshakeInitComplete,
2382 GR_IS_UCODE_OP_SKIP, 0, false);
2383 if (ret) {
2384 nvgpu_err(g, "falcon ucode init timeout");
2385 return ret;
2386 }
2387
2388 if (nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP) ||
2389 nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS))
2390 gk20a_writel(g, gr_fecs_current_ctx_r(),
2391 gr_fecs_current_ctx_valid_false_f());
2392
2393 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2394 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2395 gk20a_writel(g, gr_fecs_method_push_r(),
2396 gr_fecs_method_push_adr_set_watchdog_timeout_f());
2397
2398 gk20a_dbg_fn("done");
2399 return 0;
2400}
2401
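/*
 * Query FECS (via method ops) for the sizes of the golden context image, the
 * zcull ctxsw image and the PM ctxsw image. The results are cached in
 * gr.ctx_vars so the queries are only issued once; a fixed priv access map
 * size is also set here.
 */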
2402int gr_gk20a_init_ctx_state(struct gk20a *g)
2403{
2404 u32 ret;
2405 struct fecs_method_op_gk20a op = {
2406 .mailbox = { .id = 0, .data = 0,
2407 .clr = ~0, .ok = 0, .fail = 0},
2408 .method.data = 0,
2409 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2410 .cond.fail = GR_IS_UCODE_OP_SKIP,
2411 };
2412
2413 gk20a_dbg_fn("");
2414 if (!g->gr.ctx_vars.golden_image_size) {
2415 op.method.addr =
2416 gr_fecs_method_push_adr_discover_image_size_v();
2417 op.mailbox.ret = &g->gr.ctx_vars.golden_image_size;
2418 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2419 if (ret) {
2420 nvgpu_err(g,
2421 "query golden image size failed");
2422 return ret;
2423 }
2424 op.method.addr =
2425 gr_fecs_method_push_adr_discover_zcull_image_size_v();
2426 op.mailbox.ret = &g->gr.ctx_vars.zcull_ctxsw_image_size;
2427 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2428 if (ret) {
2429 nvgpu_err(g,
2430 "query zcull ctx image size failed");
2431 return ret;
2432 }
2433 op.method.addr =
2434 gr_fecs_method_push_adr_discover_pm_image_size_v();
2435 op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size;
2436 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2437 if (ret) {
2438 nvgpu_err(g,
2439 "query pm ctx image size failed");
2440 return ret;
2441 }
2442 g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2443 }
2444
2445 gk20a_dbg_fn("done");
2446 return 0;
2447}
2448
2449static void gk20a_gr_destroy_ctx_buffer(struct gk20a *g,
2450 struct gr_ctx_buffer_desc *desc)
2451{
2452 if (!desc)
2453 return;
2454 nvgpu_dma_free(g, &desc->mem);
2455 desc->destroy = NULL;
2456}
2457
2458static int gk20a_gr_alloc_ctx_buffer(struct gk20a *g,
2459 struct gr_ctx_buffer_desc *desc,
2460 size_t size)
2461{
2462 int err = 0;
2463
2464 err = nvgpu_dma_alloc_flags_sys(g, NVGPU_DMA_NO_KERNEL_MAPPING,
2465 size, &desc->mem);
2466 if (err)
2467 return err;
2468
2469 desc->destroy = gk20a_gr_destroy_ctx_buffer;
2470
2471 return err;
2472}
2473
2474static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2475{
2476 struct gr_gk20a *gr = &g->gr;
2477 u32 i;
2478
2479 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2480 /* destroy exists iff buffer is allocated */
2481 if (gr->global_ctx_buffer[i].destroy) {
2482 gr->global_ctx_buffer[i].destroy(g,
2483 &gr->global_ctx_buffer[i]);
2484 }
2485 }
2486
2487 gk20a_dbg_fn("done");
2488}
2489
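/*
 * Allocate the global (shared across channels) context buffers: circular
 * buffer, pagepool, attribute buffer, golden context image and priv access
 * map. VPR variants of the first three are allocated through the
 * secure_alloc hook when it is available.
 */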
2490static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2491{
2492 struct gr_gk20a *gr = &g->gr;
2493 int attr_buffer_size, err;
2494
2495 u32 cb_buffer_size = gr->bundle_cb_default_size *
2496 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2497
2498 u32 pagepool_buffer_size = g->ops.gr.pagepool_default_size(g) *
2499 gr_scc_pagepool_total_pages_byte_granularity_v();
2500
2501 gk20a_dbg_fn("");
2502
2503 attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2504
2505 gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2506
2507 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[CIRCULAR],
2508 cb_buffer_size);
2509 if (err)
2510 goto clean_up;
2511
2512 if (g->ops.secure_alloc)
2513 g->ops.secure_alloc(g,
2514 &gr->global_ctx_buffer[CIRCULAR_VPR],
2515 cb_buffer_size);
2516
2517 gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2518
2519 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[PAGEPOOL],
2520 pagepool_buffer_size);
2521 if (err)
2522 goto clean_up;
2523
2524 if (g->ops.secure_alloc)
2525 g->ops.secure_alloc(g,
2526 &gr->global_ctx_buffer[PAGEPOOL_VPR],
2527 pagepool_buffer_size);
2528
2529 gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2530
2531 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[ATTRIBUTE],
2532 attr_buffer_size);
2533 if (err)
2534 goto clean_up;
2535
2536 if (g->ops.secure_alloc)
2537 g->ops.secure_alloc(g,
2538 &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2539 attr_buffer_size);
2540
2541 gk20a_dbg_info("golden_image_size : %d",
2542 gr->ctx_vars.golden_image_size);
2543
2544 err = gk20a_gr_alloc_ctx_buffer(g,
2545 &gr->global_ctx_buffer[GOLDEN_CTX],
2546 gr->ctx_vars.golden_image_size);
2547 if (err)
2548 goto clean_up;
2549
2550 gk20a_dbg_info("priv_access_map_size : %d",
2551 gr->ctx_vars.priv_access_map_size);
2552
2553 err = gk20a_gr_alloc_ctx_buffer(g,
2554 &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2555 gr->ctx_vars.priv_access_map_size);
2556
2557 if (err)
2558 goto clean_up;
2559
2560 gk20a_dbg_fn("done");
2561 return 0;
2562
2563 clean_up:
2564	nvgpu_err(g, "failed to allocate global ctx buffers");
2565 gr_gk20a_free_global_ctx_buffers(g);
2566 return -ENOMEM;
2567}
2568
2569static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2570{
2571 struct vm_gk20a *ch_vm = c->vm;
2572 struct gr_gk20a *gr = &c->g->gr;
2573 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2574 u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2575 int *g_bfr_index = c->ch_ctx.global_ctx_buffer_index;
2576 u32 i;
2577
2578 gk20a_dbg_fn("");
2579
2580 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2581 if (g_bfr_index[i]) {
2582 struct nvgpu_mem *mem;
2583
2584 /*
2585 * Translate from VA index to buffer index to determine
2586 * the correct struct nvgpu_mem to use. Handles the VPR
2587 * vs non-VPR difference in context images.
2588 */
2589 mem = &gr->global_ctx_buffer[g_bfr_index[i]].mem;
2590
2591 nvgpu_gmmu_unmap(ch_vm, mem, g_bfr_va[i]);
2592 }
2593 }
2594
2595 memset(g_bfr_va, 0, sizeof(c->ch_ctx.global_ctx_buffer_va));
2596 memset(g_bfr_size, 0, sizeof(c->ch_ctx.global_ctx_buffer_size));
2597 memset(g_bfr_index, 0, sizeof(c->ch_ctx.global_ctx_buffer_index));
2598
2599 c->ch_ctx.global_ctx_buffer_mapped = false;
2600}
2601
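/*
 * Map the global context buffers into the channel's GPU address space,
 * picking the VPR variant of a buffer for VPR channels when one was
 * allocated, and remember the resulting GPU VA, size and buffer index for
 * each mapping.
 */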
2602static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2603 struct channel_gk20a *c)
2604{
2605 struct vm_gk20a *ch_vm = c->vm;
2606 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2607 u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2608 int *g_bfr_index = c->ch_ctx.global_ctx_buffer_index;
2609 struct gr_gk20a *gr = &g->gr;
2610 struct nvgpu_mem *mem;
2611 u64 gpu_va;
2612
2613 gk20a_dbg_fn("");
2614
2615 /* Circular Buffer */
2616 if (!c->vpr ||
2617 (gr->global_ctx_buffer[CIRCULAR_VPR].mem.priv.sgt == NULL)) {
2618 mem = &gr->global_ctx_buffer[CIRCULAR].mem;
2619 g_bfr_index[CIRCULAR_VA] = CIRCULAR;
2620 } else {
2621 mem = &gr->global_ctx_buffer[CIRCULAR_VPR].mem;
2622 g_bfr_index[CIRCULAR_VA] = CIRCULAR_VPR;
2623 }
2624
2625 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2626 NVGPU_AS_MAP_BUFFER_FLAGS_CACHEABLE,
2627 gk20a_mem_flag_none, true, mem->aperture);
2628 if (!gpu_va)
2629 goto clean_up;
2630 g_bfr_va[CIRCULAR_VA] = gpu_va;
2631 g_bfr_size[CIRCULAR_VA] = mem->size;
2632
2633 /* Attribute Buffer */
2634 if (!c->vpr ||
2635 (gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.priv.sgt == NULL)) {
2636 mem = &gr->global_ctx_buffer[ATTRIBUTE].mem;
2637 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE;
2638 } else {
2639 mem = &gr->global_ctx_buffer[ATTRIBUTE_VPR].mem;
2640 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE_VPR;
2641 }
2642
2643 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2644 NVGPU_AS_MAP_BUFFER_FLAGS_CACHEABLE,
2645 gk20a_mem_flag_none, false, mem->aperture);
2646 if (!gpu_va)
2647 goto clean_up;
2648 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2649 g_bfr_size[ATTRIBUTE_VA] = mem->size;
2650
2651 /* Page Pool */
2652 if (!c->vpr ||
2653 (gr->global_ctx_buffer[PAGEPOOL_VPR].mem.priv.sgt == NULL)) {
2654 mem = &gr->global_ctx_buffer[PAGEPOOL].mem;
2655 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL;
2656 } else {
2657 mem = &gr->global_ctx_buffer[PAGEPOOL_VPR].mem;
2658 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL_VPR;
2659 }
2660
2661 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2662 NVGPU_AS_MAP_BUFFER_FLAGS_CACHEABLE,
2663 gk20a_mem_flag_none, true, mem->aperture);
2664 if (!gpu_va)
2665 goto clean_up;
2666 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2667 g_bfr_size[PAGEPOOL_VA] = mem->size;
2668
2669 /* Golden Image */
2670 mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
2671 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2672 gk20a_mem_flag_none, true, mem->aperture);
2673 if (!gpu_va)
2674 goto clean_up;
2675 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2676 g_bfr_size[GOLDEN_CTX_VA] = mem->size;
2677 g_bfr_index[GOLDEN_CTX_VA] = GOLDEN_CTX;
2678
2679 /* Priv register Access Map */
2680 mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
2681 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2682 gk20a_mem_flag_none, true, mem->aperture);
2683 if (!gpu_va)
2684 goto clean_up;
2685 g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2686 g_bfr_size[PRIV_ACCESS_MAP_VA] = mem->size;
2687 g_bfr_index[PRIV_ACCESS_MAP_VA] = PRIV_ACCESS_MAP;
2688
2689 c->ch_ctx.global_ctx_buffer_mapped = true;
2690 return 0;
2691
2692clean_up:
2693 gr_gk20a_unmap_global_ctx_buffers(c);
2694
2695 return -ENOMEM;
2696}
2697
2698int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
2699 struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm,
2700 u32 class,
2701 u32 padding)
2702{
2703 struct gr_ctx_desc *gr_ctx = NULL;
2704 struct gr_gk20a *gr = &g->gr;
2705 int err = 0;
2706
2707 gk20a_dbg_fn("");
2708
2709 if (gr->ctx_vars.buffer_size == 0)
2710 return 0;
2711
2712 /* alloc channel gr ctx buffer */
2713 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2714 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2715
2716 gr_ctx = nvgpu_kzalloc(g, sizeof(*gr_ctx));
2717 if (!gr_ctx)
2718 return -ENOMEM;
2719
2720 err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_NO_KERNEL_MAPPING,
2721 gr->ctx_vars.buffer_total_size,
2722 &gr_ctx->mem);
2723 if (err)
2724 goto err_free_ctx;
2725
2726 gr_ctx->mem.gpu_va = nvgpu_gmmu_map(vm,
2727 &gr_ctx->mem,
2728 gr_ctx->mem.size,
2729 0, /* not GPU-cacheable */
2730 gk20a_mem_flag_none, true,
2731 gr_ctx->mem.aperture);
2732 if (!gr_ctx->mem.gpu_va)
2733 goto err_free_mem;
2734
2735 *__gr_ctx = gr_ctx;
2736
2737 return 0;
2738
2739 err_free_mem:
2740 nvgpu_dma_free(g, &gr_ctx->mem);
2741 err_free_ctx:
2742 nvgpu_kfree(g, gr_ctx);
2743 gr_ctx = NULL;
2744
2745 return err;
2746}
2747
2748static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
2749 struct tsg_gk20a *tsg, u32 class, u32 padding)
2750{
2751 struct gr_ctx_desc **gr_ctx = &tsg->tsg_gr_ctx;
2752 int err;
2753
2754 if (!tsg->vm) {
2755 nvgpu_err(tsg->g, "No address space bound");
2756 return -ENOMEM;
2757 }
2758
2759 err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, class, padding);
2760 if (err)
2761 return err;
2762
2763 return 0;
2764}
2765
2766static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2767 struct channel_gk20a *c,
2768 u32 class,
2769 u32 padding)
2770{
2771 struct gr_ctx_desc **gr_ctx = &c->ch_ctx.gr_ctx;
2772 int err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, c->vm, class, padding);
2773 if (err)
2774 return err;
2775
2776 return 0;
2777}
2778
2779void gr_gk20a_free_gr_ctx(struct gk20a *g,
2780 struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx)
2781{
2782 gk20a_dbg_fn("");
2783
2784 if (!gr_ctx || !gr_ctx->mem.gpu_va)
2785 return;
2786
2787 nvgpu_gmmu_unmap(vm, &gr_ctx->mem, gr_ctx->mem.gpu_va);
2788 nvgpu_dma_free(g, &gr_ctx->mem);
2789 nvgpu_kfree(g, gr_ctx);
2790}
2791
2792void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg)
2793{
2794 if (!tsg->vm) {
2795 nvgpu_err(tsg->g, "No address space bound");
2796 return;
2797 }
2798 tsg->g->ops.gr.free_gr_ctx(tsg->g, tsg->vm, tsg->tsg_gr_ctx);
2799 tsg->tsg_gr_ctx = NULL;
2800}
2801
2802static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2803{
2804 c->g->ops.gr.free_gr_ctx(c->g, c->vm, c->ch_ctx.gr_ctx);
2805 c->ch_ctx.gr_ctx = NULL;
2806}
2807
2808u32 gr_gk20a_get_patch_slots(struct gk20a *g)
2809{
2810 return PATCH_CTX_SLOTS_PER_PAGE;
2811}
2812
2813static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2814 struct channel_gk20a *c)
2815{
2816 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2817 struct vm_gk20a *ch_vm = c->vm;
2818 u32 alloc_size;
2819 int err = 0;
2820
2821 gk20a_dbg_fn("");
2822
2823 alloc_size = g->ops.gr.get_patch_slots(g) *
2824 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
2825
2826 nvgpu_log(g, gpu_dbg_info, "patch buffer size in entries: %d",
2827 alloc_size);
2828
2829 err = nvgpu_dma_alloc_map_flags_sys(ch_vm, NVGPU_DMA_NO_KERNEL_MAPPING,
2830 alloc_size * sizeof(u32), &patch_ctx->mem);
2831 if (err)
2832 return err;
2833
2834 gk20a_dbg_fn("done");
2835 return 0;
2836}
2837
2838static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2839{
2840 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2841 struct gk20a *g = c->g;
2842
2843 gk20a_dbg_fn("");
2844
2845 if (patch_ctx->mem.gpu_va)
2846 nvgpu_gmmu_unmap(c->vm, &patch_ctx->mem,
2847 patch_ctx->mem.gpu_va);
2848
2849 nvgpu_dma_free(g, &patch_ctx->mem);
2850 patch_ctx->data_count = 0;
2851}
2852
2853static void gr_gk20a_free_channel_pm_ctx(struct channel_gk20a *c)
2854{
2855 struct pm_ctx_desc *pm_ctx = &c->ch_ctx.pm_ctx;
2856 struct gk20a *g = c->g;
2857
2858 gk20a_dbg_fn("");
2859
2860 if (pm_ctx->mem.gpu_va) {
2861 nvgpu_gmmu_unmap(c->vm, &pm_ctx->mem, pm_ctx->mem.gpu_va);
2862
2863 nvgpu_dma_free(g, &pm_ctx->mem);
2864 }
2865}
2866
2867void gk20a_free_channel_ctx(struct channel_gk20a *c, bool is_tsg)
2868{
2869	if (c->g->ops.fifo.free_channel_ctx_header)
2870 c->g->ops.fifo.free_channel_ctx_header(c);
2871 gr_gk20a_unmap_global_ctx_buffers(c);
2872 gr_gk20a_free_channel_patch_ctx(c);
2873 gr_gk20a_free_channel_pm_ctx(c);
2874 if (!is_tsg)
2875 gr_gk20a_free_channel_gr_ctx(c);
2876
2877 /* zcull_ctx */
2878
2879 memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2880
2881 c->first_init = false;
2882}
2883
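/*
 * Set up everything a channel needs to run work of the given class: allocate
 * the graphics context (per channel, or per TSG for TSG channels), commit it
 * to the instance block, allocate the patch buffer, map and commit the
 * global context buffers, apply per-class tweaks (KEPLER_COMPUTE_A
 * texlock/lockboost), then init the golden context image and load it into
 * the channel context on the channel's first init.
 */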
2884int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
2885{
2886 struct gk20a *g = c->g;
2887 struct fifo_gk20a *f = &g->fifo;
2888 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2889 struct tsg_gk20a *tsg = NULL;
2890 int err = 0;
2891
2892 gk20a_dbg_fn("");
2893
2894	/* an address space needs to have been bound at this point. */
2895 if (!gk20a_channel_as_bound(c) && !c->vm) {
2896 nvgpu_err(g,
2897 "not bound to address space at time"
2898 " of grctx allocation");
2899 return -EINVAL;
2900 }
2901
2902 if (!g->ops.gr.is_valid_class(g, class_num)) {
2903 nvgpu_err(g,
2904 "invalid obj class 0x%x", class_num);
2905 err = -EINVAL;
2906 goto out;
2907 }
2908 c->obj_class = class_num;
2909
2910 if (gk20a_is_channel_marked_as_tsg(c))
2911 tsg = &f->tsg[c->tsgid];
2912
2913 /* allocate gr ctx buffer */
2914 if (!tsg) {
2915 if (!ch_ctx->gr_ctx) {
2916 err = gr_gk20a_alloc_channel_gr_ctx(g, c,
2917 class_num,
2918 flags);
2919 if (err) {
2920 nvgpu_err(g,
2921 "fail to allocate gr ctx buffer");
2922 goto out;
2923 }
2924 } else {
2925			/* TBD: needs to be more subtle about which class is
2926			 * being allocated, as some are allowed to be
2927			 * allocated along the same channel */
2928 nvgpu_err(g,
2929 "too many classes alloc'd on same channel");
2930 err = -EINVAL;
2931 goto out;
2932 }
2933 } else {
2934 if (!tsg->tsg_gr_ctx) {
2935 tsg->vm = c->vm;
2936 nvgpu_vm_get(tsg->vm);
2937 err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg,
2938 class_num,
2939 flags);
2940 if (err) {
2941 nvgpu_err(g,
2942 "fail to allocate TSG gr ctx buffer");
2943 nvgpu_vm_put(tsg->vm);
2944 tsg->vm = NULL;
2945 goto out;
2946 }
2947 }
2948 ch_ctx->gr_ctx = tsg->tsg_gr_ctx;
2949 }
2950
2951 /* PM ctxt switch is off by default */
2952 ch_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
2953
2954 /* commit gr ctx buffer */
2955 err = g->ops.gr.commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va);
2956 if (err) {
2957 nvgpu_err(g,
2958 "fail to commit gr ctx buffer");
2959 goto out;
2960 }
2961
2962 /* allocate patch buffer */
2963 if (ch_ctx->patch_ctx.mem.priv.sgt == NULL) {
2964 ch_ctx->patch_ctx.data_count = 0;
2965 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2966 if (err) {
2967 nvgpu_err(g,
2968 "fail to allocate patch buffer");
2969 goto out;
2970 }
2971 }
2972
2973 /* map global buffer to channel gpu_va and commit */
2974 if (!ch_ctx->global_ctx_buffer_mapped) {
2975 err = gr_gk20a_map_global_ctx_buffers(g, c);
2976 if (err) {
2977 nvgpu_err(g,
2978 "fail to map global ctx buffer");
2979 goto out;
2980 }
2981 gr_gk20a_elpg_protected_call(g,
2982 gr_gk20a_commit_global_ctx_buffers(g, c, true));
2983 }
2984
2985 /* tweak any perf parameters per-context here */
2986 if (class_num == KEPLER_COMPUTE_A) {
2987 u32 tex_lock_disable_mask;
2988 u32 texlock;
2989 u32 lockboost_mask;
2990 u32 lockboost;
2991
2992 if (g->support_pmu && g->can_elpg) {
2993 err = nvgpu_pmu_disable_elpg(g);
2994 if (err) {
2995 nvgpu_err(g,
2996 "failed to set disable elpg");
2997 }
2998 }
2999
3000 tex_lock_disable_mask =
3001 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m() |
3002 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m() |
3003 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m() |
3004 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m() |
3005 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m() |
3006 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m();
3007
3008 texlock = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_texlock_r());
3009
3010 texlock = (texlock & ~tex_lock_disable_mask) |
3011 (gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f() |
3012 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f() |
3013 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f() |
3014 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f() |
3015 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f() |
3016 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f());
3017
3018 lockboost_mask =
3019 gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_m();
3020
3021 lockboost = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_macro_sched_r());
3022 lockboost = (lockboost & ~lockboost_mask) |
3023 gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(0);
3024
3025 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
3026
3027 if (!err) {
3028 gr_gk20a_ctx_patch_write(g, ch_ctx,
3029 gr_gpcs_tpcs_sm_sch_texlock_r(),
3030 texlock, true);
3031 gr_gk20a_ctx_patch_write(g, ch_ctx,
3032 gr_gpcs_tpcs_sm_sch_macro_sched_r(),
3033 lockboost, true);
3034 gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
3035 } else {
3036 nvgpu_err(g,
3037 "failed to set texlock for compute class");
3038 }
3039
3040 if (g->support_pmu && g->can_elpg)
3041 nvgpu_pmu_enable_elpg(g);
3042 }
3043
3044 /* init golden image, ELPG enabled after this is done */
3045 err = gr_gk20a_init_golden_ctx_image(g, c);
3046 if (err) {
3047 nvgpu_err(g,
3048 "fail to init golden ctx image");
3049 goto out;
3050 }
3051
3052 /* load golden image */
3053 if (!c->first_init) {
3054 err = gr_gk20a_elpg_protected_call(g,
3055 gr_gk20a_load_golden_ctx_image(g, c));
3056 if (err) {
3057 nvgpu_err(g,
3058 "fail to load golden ctx image");
3059 goto out;
3060 }
3061#ifdef CONFIG_GK20A_CTXSW_TRACE
3062 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3063 err = g->ops.fecs_trace.bind_channel(g, c);
3064 if (err)
3065 nvgpu_warn(g,
3066 "fail to bind channel for ctxsw trace");
3067 }
3068#endif
3069 c->first_init = true;
3070 }
3071
3072 if (g->ops.gr.set_czf_bypass)
3073 g->ops.gr.set_czf_bypass(g, c);
3074
3075 gk20a_dbg_fn("done");
3076 return 0;
3077out:
3078	/* 1. gr_ctx, patch_ctx and the global ctx buffer mappings
3079	 *    can be reused, so there is no need to release them.
3080	 * 2. golden image init and load is a one-time thing, so if
3081	 *    they pass there is no need to undo them. */
3082	nvgpu_err(g, "failed to allocate obj ctx");
3083 return err;
3084}
3085
3086static void gk20a_remove_gr_support(struct gr_gk20a *gr)
3087{
3088 struct gk20a *g = gr->g;
3089
3090 gk20a_dbg_fn("");
3091
3092 gr_gk20a_free_cyclestats_snapshot_data(g);
3093
3094 gr_gk20a_free_global_ctx_buffers(g);
3095
3096 nvgpu_dma_free(g, &gr->mmu_wr_mem);
3097 nvgpu_dma_free(g, &gr->mmu_rd_mem);
3098
3099 nvgpu_dma_free(g, &gr->compbit_store.mem);
3100
3101 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
3102
3103 nvgpu_kfree(g, gr->sm_error_states);
3104 nvgpu_kfree(g, gr->gpc_tpc_count);
3105 nvgpu_kfree(g, gr->gpc_zcb_count);
3106 nvgpu_kfree(g, gr->gpc_ppc_count);
3107 nvgpu_kfree(g, gr->pes_tpc_count[0]);
3108 nvgpu_kfree(g, gr->pes_tpc_count[1]);
3109 nvgpu_kfree(g, gr->pes_tpc_mask[0]);
3110 nvgpu_kfree(g, gr->pes_tpc_mask[1]);
3111 nvgpu_kfree(g, gr->sm_to_cluster);
3112 nvgpu_kfree(g, gr->gpc_skip_mask);
3113 nvgpu_kfree(g, gr->map_tiles);
3114 nvgpu_kfree(g, gr->fbp_rop_l2_en_mask);
3115 gr->gpc_tpc_count = NULL;
3116 gr->gpc_zcb_count = NULL;
3117 gr->gpc_ppc_count = NULL;
3118 gr->pes_tpc_count[0] = NULL;
3119 gr->pes_tpc_count[1] = NULL;
3120 gr->pes_tpc_mask[0] = NULL;
3121 gr->pes_tpc_mask[1] = NULL;
3122 gr->gpc_skip_mask = NULL;
3123 gr->map_tiles = NULL;
3124 gr->fbp_rop_l2_en_mask = NULL;
3125
3126 gr->ctx_vars.valid = false;
3127 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.inst.l);
3128 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.data.l);
3129 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.inst.l);
3130 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.data.l);
3131 nvgpu_kfree(g, gr->ctx_vars.sw_bundle_init.l);
3132 nvgpu_kfree(g, gr->ctx_vars.sw_veid_bundle_init.l);
3133 nvgpu_kfree(g, gr->ctx_vars.sw_method_init.l);
3134 nvgpu_kfree(g, gr->ctx_vars.sw_ctx_load.l);
3135 nvgpu_kfree(g, gr->ctx_vars.sw_non_ctx_load.l);
3136 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.sys.l);
3137 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc.l);
3138 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.tpc.l);
3139 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
3140 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.ppc.l);
3141 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_sys.l);
3142 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_gpc.l);
3143 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_tpc.l);
3144 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ppc.l);
3145 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_sys.l);
3146 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp.l);
3147 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_gpc.l);
3148 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp_router.l);
3149 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc_router.l);
3150 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ltc.l);
3151 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_fbpa.l);
3152
3153 nvgpu_vfree(g, gr->ctx_vars.local_golden_image);
3154 gr->ctx_vars.local_golden_image = NULL;
3155
3156 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map)
3157 nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map);
3158 gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL;
3159
3160 gk20a_comptag_allocator_destroy(g, &gr->comp_tags);
3161}
3162
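/*
 * Read the floorswept GR configuration from the priv ring and top registers:
 * FBP/GPC/TPC/zcull counts, per-GPC TPC masks and the per-PES TPC
 * distribution, and derive the per-GPC skip masks.
 */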
3163static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
3164{
3165 u32 gpc_index, pes_index;
3166 u32 pes_tpc_mask;
3167 u32 pes_tpc_count;
3168 u32 pes_heavy_index;
3169 u32 gpc_new_skip_mask;
3170 u32 tmp;
3171 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
3172 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
3173
3174 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
3175 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
3176
3177 tmp = gk20a_readl(g, top_num_gpcs_r());
3178 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
3179
3180 tmp = gk20a_readl(g, top_num_fbps_r());
3181 gr->max_fbps_count = top_num_fbps_value_v(tmp);
3182
3183 gr->fbp_en_mask = g->ops.gr.get_fbp_en_mask(g);
3184
3185 gr->fbp_rop_l2_en_mask =
3186 nvgpu_kzalloc(g, gr->max_fbps_count * sizeof(u32));
3187 if (!gr->fbp_rop_l2_en_mask)
3188 goto clean_up;
3189
3190 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
3191 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
3192
3193 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
3194
3195 tmp = gk20a_readl(g, top_num_fbps_r());
3196 gr->sys_count = top_num_fbps_value_v(tmp);
3197
3198 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
3199 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
3200
3201 gr->pe_count_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
3202 if (WARN(gr->pe_count_per_gpc > GK20A_GR_MAX_PES_PER_GPC,
3203 "too many pes per gpc\n"))
3204 goto clean_up;
3205
3206 gr->max_zcull_per_gpc_count = nvgpu_get_litter_value(g, GPU_LIT_NUM_ZCULL_BANKS);
3207
3208 if (!gr->gpc_count) {
3209 nvgpu_err(g, "gpc_count==0!");
3210 goto clean_up;
3211 }
3212
3213 gr->gpc_tpc_count = nvgpu_kzalloc(g, gr->gpc_count * sizeof(u32));
3214 gr->gpc_tpc_mask = nvgpu_kzalloc(g, gr->gpc_count * sizeof(u32));
3215 gr->gpc_zcb_count = nvgpu_kzalloc(g, gr->gpc_count * sizeof(u32));
3216 gr->gpc_ppc_count = nvgpu_kzalloc(g, gr->gpc_count * sizeof(u32));
3217
3218 gr->gpc_skip_mask =
3219 nvgpu_kzalloc(g, gr_pd_dist_skip_table__size_1_v() *
3220 4 * sizeof(u32));
3221
3222 if (!gr->gpc_tpc_count || !gr->gpc_tpc_mask || !gr->gpc_zcb_count ||
3223 !gr->gpc_ppc_count || !gr->gpc_skip_mask)
3224 goto clean_up;
3225
3226 gr->ppc_count = 0;
3227 gr->tpc_count = 0;
3228 gr->zcb_count = 0;
3229 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3230 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r() +
3231 gpc_stride * gpc_index);
3232
3233 gr->gpc_tpc_count[gpc_index] =
3234 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3235 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3236
3237 gr->gpc_zcb_count[gpc_index] =
3238 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3239 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3240
3241 if (g->ops.gr.get_gpc_tpc_mask)
3242 gr->gpc_tpc_mask[gpc_index] =
3243 g->ops.gr.get_gpc_tpc_mask(g, gpc_index);
3244
3245 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3246 if (!gr->pes_tpc_count[pes_index]) {
3247 gr->pes_tpc_count[pes_index] =
3248 nvgpu_kzalloc(g, gr->gpc_count *
3249 sizeof(u32));
3250 gr->pes_tpc_mask[pes_index] =
3251 nvgpu_kzalloc(g, gr->gpc_count *
3252 sizeof(u32));
3253 if (!gr->pes_tpc_count[pes_index] ||
3254 !gr->pes_tpc_mask[pes_index])
3255 goto clean_up;
3256 }
3257
3258 tmp = gk20a_readl(g,
3259 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3260 gpc_index * gpc_stride);
3261
3262 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3263 pes_tpc_count = count_bits(pes_tpc_mask);
3264
3265 /* detect PES presence by seeing if there are
3266 * TPCs connected to it.
3267 */
3268 if (pes_tpc_count != 0)
3269 gr->gpc_ppc_count[gpc_index]++;
3270
3271 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3272 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3273 }
3274
3275 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3276
3277 gpc_new_skip_mask = 0;
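		/*
		 * For unbalanced PES configurations (5 TPCs total, or 4 TPCs
		 * split unevenly between the two PESs), skip one TPC on the
		 * heavier PES: mask ^ (mask & (mask - 1)) isolates the lowest
		 * set bit of that PES's TPC mask.
		 */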
3278 if (gr->pe_count_per_gpc > 1 &&
3279 gr->pes_tpc_count[0][gpc_index] +
3280 gr->pes_tpc_count[1][gpc_index] == 5) {
3281 pes_heavy_index =
3282 gr->pes_tpc_count[0][gpc_index] >
3283 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3284
3285 gpc_new_skip_mask =
3286 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3287 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3288 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3289
3290 } else if (gr->pe_count_per_gpc > 1 &&
3291 (gr->pes_tpc_count[0][gpc_index] +
3292 gr->pes_tpc_count[1][gpc_index] == 4) &&
3293 (gr->pes_tpc_count[0][gpc_index] !=
3294 gr->pes_tpc_count[1][gpc_index])) {
3295 pes_heavy_index =
3296 gr->pes_tpc_count[0][gpc_index] >
3297 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3298
3299 gpc_new_skip_mask =
3300 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3301 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3302 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3303 }
3304 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3305 }
3306
3307 gr->sm_to_cluster = nvgpu_kzalloc(g, gr->gpc_count * gr->tpc_count *
3308 sm_per_tpc * sizeof(struct sm_info));
3309 gr->no_of_sm = 0;
3310
3311 gk20a_dbg_info("fbps: %d", gr->num_fbps);
3312 gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
3313 gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
3314 gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3315 gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3316 gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
3317 gk20a_dbg_info("sys_count: %d", gr->sys_count);
3318 gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
3319 gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3320 gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
3321 gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
3322
3323 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3324 gk20a_dbg_info("gpc_tpc_count[%d] : %d",
3325 gpc_index, gr->gpc_tpc_count[gpc_index]);
3326 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3327 gk20a_dbg_info("gpc_zcb_count[%d] : %d",
3328 gpc_index, gr->gpc_zcb_count[gpc_index]);
3329 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3330 gk20a_dbg_info("gpc_ppc_count[%d] : %d",
3331 gpc_index, gr->gpc_ppc_count[gpc_index]);
3332 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3333 gk20a_dbg_info("gpc_skip_mask[%d] : %d",
3334 gpc_index, gr->gpc_skip_mask[gpc_index]);
3335 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3336 for (pes_index = 0;
3337 pes_index < gr->pe_count_per_gpc;
3338 pes_index++)
3339 gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
3340 pes_index, gpc_index,
3341 gr->pes_tpc_count[pes_index][gpc_index]);
3342
3343 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3344 for (pes_index = 0;
3345 pes_index < gr->pe_count_per_gpc;
3346 pes_index++)
3347 gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
3348 pes_index, gpc_index,
3349 gr->pes_tpc_mask[pes_index][gpc_index]);
3350
3351 g->ops.gr.bundle_cb_defaults(g);
3352 g->ops.gr.cb_size_default(g);
3353 g->ops.gr.calc_global_ctx_buffer_size(g);
3354 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3355
3356 gk20a_dbg_info("bundle_cb_default_size: %d",
3357 gr->bundle_cb_default_size);
3358 gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3359 gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3360 gk20a_dbg_info("attrib_cb_default_size: %d",
3361 gr->attrib_cb_default_size);
3362 gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
3363 gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3364 gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
3365 gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
3366
3367 return 0;
3368
3369clean_up:
3370 return -ENOMEM;
3371}
3372
3373static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3374{
3375 int err;
3376
3377 err = nvgpu_dma_alloc_sys(g, 0x1000, &gr->mmu_wr_mem);
3378 if (err)
3379 goto err;
3380
3381 err = nvgpu_dma_alloc_sys(g, 0x1000, &gr->mmu_rd_mem);
3382 if (err)
3383 goto err_free_wr_mem;
3384 return 0;
3385
3386 err_free_wr_mem:
3387 nvgpu_dma_free(g, &gr->mmu_wr_mem);
3388 err:
3389 return -ENOMEM;
3390}
3391
3392static u32 prime_set[18] = {
3393 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3394
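/*
 * Build the screen-tile to GPC mapping. For larger TPC counts the row offset
 * is the smallest odd prime that does not divide the TPC count (with fixed
 * overrides for a few specific counts). Tiles are then distributed across
 * GPCs, sorted by descending TPC count, using a Bresenham-style error
 * accumulator so that each GPC receives tiles roughly in proportion to its
 * TPC count.
 */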
3395static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3396{
3397 s32 comm_denom;
3398 s32 mul_factor;
3399 s32 *init_frac = NULL;
3400 s32 *init_err = NULL;
3401 s32 *run_err = NULL;
3402 s32 *sorted_num_tpcs = NULL;
3403 s32 *sorted_to_unsorted_gpc_map = NULL;
3404 u32 gpc_index;
3405 u32 gpc_mark = 0;
3406 u32 num_tpc;
3407 u32 max_tpc_count = 0;
3408 u32 swap;
3409 u32 tile_count;
3410 u32 index;
3411 bool delete_map = false;
3412 bool gpc_sorted;
3413 int ret = 0;
3414 int num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
3415 int num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC);
3416 int map_tile_count = num_gpcs * num_tpc_per_gpc;
3417
3418 init_frac = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3419 init_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3420 run_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3421 sorted_num_tpcs =
3422 nvgpu_kzalloc(g, num_gpcs * num_tpc_per_gpc * sizeof(s32));
3423 sorted_to_unsorted_gpc_map =
3424 nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3425
3426 if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3427 sorted_to_unsorted_gpc_map)) {
3428 ret = -ENOMEM;
3429 goto clean_up;
3430 }
3431
3432 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3433
3434 if (gr->tpc_count == 3)
3435 gr->map_row_offset = 2;
3436 else if (gr->tpc_count < 3)
3437 gr->map_row_offset = 1;
3438 else {
3439 gr->map_row_offset = 3;
3440
3441 for (index = 1; index < 18; index++) {
3442 u32 prime = prime_set[index];
3443 if ((gr->tpc_count % prime) != 0) {
3444 gr->map_row_offset = prime;
3445 break;
3446 }
3447 }
3448 }
3449
3450 switch (gr->tpc_count) {
3451 case 15:
3452 gr->map_row_offset = 6;
3453 break;
3454 case 14:
3455 gr->map_row_offset = 5;
3456 break;
3457 case 13:
3458 gr->map_row_offset = 2;
3459 break;
3460 case 11:
3461 gr->map_row_offset = 7;
3462 break;
3463 case 10:
3464 gr->map_row_offset = 6;
3465 break;
3466 case 7:
3467 case 5:
3468 gr->map_row_offset = 1;
3469 break;
3470 default:
3471 break;
3472 }
3473
3474 if (gr->map_tiles) {
3475 if (gr->map_tile_count != gr->tpc_count)
3476 delete_map = true;
3477
3478 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3479 if (gr_gk20a_get_map_tile_count(gr, tile_count)
3480 >= gr->tpc_count)
3481 delete_map = true;
3482 }
3483
3484 if (delete_map) {
3485 nvgpu_kfree(g, gr->map_tiles);
3486 gr->map_tiles = NULL;
3487 gr->map_tile_count = 0;
3488 }
3489 }
3490
3491 if (gr->map_tiles == NULL) {
3492 gr->map_tiles = nvgpu_kzalloc(g, map_tile_count * sizeof(u8));
3493 if (gr->map_tiles == NULL) {
3494 ret = -ENOMEM;
3495 goto clean_up;
3496 }
3497 gr->map_tile_count = map_tile_count;
3498
3499 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3500 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3501 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3502 }
3503
3504 gpc_sorted = false;
3505 while (!gpc_sorted) {
3506 gpc_sorted = true;
3507 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3508 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3509 gpc_sorted = false;
3510 swap = sorted_num_tpcs[gpc_index];
3511 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3512 sorted_num_tpcs[gpc_index + 1] = swap;
3513 swap = sorted_to_unsorted_gpc_map[gpc_index];
3514 sorted_to_unsorted_gpc_map[gpc_index] =
3515 sorted_to_unsorted_gpc_map[gpc_index + 1];
3516 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3517 }
3518 }
3519 }
3520
3521 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3522 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3523 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3524
3525 mul_factor = gr->gpc_count * max_tpc_count;
3526 if (mul_factor & 0x1)
3527 mul_factor = 2;
3528 else
3529 mul_factor = 1;
3530
3531 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3532
3533 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3534 num_tpc = sorted_num_tpcs[gpc_index];
3535
3536 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3537
3538 if (num_tpc != 0)
3539 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3540 else
3541 init_err[gpc_index] = 0;
3542
3543 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3544 }
3545
3546 while (gpc_mark < gr->tpc_count) {
3547 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3548 if ((run_err[gpc_index] * 2) >= comm_denom) {
3549 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3550 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3551 } else
3552 run_err[gpc_index] += init_frac[gpc_index];
3553 }
3554 }
3555 }
3556
3557clean_up:
3558 nvgpu_kfree(g, init_frac);
3559 nvgpu_kfree(g, init_err);
3560 nvgpu_kfree(g, run_err);
3561 nvgpu_kfree(g, sorted_num_tpcs);
3562 nvgpu_kfree(g, sorted_to_unsorted_gpc_map);
3563
3564 if (ret)
3565		nvgpu_err(g, "failed to init map tiles");
3566 else
3567 gk20a_dbg_fn("done");
3568
3569 return ret;
3570}
3571
3572static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3573{
3574 struct gr_zcull_gk20a *zcull = &gr->zcull;
3575
3576 zcull->aliquot_width = gr->tpc_count * 16;
3577 zcull->aliquot_height = 16;
3578
3579 zcull->width_align_pixels = gr->tpc_count * 16;
3580 zcull->height_align_pixels = 32;
3581
3582 zcull->aliquot_size =
3583 zcull->aliquot_width * zcull->aliquot_height;
3584
3585 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3586 zcull->pixel_squares_by_aliquots =
3587 gr->zcb_count * 16 * 16 * gr->tpc_count /
3588 (gr->gpc_count * gr->gpc_tpc_count[0]);
3589
3590 zcull->total_aliquots =
3591 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3592 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3593
3594 return 0;
3595}
3596
3597u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3598{
3599 /* assuming gr has already been initialized */
3600 return gr->ctx_vars.zcull_ctxsw_image_size;
3601}
3602
3603int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3604 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3605{
3606 struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3607
3608 zcull_ctx->ctx_sw_mode = mode;
3609 zcull_ctx->gpu_va = zcull_va;
3610
3611 /* TBD: don't disable channel in sw method processing */
3612 return gr_gk20a_ctx_zcull_setup(g, c);
3613}
3614
3615int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3616 struct gr_zcull_info *zcull_params)
3617{
3618 struct gr_zcull_gk20a *zcull = &gr->zcull;
3619
3620 zcull_params->width_align_pixels = zcull->width_align_pixels;
3621 zcull_params->height_align_pixels = zcull->height_align_pixels;
3622 zcull_params->pixel_squares_by_aliquots =
3623 zcull->pixel_squares_by_aliquots;
3624 zcull_params->aliquot_total = zcull->total_aliquots;
3625
3626 zcull_params->region_byte_multiplier =
3627 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3628 zcull_params->region_header_size =
3629 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3630 gr_zcull_save_restore_header_bytes_per_gpc_v();
3631
3632 zcull_params->subregion_header_size =
3633 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3634 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3635
3636 zcull_params->subregion_width_align_pixels =
3637 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3638 zcull_params->subregion_height_align_pixels =
3639 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3640 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3641
3642 return 0;
3643}
3644
3645int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3646 struct zbc_entry *color_val, u32 index)
3647{
3648 u32 i;
3649
3650 /* update l2 table */
3651 g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3652
3653 /* update ds table */
3654 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3655 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3656 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3657 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3658 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3659 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3660 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3661 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3662
3663 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3664 gr_ds_zbc_color_fmt_val_f(color_val->format));
3665
3666 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3667 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3668
3669 /* trigger the write */
3670 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3671 gr_ds_zbc_tbl_ld_select_c_f() |
3672 gr_ds_zbc_tbl_ld_action_write_f() |
3673 gr_ds_zbc_tbl_ld_trigger_active_f());
3674
3675 /* update local copy */
3676 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3677 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3678 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3679 }
3680 gr->zbc_col_tbl[index].format = color_val->format;
3681 gr->zbc_col_tbl[index].ref_cnt++;
3682
3683 return 0;
3684}
3685
3686int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3687 struct zbc_entry *depth_val, u32 index)
3688{
3689 /* update l2 table */
3690 g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3691
3692 /* update ds table */
3693 gk20a_writel(g, gr_ds_zbc_z_r(),
3694 gr_ds_zbc_z_val_f(depth_val->depth));
3695
3696 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3697 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3698
3699 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3700 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3701
3702 /* trigger the write */
3703 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3704 gr_ds_zbc_tbl_ld_select_z_f() |
3705 gr_ds_zbc_tbl_ld_action_write_f() |
3706 gr_ds_zbc_tbl_ld_trigger_active_f());
3707
3708 /* update local copy */
3709 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3710 gr->zbc_dep_tbl[index].format = depth_val->format;
3711 gr->zbc_dep_tbl[index].ref_cnt++;
3712
3713 return 0;
3714}
3715
3716void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3717{
3718 struct fifo_gk20a *f = &g->fifo;
3719 struct fifo_engine_info_gk20a *gr_info = NULL;
3720 u32 ret;
3721 u32 engine_id;
3722
3723 engine_id = gk20a_fifo_get_gr_engine_id(g);
3724 gr_info = (f->engine_info + engine_id);
3725
3726 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3727 if (ret) {
3728 nvgpu_err(g,
3729 "failed to disable gr engine activity");
3730 return;
3731 }
3732
3733 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
3734 GR_IDLE_CHECK_DEFAULT);
3735 if (ret) {
3736 nvgpu_err(g,
3737 "failed to idle graphics");
3738 goto clean_up;
3739 }
3740
3741 /* update zbc */
3742 gk20a_pmu_save_zbc(g, entries);
3743
3744clean_up:
3745 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3746 if (ret) {
3747 nvgpu_err(g,
3748 "failed to enable gr engine activity");
3749 }
3750}
3751
3752int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3753 struct zbc_entry *zbc_val)
3754{
3755 struct zbc_color_table *c_tbl;
3756 struct zbc_depth_table *d_tbl;
3757 u32 i;
3758 int ret = -ENOMEM;
3759 bool added = false;
3760 u32 entries;
3761
3762 /* no endian swap ? */
3763
3764 nvgpu_mutex_acquire(&gr->zbc_lock);
3765 switch (zbc_val->type) {
3766 case GK20A_ZBC_TYPE_COLOR:
3767 /* search existing tables */
3768 for (i = 0; i < gr->max_used_color_index; i++) {
3769
3770 c_tbl = &gr->zbc_col_tbl[i];
3771
3772 if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3773 memcmp(c_tbl->color_ds, zbc_val->color_ds,
3774 sizeof(zbc_val->color_ds)) == 0 &&
3775 memcmp(c_tbl->color_l2, zbc_val->color_l2,
3776 sizeof(zbc_val->color_l2)) == 0) {
3777
3778 added = true;
3779 c_tbl->ref_cnt++;
3780 ret = 0;
3781 break;
3782 }
3783 }
3784 /* add new table */
3785 if (!added &&
3786 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3787
3788 c_tbl =
3789 &gr->zbc_col_tbl[gr->max_used_color_index];
3790 WARN_ON(c_tbl->ref_cnt != 0);
3791
3792 ret = g->ops.gr.add_zbc_color(g, gr,
3793 zbc_val, gr->max_used_color_index);
3794
3795 if (!ret)
3796 gr->max_used_color_index++;
3797 }
3798 break;
3799 case GK20A_ZBC_TYPE_DEPTH:
3800 /* search existing tables */
3801 for (i = 0; i < gr->max_used_depth_index; i++) {
3802
3803 d_tbl = &gr->zbc_dep_tbl[i];
3804
3805 if (d_tbl->ref_cnt &&
3806 d_tbl->depth == zbc_val->depth &&
3807 d_tbl->format == zbc_val->format) {
3808 added = true;
3809 d_tbl->ref_cnt++;
3810 ret = 0;
3811 break;
3812 }
3813 }
3814 /* add new table */
3815 if (!added &&
3816 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3817
3818 d_tbl =
3819 &gr->zbc_dep_tbl[gr->max_used_depth_index];
3820 WARN_ON(d_tbl->ref_cnt != 0);
3821
3822 ret = g->ops.gr.add_zbc_depth(g, gr,
3823 zbc_val, gr->max_used_depth_index);
3824
3825 if (!ret)
3826 gr->max_used_depth_index++;
3827 }
3828 break;
3829 case T19X_ZBC:
3830 if (g->ops.gr.add_zbc_type_s) {
3831 added = g->ops.gr.add_zbc_type_s(g, gr, zbc_val, &ret);
3832 } else {
3833 nvgpu_err(g,
3834 "invalid zbc table type %d", zbc_val->type);
3835 ret = -EINVAL;
3836 goto err_mutex;
3837 }
3838 break;
3839 default:
3840 nvgpu_err(g,
3841 "invalid zbc table type %d", zbc_val->type);
3842 ret = -EINVAL;
3843 goto err_mutex;
3844 }
3845
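	/*
	 * 'added' is set when a matching entry already existed; a successful
	 * brand-new entry therefore hits the !added && ret == 0 case below.
	 */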
3846 if (!added && ret == 0) {
3847 /* update zbc for elpg only when new entry is added */
3848 entries = max(gr->max_used_color_index,
3849 gr->max_used_depth_index);
3850 g->ops.gr.pmu_save_zbc(g, entries);
3851 }
3852
3853err_mutex:
3854 nvgpu_mutex_release(&gr->zbc_lock);
3855 return ret;
3856}
3857
3858/* get a zbc table entry specified by index
3859 * return table size when type is invalid */
3860int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3861 struct zbc_query_params *query_params)
3862{
3863 u32 index = query_params->index_size;
3864 u32 i;
3865
3866 switch (query_params->type) {
3867 case GK20A_ZBC_TYPE_INVALID:
3868 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3869 break;
3870 case GK20A_ZBC_TYPE_COLOR:
3871 if (index >= GK20A_ZBC_TABLE_SIZE) {
3872 nvgpu_err(g,
3873 "invalid zbc color table index");
3874 return -EINVAL;
3875 }
3876 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3877 query_params->color_l2[i] =
3878 gr->zbc_col_tbl[index].color_l2[i];
3879 query_params->color_ds[i] =
3880 gr->zbc_col_tbl[index].color_ds[i];
3881 }
3882 query_params->format = gr->zbc_col_tbl[index].format;
3883 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3884 break;
3885 case GK20A_ZBC_TYPE_DEPTH:
3886 if (index >= GK20A_ZBC_TABLE_SIZE) {
3887 nvgpu_err(g,
3888 "invalid zbc depth table index");
3889 return -EINVAL;
3890 }
3891 query_params->depth = gr->zbc_dep_tbl[index].depth;
3892 query_params->format = gr->zbc_dep_tbl[index].format;
3893 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3894 break;
3895 case T19X_ZBC:
3896 if (g->ops.gr.zbc_s_query_table) {
3897 return g->ops.gr.zbc_s_query_table(g, gr,
3898 query_params);
3899 } else {
3900 nvgpu_err(g,
3901 "invalid zbc table type");
3902 return -EINVAL;
3903 }
3904 break;
3905 default:
3906 nvgpu_err(g,
3907 "invalid zbc table type");
3908 return -EINVAL;
3909 }
3910
3911 return 0;
3912}
3913
3914static int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3915{
3916 unsigned int i;
3917 int ret;
3918
3919 for (i = 0; i < gr->max_used_color_index; i++) {
3920 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
3921 struct zbc_entry zbc_val;
3922
3923 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3924 memcpy(zbc_val.color_ds,
3925 c_tbl->color_ds, sizeof(zbc_val.color_ds));
3926 memcpy(zbc_val.color_l2,
3927 c_tbl->color_l2, sizeof(zbc_val.color_l2));
3928 zbc_val.format = c_tbl->format;
3929
3930 ret = g->ops.gr.add_zbc_color(g, gr, &zbc_val, i);
3931
3932 if (ret)
3933 return ret;
3934 }
3935 for (i = 0; i < gr->max_used_depth_index; i++) {
3936 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
3937 struct zbc_entry zbc_val;
3938
3939 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3940 zbc_val.depth = d_tbl->depth;
3941 zbc_val.format = d_tbl->format;
3942
3943 ret = g->ops.gr.add_zbc_depth(g, gr, &zbc_val, i);
3944 if (ret)
3945 return ret;
3946 }
3947
3948 if (g->ops.gr.load_zbc_s_tbl) {
3949 ret = g->ops.gr.load_zbc_s_tbl(g, gr);
3950 if (ret)
3951 return ret;
3952 }
3953
3954 return 0;
3955}
3956
3957int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3958{
3959 struct zbc_entry zbc_val;
3960 u32 i, err;
3961
3962 nvgpu_mutex_init(&gr->zbc_lock);
3963
3964 /* load default color table */
3965 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3966
3967 /* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
3968 zbc_val.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v();
3969 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3970 zbc_val.color_ds[i] = 0;
3971 zbc_val.color_l2[i] = 0;
3972 }
3973 zbc_val.color_l2[0] = 0xff000000;
3974 zbc_val.color_ds[3] = 0x3f800000;
3975 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3976
3977 /* Transparent black = (fmt 1 = zero) */
3978 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3979 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3980 zbc_val.color_ds[i] = 0;
3981 zbc_val.color_l2[i] = 0;
3982 }
3983 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3984
3985 /* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
3986 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3987 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3988 zbc_val.color_ds[i] = 0x3f800000;
3989 zbc_val.color_l2[i] = 0xffffffff;
3990 }
3991 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3992
3993 if (!err)
3994 gr->max_default_color_index = 3;
3995 else {
3996 nvgpu_err(g,
3997 "fail to load default zbc color table");
3998 return err;
3999 }
4000
4001 /* load default depth table */
4002 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4003
4004 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4005 zbc_val.depth = 0x3f800000;
4006 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4007
4008 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4009 zbc_val.depth = 0;
4010 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
4011
4012 if (!err)
4013 gr->max_default_depth_index = 2;
4014 else {
4015 nvgpu_err(g,
4016 "fail to load default zbc depth table");
4017 return err;
4018 }
4019
4020 if (g->ops.gr.load_zbc_s_default_tbl) {
4021 err = g->ops.gr.load_zbc_s_default_tbl(g, gr);
4022 if (err)
4023 return err;
4024 }
4025
4026 return 0;
4027}
4028
4029int _gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4030 struct zbc_entry *zbc_val)
4031{
4032 struct fifo_gk20a *f = &g->fifo;
4033 struct fifo_engine_info_gk20a *gr_info = NULL;
4034 int ret;
4035 u32 engine_id;
4036
4037 engine_id = gk20a_fifo_get_gr_engine_id(g);
4038 gr_info = (f->engine_info + engine_id);
4039
4040 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
4041 if (ret) {
4042 nvgpu_err(g,
4043 "failed to disable gr engine activity");
4044 return ret;
4045 }
4046
4047 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
4048 GR_IDLE_CHECK_DEFAULT);
4049 if (ret) {
4050 nvgpu_err(g,
4051 "failed to idle graphics");
4052 goto clean_up;
4053 }
4054
4055 ret = gr_gk20a_add_zbc(g, gr, zbc_val);
4056
4057clean_up:
4058 if (gk20a_fifo_enable_engine_activity(g, gr_info)) {
4059 nvgpu_err(g,
4060 "failed to enable gr engine activity");
4061 }
4062
4063 return ret;
4064}
4065
4066int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4067 struct zbc_entry *zbc_val)
4068{
4069 gk20a_dbg_fn("");
4070
4071 return gr_gk20a_elpg_protected_call(g,
4072 gr_gk20a_add_zbc(g, gr, zbc_val));
4073}
4074
4075void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
4076{
4077 u32 gate_ctrl;
4078
4079 if (!nvgpu_is_enabled(g, NVGPU_GPU_CAN_BLCG))
4080 return;
4081
4082 gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
4083
4084 switch (mode) {
4085 case BLCG_RUN:
4086 gate_ctrl = set_field(gate_ctrl,
4087 therm_gate_ctrl_blk_clk_m(),
4088 therm_gate_ctrl_blk_clk_run_f());
4089 break;
4090 case BLCG_AUTO:
4091 gate_ctrl = set_field(gate_ctrl,
4092 therm_gate_ctrl_blk_clk_m(),
4093 therm_gate_ctrl_blk_clk_auto_f());
4094 break;
4095 default:
4096 nvgpu_err(g,
4097 "invalid blcg mode %d", mode);
4098 return;
4099 }
4100
4101 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
4102}
4103
4104void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
4105{
4106 u32 gate_ctrl;
4107
4108	if (!nvgpu_is_enabled(g, NVGPU_GPU_CAN_ELCG))
4109		return;
4110
4111	gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
4112
4113 switch (mode) {
4114 case ELCG_RUN:
4115 gate_ctrl = set_field(gate_ctrl,
4116 therm_gate_ctrl_eng_clk_m(),
4117 therm_gate_ctrl_eng_clk_run_f());
4118 gate_ctrl = set_field(gate_ctrl,
4119 therm_gate_ctrl_eng_pwr_m(),
4120 /* set elpg to auto to meet hw expectation */
4121 therm_gate_ctrl_eng_pwr_auto_f());
4122 break;
4123 case ELCG_STOP:
4124 gate_ctrl = set_field(gate_ctrl,
4125 therm_gate_ctrl_eng_clk_m(),
4126 therm_gate_ctrl_eng_clk_stop_f());
4127 break;
4128 case ELCG_AUTO:
4129 gate_ctrl = set_field(gate_ctrl,
4130 therm_gate_ctrl_eng_clk_m(),
4131 therm_gate_ctrl_eng_clk_auto_f());
4132 break;
4133 default:
4134 nvgpu_err(g,
4135 "invalid elcg mode %d", mode);
4136 }
4137
4138 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
4139}
4140
4141void gr_gk20a_init_cg_mode(struct gk20a *g, u32 cgmode, u32 mode_config)
4142{
4143 u32 engine_idx;
4144 u32 active_engine_id = 0;
4145 struct fifo_engine_info_gk20a *engine_info = NULL;
4146 struct fifo_gk20a *f = &g->fifo;
4147
4148 for (engine_idx = 0; engine_idx < f->num_engines; ++engine_idx) {
4149 active_engine_id = f->active_engines_list[engine_idx];
4150 engine_info = &f->engine_info[active_engine_id];
4151
4152 /* gr_engine supports both BLCG and ELCG */
4153 if ((cgmode == BLCG_MODE) &&
4154 (engine_info->engine_enum == ENGINE_GR_GK20A)) {
4155 gr_gk20a_init_blcg_mode(g, mode_config, active_engine_id);
4156 break;
4157 } else if (cgmode == ELCG_MODE)
4158 g->ops.gr.init_elcg_mode(g, mode_config,
4159 active_engine_id);
4160 else
4161 nvgpu_err(g, "invalid cg mode %d, config %d for "
4162 "act_eng_id %d",
4163 cgmode, mode_config, active_engine_id);
4164 }
4165}
4166
4167void gr_gk20a_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
4168 u32 *zcull_map_tiles)
4169{
4170 u32 val;
4171
4172 gk20a_dbg_fn("");
4173
4174 if (zcull_num_entries >= 8) {
4175 gk20a_dbg_fn("map0");
4176 val =
4177 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(
4178 zcull_map_tiles[0]) |
4179 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(
4180 zcull_map_tiles[1]) |
4181 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(
4182 zcull_map_tiles[2]) |
4183 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(
4184 zcull_map_tiles[3]) |
4185 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(
4186 zcull_map_tiles[4]) |
4187 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(
4188 zcull_map_tiles[5]) |
4189 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(
4190 zcull_map_tiles[6]) |
4191 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(
4192 zcull_map_tiles[7]);
4193
4194 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(), val);
4195 }
4196
4197 if (zcull_num_entries >= 16) {
4198 gk20a_dbg_fn("map1");
4199 val =
4200 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(
4201 zcull_map_tiles[8]) |
4202 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(
4203 zcull_map_tiles[9]) |
4204 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(
4205 zcull_map_tiles[10]) |
4206 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(
4207 zcull_map_tiles[11]) |
4208 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(
4209 zcull_map_tiles[12]) |
4210 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(
4211 zcull_map_tiles[13]) |
4212 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(
4213 zcull_map_tiles[14]) |
4214 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(
4215 zcull_map_tiles[15]);
4216
4217 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(), val);
4218 }
4219
4220 if (zcull_num_entries >= 24) {
4221 gk20a_dbg_fn("map2");
4222 val =
4223 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(
4224 zcull_map_tiles[16]) |
4225 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(
4226 zcull_map_tiles[17]) |
4227 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(
4228 zcull_map_tiles[18]) |
4229 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(
4230 zcull_map_tiles[19]) |
4231 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(
4232 zcull_map_tiles[20]) |
4233 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(
4234 zcull_map_tiles[21]) |
4235 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(
4236 zcull_map_tiles[22]) |
4237 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(
4238 zcull_map_tiles[23]);
4239
4240 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(), val);
4241 }
4242
4243 if (zcull_num_entries >= 32) {
4244 gk20a_dbg_fn("map3");
4245 val =
4246 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(
4247 zcull_map_tiles[24]) |
4248 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(
4249 zcull_map_tiles[25]) |
4250 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(
4251 zcull_map_tiles[26]) |
4252 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(
4253 zcull_map_tiles[27]) |
4254 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(
4255 zcull_map_tiles[28]) |
4256 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(
4257 zcull_map_tiles[29]) |
4258 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(
4259 zcull_map_tiles[30]) |
4260 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(
4261 zcull_map_tiles[31]);
4262
4263 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(), val);
4264 }
4265
4266}
4267
4268static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
4269{
4270 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
4271 u32 *zcull_map_tiles, *zcull_bank_counters;
4272 u32 map_counter;
4273 u32 rcp_conserv;
4274 u32 offset;
4275 bool floorsweep = false;
4276 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
4277 u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
4278 u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
4279 GPU_LIT_NUM_TPC_PER_GPC);
4280 u32 zcull_alloc_num = num_gpcs * num_tpc_per_gpc;
4281 u32 map_tile_count;
4282
4283 if (!gr->map_tiles)
4284 return -1;
4285
4286 if (zcull_alloc_num % 8 != 0) {
4287		/* Total 8 fields per map reg i.e. tile_0 to tile_7 */
4288		zcull_alloc_num += 8 - (zcull_alloc_num % 8);
4289 }
4290 zcull_map_tiles = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4291
4292 if (!zcull_map_tiles) {
4293 nvgpu_err(g,
4294			"failed to allocate zcull map tiles");
4295 return -ENOMEM;
4296 }
4297
4298 zcull_bank_counters = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4299
4300 if (!zcull_bank_counters) {
4301 nvgpu_err(g,
4302 "failed to allocate zcull bank counters");
4303 nvgpu_kfree(g, zcull_map_tiles);
4304 return -ENOMEM;
4305 }
4306
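	/*
	 * zcull_map_tiles[i] becomes tile i's ordinal among the tiles mapped
	 * to the same GPC so far, counted via zcull_bank_counters.
	 */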
4307 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
4308 map_tile_count = gr_gk20a_get_map_tile_count(gr, map_counter);
4309 zcull_map_tiles[map_counter] =
4310 zcull_bank_counters[map_tile_count];
4311 zcull_bank_counters[map_tile_count]++;
4312 }
4313
4314 if (g->ops.gr.program_zcull_mapping)
4315 g->ops.gr.program_zcull_mapping(g, zcull_alloc_num,
4316 zcull_map_tiles);
4317
4318 nvgpu_kfree(g, zcull_map_tiles);
4319 nvgpu_kfree(g, zcull_bank_counters);
4320
4321 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4322 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
4323 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
4324
4325 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4326 gpc_zcull_count < gpc_tpc_count) {
4327 nvgpu_err(g,
4328 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
4329 gpc_zcull_count, gpc_tpc_count, gpc_index);
4330 return -EINVAL;
4331 }
4332 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4333 gpc_zcull_count != 0)
4334 floorsweep = true;
4335 }
4336
4337 /* ceil(1.0f / SM_NUM * gr_gpc0_zcull_sm_num_rcp_conservative__max_v()) */
4338 rcp_conserv = DIV_ROUND_UP(gr_gpc0_zcull_sm_num_rcp_conservative__max_v(),
4339 gr->gpc_tpc_count[0]);
4340
4341 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4342 offset = gpc_index * gpc_stride;
4343
4344 if (floorsweep) {
4345 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4346 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4347 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4348 gr->max_zcull_per_gpc_count));
4349 } else {
4350 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4351 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4352 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4353 gr->gpc_tpc_count[gpc_index]));
4354 }
4355
4356 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4357 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4358 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4359
4360 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4361 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4362 }
4363
4364 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4365 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4366
4367 return 0;
4368}
4369
4370void gk20a_gr_enable_exceptions(struct gk20a *g)
4371{
4372 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4373 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4374 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4375 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4376 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4377 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4378}
4379
4380void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4381{
4382 struct gr_gk20a *gr = &g->gr;
4383 u32 tpc_mask;
4384
4385 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(),
4386 gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f() |
4387 gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f());
4388
4389 tpc_mask =
4390 gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1);
4391
4392 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
4393}
4394
4395
4396void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4397{
4398 /* enable exceptions */
4399 gk20a_writel(g, gr_fe_hww_esr_r(),
4400 gr_fe_hww_esr_en_enable_f() |
4401 gr_fe_hww_esr_reset_active_f());
4402 gk20a_writel(g, gr_memfmt_hww_esr_r(),
4403 gr_memfmt_hww_esr_en_enable_f() |
4404 gr_memfmt_hww_esr_reset_active_f());
4405}
4406
4407static int gk20a_init_gr_setup_hw(struct gk20a *g)
4408{
4409 struct gr_gk20a *gr = &g->gr;
4410 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4411 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4412 u32 data;
4413 u64 addr;
4414 u32 last_method_data = 0;
4415 u32 i, err;
4416
4417 gk20a_dbg_fn("");
4418
4419 /* init mmu debug buffer */
4420 addr = nvgpu_mem_get_addr(g, &gr->mmu_wr_mem);
4421 addr >>= fb_mmu_debug_wr_addr_alignment_v();
4422
4423 gk20a_writel(g, fb_mmu_debug_wr_r(),
4424 nvgpu_aperture_mask(g, &gr->mmu_wr_mem,
4425 fb_mmu_debug_wr_aperture_sys_mem_ncoh_f(),
4426 fb_mmu_debug_wr_aperture_vid_mem_f()) |
4427 fb_mmu_debug_wr_vol_false_f() |
4428 fb_mmu_debug_wr_addr_f(addr));
4429
4430 addr = nvgpu_mem_get_addr(g, &gr->mmu_rd_mem);
4431 addr >>= fb_mmu_debug_rd_addr_alignment_v();
4432
4433 gk20a_writel(g, fb_mmu_debug_rd_r(),
4434 nvgpu_aperture_mask(g, &gr->mmu_rd_mem,
4435			fb_mmu_debug_rd_aperture_sys_mem_ncoh_f(),
4436 fb_mmu_debug_rd_aperture_vid_mem_f()) |
4437 fb_mmu_debug_rd_vol_false_f() |
4438 fb_mmu_debug_rd_addr_f(addr));
4439
4440 if (g->ops.gr.init_gpc_mmu)
4441 g->ops.gr.init_gpc_mmu(g);
4442
4443 /* load gr floorsweeping registers */
4444 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4445 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4446 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4447 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4448
4449 gr_gk20a_zcull_init_hw(g, gr);
4450
4451 /* Bug 1340570: increase the clock timeout to avoid potential
4452 * operation failure at high gpcclk rate. Default values are 0x400.
4453 */
4454 gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4455 gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4456 gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4457
4458 /* enable fifo access */
4459 gk20a_writel(g, gr_gpfifo_ctl_r(),
4460 gr_gpfifo_ctl_access_enabled_f() |
4461 gr_gpfifo_ctl_semaphore_access_enabled_f());
4462
4463 /* TBD: reload gr ucode when needed */
4464
4465 /* enable interrupts */
4466 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4467 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4468
4469 /* enable fecs error interrupts */
4470 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4471 gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
4472 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4473 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4474 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4475 gr_fecs_host_int_enable_watchdog_enable_f());
4476
4477 g->ops.gr.enable_hww_exceptions(g);
4478 g->ops.gr.set_hww_esr_report_mask(g);
4479
4480 /* enable TPC exceptions per GPC */
4481 if (g->ops.gr.enable_gpc_exceptions)
4482 g->ops.gr.enable_gpc_exceptions(g);
4483
4484 /* TBD: ECC for L1/SM */
4485 /* TBD: enable per BE exceptions */
4486
4487 /* reset and enable exceptions */
4488 g->ops.gr.enable_exceptions(g);
4489
4490 gr_gk20a_load_zbc_table(g, gr);
4491
4492 if (g->ops.ltc.init_cbc)
4493 g->ops.ltc.init_cbc(g, gr);
4494
4495 if (g->ops.fb.init_cbc)
4496 g->ops.fb.init_cbc(g, gr);
4497
4498 if (g->ops.gr.disable_rd_coalesce)
4499 g->ops.gr.disable_rd_coalesce(g);
4500
4501 /* load ctx init */
4502 for (i = 0; i < sw_ctx_load->count; i++)
4503 gk20a_writel(g, sw_ctx_load->l[i].addr,
4504 sw_ctx_load->l[i].value);
4505
4506 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4507 GR_IDLE_CHECK_DEFAULT);
4508 if (err)
4509 goto out;
4510
4511 if (g->ops.gr.init_preemption_state) {
4512 err = g->ops.gr.init_preemption_state(g);
4513 if (err)
4514 goto out;
4515 }
4516
4517 /* disable fe_go_idle */
4518 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4519 gr_fe_go_idle_timeout_count_disabled_f());
4520
4521 /* override a few ctx state registers */
4522 g->ops.gr.commit_global_timeslice(g, NULL);
4523
4524 /* floorsweep anything left */
4525 err = g->ops.gr.init_fs_state(g);
4526 if (err)
4527 goto out;
4528
4529 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4530 GR_IDLE_CHECK_DEFAULT);
4531 if (err)
4532 goto restore_fe_go_idle;
4533
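	/* fall through: fe_go_idle is restored on both the success and error paths */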
4534restore_fe_go_idle:
4535 /* restore fe_go_idle */
4536 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4537 gr_fe_go_idle_timeout_count_prod_f());
4538
4539 if (err || gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4540 GR_IDLE_CHECK_DEFAULT))
4541 goto out;
4542
4543 /* load method init */
4544 if (sw_method_init->count) {
4545 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4546 sw_method_init->l[0].value);
4547 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4548 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4549 sw_method_init->l[0].addr);
4550 last_method_data = sw_method_init->l[0].value;
4551 }
4552 for (i = 1; i < sw_method_init->count; i++) {
4553 if (sw_method_init->l[i].value != last_method_data) {
4554 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4555 sw_method_init->l[i].value);
4556 last_method_data = sw_method_init->l[i].value;
4557 }
4558 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4559 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4560 sw_method_init->l[i].addr);
4561 }
4562
4563 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4564 GR_IDLE_CHECK_DEFAULT);
4565 if (err)
4566 goto out;
4567
4568 nvgpu_kfree(g, gr->sm_error_states);
4569
4570 /* we need to allocate this after g->ops.gr.init_fs_state() since
4571 * we initialize gr->no_of_sm in this function
4572 */
4573 gr->sm_error_states = nvgpu_kzalloc(g,
4574 sizeof(struct nvgpu_gr_sm_error_state) *
4575 gr->no_of_sm);
4576 if (!gr->sm_error_states) {
4577 err = -ENOMEM;
4578 goto restore_fe_go_idle;
4579 }
4580
4581out:
4582 gk20a_dbg_fn("done");
4583 return err;
4584}
4585
4586static void gr_gk20a_load_gating_prod(struct gk20a *g)
4587{
4588 gk20a_dbg_fn("");
4589
4590 /* slcg prod values */
4591 if (g->ops.clock_gating.slcg_bus_load_gating_prod)
4592 g->ops.clock_gating.slcg_bus_load_gating_prod(g,
4593 g->slcg_enabled);
4594 if (g->ops.clock_gating.slcg_chiplet_load_gating_prod)
4595 g->ops.clock_gating.slcg_chiplet_load_gating_prod(g,
4596 g->slcg_enabled);
4597 if (g->ops.clock_gating.slcg_gr_load_gating_prod)
4598 g->ops.clock_gating.slcg_gr_load_gating_prod(g,
4599 g->slcg_enabled);
4600 if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod)
4601 g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g,
4602 g->slcg_enabled);
4603 if (g->ops.clock_gating.slcg_perf_load_gating_prod)
4604 g->ops.clock_gating.slcg_perf_load_gating_prod(g,
4605 g->slcg_enabled);
4606 if (g->ops.clock_gating.slcg_xbar_load_gating_prod)
4607 g->ops.clock_gating.slcg_xbar_load_gating_prod(g,
4608 g->slcg_enabled);
4609
4610 /* blcg prod values */
4611 if (g->ops.clock_gating.blcg_bus_load_gating_prod)
4612 g->ops.clock_gating.blcg_bus_load_gating_prod(g,
4613 g->blcg_enabled);
4614 if (g->ops.clock_gating.blcg_gr_load_gating_prod)
4615 g->ops.clock_gating.blcg_gr_load_gating_prod(g,
4616 g->blcg_enabled);
4617 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod)
4618 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g,
4619 g->blcg_enabled);
4620 if (g->ops.clock_gating.blcg_xbar_load_gating_prod)
4621 g->ops.clock_gating.blcg_xbar_load_gating_prod(g,
4622 g->blcg_enabled);
4623 if (g->ops.clock_gating.pg_gr_load_gating_prod)
4624 g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4625
4626 gk20a_dbg_fn("done");
4627}
4628
4629static int gk20a_init_gr_prepare(struct gk20a *g)
4630{
4631 u32 gpfifo_ctrl, pmc_en;
4632 u32 err = 0;
4633
4634 /* disable fifo access */
4635 pmc_en = gk20a_readl(g, mc_enable_r());
4636 if (pmc_en & mc_enable_pgraph_enabled_f()) {
4637 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4638 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4639 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4640 }
4641
4642 /* reset gr engine */
4643 g->ops.mc.reset(g, mc_enable_pgraph_enabled_f() |
4644 mc_enable_blg_enabled_f() |
4645 mc_enable_perfmon_enabled_f());
4646
4647 gr_gk20a_load_gating_prod(g);
4648
4649	/* Disable elcg until it gets enabled later in the init */
4650 gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
4651
4652 /* enable fifo access */
4653 gk20a_writel(g, gr_gpfifo_ctl_r(),
4654 gr_gpfifo_ctl_access_enabled_f() |
4655 gr_gpfifo_ctl_semaphore_access_enabled_f());
4656
4657 if (!g->gr.ctx_vars.valid) {
4658 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4659 if (err)
4660 nvgpu_err(g,
4661 "fail to load gr init ctx");
4662 }
4663 return err;
4664}
4665
4666static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4667{
4668 struct nvgpu_timeout timeout;
4669 bool fecs_scrubbing;
4670 bool gpccs_scrubbing;
4671
4672 gk20a_dbg_fn("");
4673
4674 nvgpu_timeout_init(g, &timeout,
4675 CTXSW_MEM_SCRUBBING_TIMEOUT_MAX /
4676 CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT,
4677 NVGPU_TIMER_RETRY_TIMER);
4678 do {
4679 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4680 (gr_fecs_dmactl_imem_scrubbing_m() |
4681 gr_fecs_dmactl_dmem_scrubbing_m());
4682
4683 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4684 (gr_gpccs_dmactl_imem_scrubbing_m() |
4685			 gr_gpccs_dmactl_dmem_scrubbing_m());
4686
4687 if (!fecs_scrubbing && !gpccs_scrubbing) {
4688 gk20a_dbg_fn("done");
4689 return 0;
4690 }
4691
4692 nvgpu_udelay(CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT);
4693 } while (!nvgpu_timeout_expired(&timeout));
4694
4695 nvgpu_err(g, "Falcon mem scrubbing timeout");
4696 return -ETIMEDOUT;
4697}
4698
4699static int gr_gk20a_init_ctxsw(struct gk20a *g)
4700{
4701 u32 err = 0;
4702
4703 err = g->ops.gr.load_ctxsw_ucode(g);
4704 if (err)
4705 goto out;
4706
4707 err = gr_gk20a_wait_ctxsw_ready(g);
4708 if (err)
4709 goto out;
4710
4711out:
4712 if (err)
4713 nvgpu_err(g, "fail");
4714 else
4715 gk20a_dbg_fn("done");
4716
4717 return err;
4718}
4719
4720static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4721{
4722 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4723 u32 i, err = 0;
4724
4725 gk20a_dbg_fn("");
4726
4727 /* enable interrupts */
4728 gk20a_writel(g, gr_intr_r(), ~0);
4729 gk20a_writel(g, gr_intr_en_r(), ~0);
4730
4731 /* load non_ctx init */
4732 for (i = 0; i < sw_non_ctx_load->count; i++)
4733 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4734 sw_non_ctx_load->l[i].value);
4735
4736 err = gr_gk20a_wait_mem_scrubbing(g);
4737 if (err)
4738 goto out;
4739
4740 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4741 GR_IDLE_CHECK_DEFAULT);
4742 if (err)
4743 goto out;
4744
4745out:
4746 if (err)
4747 nvgpu_err(g, "fail");
4748 else
4749 gk20a_dbg_fn("done");
4750
4751	return err;
4752}
4753
4754static int gr_gk20a_init_access_map(struct gk20a *g)
4755{
4756 struct gr_gk20a *gr = &g->gr;
4757 struct nvgpu_mem *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
4758 u32 w, nr_pages =
4759 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4760 PAGE_SIZE);
4761 u32 *whitelist = NULL;
4762 unsigned int num_entries = 0;
4763
4764 if (nvgpu_mem_begin(g, mem)) {
4765 nvgpu_err(g,
4766 "failed to map priv access map memory");
4767 return -ENOMEM;
4768 }
4769
4770 nvgpu_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages);
4771
4772 g->ops.gr.get_access_map(g, &whitelist, &num_entries);
4773
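	/*
	 * Each whitelisted (word-aligned) register offset maps to a single
	 * bit in the access map: bit index = offset >> 2, packed into bytes
	 * and written out as 32-bit words below.
	 */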
4774 for (w = 0; w < num_entries; w++) {
4775 u32 map_bit, map_byte, map_shift, x;
4776 map_bit = whitelist[w] >> 2;
4777 map_byte = map_bit >> 3;
4778 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4779 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4780 whitelist[w], map_byte, map_shift);
4781 x = nvgpu_mem_rd32(g, mem, map_byte / sizeof(u32));
4782 x |= 1 << (
4783 (map_byte % sizeof(u32) * BITS_PER_BYTE)
4784 + map_shift);
4785 nvgpu_mem_wr32(g, mem, map_byte / sizeof(u32), x);
4786 }
4787
4788 nvgpu_mem_end(g, mem);
4789 return 0;
4790}
4791
4792static int gk20a_init_gr_setup_sw(struct gk20a *g)
4793{
4794 struct gr_gk20a *gr = &g->gr;
4795 int err;
4796
4797 gk20a_dbg_fn("");
4798
4799 if (gr->sw_ready) {
4800 gk20a_dbg_fn("skip init");
4801 return 0;
4802 }
4803
4804 gr->g = g;
4805
4806#if defined(CONFIG_GK20A_CYCLE_STATS)
4807 nvgpu_mutex_init(&g->gr.cs_lock);
4808#endif
4809
4810 err = gr_gk20a_init_gr_config(g, gr);
4811 if (err)
4812 goto clean_up;
4813
4814 err = gr_gk20a_init_mmu_sw(g, gr);
4815 if (err)
4816 goto clean_up;
4817
4818 err = gr_gk20a_init_map_tiles(g, gr);
4819 if (err)
4820 goto clean_up;
4821
4822 gk20a_dbg_info("total ram pages : %lu", totalram_pages);
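	/*
	 * max_comptag_mem is in MB: totalram_pages >> (20 - PAGE_SHIFT) is
	 * (totalram_pages << PAGE_SHIFT) >> 20.
	 */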
4823 gr->max_comptag_mem = totalram_pages
4824 >> (10 - (PAGE_SHIFT - 10));
4825 err = g->ops.ltc.init_comptags(g, gr);
4826 if (err)
4827 goto clean_up;
4828
4829 err = gr_gk20a_init_zcull(g, gr);
4830 if (err)
4831 goto clean_up;
4832
4833 err = gr_gk20a_alloc_global_ctx_buffers(g);
4834 if (err)
4835 goto clean_up;
4836
4837 err = gr_gk20a_init_access_map(g);
4838 if (err)
4839 goto clean_up;
4840
4841 gr_gk20a_load_zbc_default_table(g, gr);
4842
4843 if (g->ops.gr.init_czf_bypass)
4844 g->ops.gr.init_czf_bypass(g);
4845
4846 gr->gfxp_wfi_timeout_count = GFXP_WFI_TIMEOUT_COUNT_DEFAULT;
4847
4848 nvgpu_mutex_init(&gr->ctx_mutex);
4849 nvgpu_spinlock_init(&gr->ch_tlb_lock);
4850
4851 gr->remove_support = gk20a_remove_gr_support;
4852 gr->sw_ready = true;
4853
4854 if (g->ops.gr.create_gr_sysfs)
4855 g->ops.gr.create_gr_sysfs(g);
4856
4857 gk20a_dbg_fn("done");
4858 return 0;
4859
4860clean_up:
4861 nvgpu_err(g, "fail");
4862 gk20a_remove_gr_support(gr);
4863 return err;
4864}
4865
4866static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g)
4867{
4868 struct nvgpu_pmu *pmu = &g->pmu;
4869 struct mm_gk20a *mm = &g->mm;
4870 struct vm_gk20a *vm = mm->pmu.vm;
4871 int err = 0;
4872
4873 u32 size;
4874
4875 gk20a_dbg_fn("");
4876
4877 size = 0;
4878
4879 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4880 if (err) {
4881 nvgpu_err(g,
4882 "fail to query fecs pg buffer size");
4883 return err;
4884 }
4885
4886 if (!pmu->pg_buf.cpu_va) {
4887 err = nvgpu_dma_alloc_map_sys(vm, size, &pmu->pg_buf);
4888 if (err) {
4889 nvgpu_err(g, "failed to allocate memory");
4890 return -ENOMEM;
4891 }
4892 }
4893
4894
4895 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &mm->pmu.inst_block);
4896 if (err) {
4897 nvgpu_err(g,
4898 "fail to bind pmu inst to gr");
4899 return err;
4900 }
4901
4902 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.gpu_va);
4903 if (err) {
4904 nvgpu_err(g,
4905 "fail to set pg buffer pmu va");
4906 return err;
4907 }
4908
4909 return err;
4910}
4911
4912int gk20a_init_gr_support(struct gk20a *g)
4913{
4914 u32 err;
4915
4916 gk20a_dbg_fn("");
4917
4918 /* this is required before gr_gk20a_init_ctx_state */
4919 nvgpu_mutex_init(&g->gr.fecs_mutex);
4920
4921 err = gr_gk20a_init_ctxsw(g);
4922 if (err)
4923 return err;
4924
4925	/* this appears to query sw state, but fecs actually initializes
4926	   the ramchain etc., so this is hw init */
4927 err = g->ops.gr.init_ctx_state(g);
4928 if (err)
4929 return err;
4930
4931 err = gk20a_init_gr_setup_sw(g);
4932 if (err)
4933 return err;
4934
4935 err = gk20a_init_gr_setup_hw(g);
4936 if (err)
4937 return err;
4938
4939 if (g->can_elpg) {
4940 err = gk20a_init_gr_bind_fecs_elpg(g);
4941 if (err)
4942 return err;
4943 }
4944
4945 gr_gk20a_enable_elcg(g);
4946	/* GR is initialized, signal possible waiters */
4947 g->gr.initialized = true;
4948 nvgpu_cond_signal(&g->gr.init_wq);
4949
4950 return 0;
4951}
4952
4953/* Wait until GR is initialized */
4954void gk20a_gr_wait_initialized(struct gk20a *g)
4955{
4956 NVGPU_COND_WAIT(&g->gr.init_wq, g->gr.initialized, 0);
4957}
4958
4959#define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
4960#define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
4961#define NVA297_SET_SHADER_EXCEPTIONS 0x1528
4962#define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
4963
4964#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4965
4966void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4967{
4968 gk20a_dbg_fn("");
4969
4970 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4971 gk20a_writel(g,
4972 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4973 gk20a_writel(g,
4974 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4975 } else {
4976 /* setup sm warp esr report masks */
4977 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4978 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4979 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4980 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4981 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4982 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4983 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4984 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4985 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4986 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4987 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4988 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4989 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4990 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4991 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4992 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4993 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4994 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4995 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4996 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4997 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4998
4999 /* setup sm global esr report mask */
5000 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
5001 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
5002 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
5003 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
5004 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
5005 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
5006 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
5007 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
5008 }
5009}
5010
5011int gk20a_enable_gr_hw(struct gk20a *g)
5012{
5013 int err;
5014
5015 gk20a_dbg_fn("");
5016
5017 err = gk20a_init_gr_prepare(g);
5018 if (err)
5019 return err;
5020
5021 err = gk20a_init_gr_reset_enable_hw(g);
5022 if (err)
5023 return err;
5024
5025 gk20a_dbg_fn("done");
5026
5027 return 0;
5028}
5029
5030static void gr_gk20a_enable_elcg(struct gk20a *g)
5031{
5032 if (g->elcg_enabled) {
5033 gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_AUTO);
5034 } else {
5035 gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
5036 }
5037}
5038
5039int gk20a_gr_reset(struct gk20a *g)
5040{
5041 int err;
5042 u32 size;
5043
5044 nvgpu_mutex_acquire(&g->gr.fecs_mutex);
5045
5046 err = gk20a_enable_gr_hw(g);
5047 if (err) {
5048 nvgpu_mutex_release(&g->gr.fecs_mutex);
5049 return err;
5050 }
5051
5052 err = gk20a_init_gr_setup_hw(g);
5053 if (err) {
5054 nvgpu_mutex_release(&g->gr.fecs_mutex);
5055 return err;
5056 }
5057
5058 err = gr_gk20a_init_ctxsw(g);
5059 if (err) {
5060 nvgpu_mutex_release(&g->gr.fecs_mutex);
5061 return err;
5062 }
5063
5064 nvgpu_mutex_release(&g->gr.fecs_mutex);
5065
5066	/* this appears to query sw state, but fecs actually initializes
5067	   the ramchain etc., so this is hw init */
5068 err = g->ops.gr.init_ctx_state(g);
5069 if (err)
5070 return err;
5071
5072 size = 0;
5073 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
5074 if (err) {
5075 nvgpu_err(g,
5076 "fail to query fecs pg buffer size");
5077 return err;
5078 }
5079
5080 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &g->mm.pmu.inst_block);
5081 if (err) {
5082 nvgpu_err(g,
5083 "fail to bind pmu inst to gr");
5084 return err;
5085 }
5086
5087 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.gpu_va);
5088 if (err) {
5089 nvgpu_err(g,
5090 "fail to set pg buffer pmu va");
5091 return err;
5092 }
5093
5094 gr_gk20a_load_gating_prod(g);
5095 gr_gk20a_enable_elcg(g);
5096
5097 return err;
5098}
5099
5100static void gk20a_gr_set_error_notifier(struct gk20a *g,
5101 struct gr_gk20a_isr_data *isr_data, u32 error_notifier)
5102{
5103 struct fifo_gk20a *f = &g->fifo;
5104 struct channel_gk20a *ch;
5105 struct tsg_gk20a *tsg;
5106 struct channel_gk20a *ch_tsg;
5107
5108 if (isr_data->chid != FIFO_INVAL_CHANNEL_ID) {
5109 ch = &f->channel[isr_data->chid];
5110
5111 if (gk20a_is_channel_marked_as_tsg(ch)) {
5112 tsg = &g->fifo.tsg[ch->tsgid];
5113 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
5114 list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) {
5115 if (gk20a_channel_get(ch_tsg)) {
5116 gk20a_set_error_notifier(ch_tsg,
5117 error_notifier);
5118 gk20a_channel_put(ch_tsg);
5119 }
5120 }
5121 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
5122 } else {
5123 gk20a_set_error_notifier(ch, error_notifier);
5124 }
5125 }
5126}
5127
5128static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
5129 struct gr_gk20a_isr_data *isr_data)
5130{
5131 gk20a_dbg_fn("");
5132 gk20a_gr_set_error_notifier(g, isr_data,
5133 NVGPU_CHANNEL_GR_SEMAPHORE_TIMEOUT);
5134 nvgpu_err(g,
5135 "gr semaphore timeout");
5136 return -EINVAL;
5137}
5138
5139static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
5140 struct gr_gk20a_isr_data *isr_data)
5141{
5142 gk20a_dbg_fn("");
5143 gk20a_gr_set_error_notifier(g, isr_data,
5144 NVGPU_CHANNEL_GR_ILLEGAL_NOTIFY);
5145 /* This is an unrecoverable error, reset is needed */
5146 nvgpu_err(g,
5147		"gr illegal notify pending");
5148 return -EINVAL;
5149}
5150
5151static int gk20a_gr_handle_illegal_method(struct gk20a *g,
5152 struct gr_gk20a_isr_data *isr_data)
5153{
5154 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
5155 isr_data->class_num, isr_data->offset,
5156 isr_data->data_lo);
5157 if (ret) {
5158 gk20a_gr_set_error_notifier(g, isr_data,
5159 NVGPU_CHANNEL_GR_ILLEGAL_NOTIFY);
5160 nvgpu_err(g, "invalid method class 0x%08x"
5161 ", offset 0x%08x address 0x%08x",
5162 isr_data->class_num, isr_data->offset, isr_data->addr);
5163 }
5164 return ret;
5165}
5166
5167static int gk20a_gr_handle_illegal_class(struct gk20a *g,
5168 struct gr_gk20a_isr_data *isr_data)
5169{
5170 gk20a_dbg_fn("");
5171 gk20a_gr_set_error_notifier(g, isr_data,
5172 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
5173 nvgpu_err(g,
5174 "invalid class 0x%08x, offset 0x%08x",
5175 isr_data->class_num, isr_data->offset);
5176 return -EINVAL;
5177}
5178
5179int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
5180 struct gr_gk20a_isr_data *isr_data)
5181{
5182 u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_host_int_status_r());
5183 int ret = 0;
5184
5185 gk20a_dbg_fn("");
5186
5187 if (!gr_fecs_intr)
5188 return 0;
5189
5190 nvgpu_err(g,
5191 "unhandled fecs error interrupt 0x%08x for channel %u",
5192 gr_fecs_intr, isr_data->chid);
5193
5194 if (gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) {
5195 gk20a_gr_set_error_notifier(g, isr_data,
5196 NVGPU_CHANNEL_FECS_ERR_UNIMP_FIRMWARE_METHOD);
5197 nvgpu_err(g,
5198 "firmware method error 0x%08x for offset 0x%04x",
5199 gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)),
5200 isr_data->data_lo);
5201 ret = -1;
5202 }
5203
5204 gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr);
5205 return ret;
5206}
5207
5208static int gk20a_gr_handle_class_error(struct gk20a *g,
5209 struct gr_gk20a_isr_data *isr_data)
5210{
5211 u32 gr_class_error;
5212
5213 gk20a_dbg_fn("");
5214
5215 gr_class_error =
5216 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
5217 gk20a_gr_set_error_notifier(g, isr_data,
5218 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
5219 nvgpu_err(g, "class error 0x%08x, offset 0x%08x,"
5220		" sub channel 0x%08x mme generated %d,"
5221		" mme pc 0x%08x data high %d priv status %d"
5222 " unhandled intr 0x%08x for channel %u",
5223 isr_data->class_num, (isr_data->offset << 2),
5224 gr_trapped_addr_subch_v(isr_data->addr),
5225 gr_trapped_addr_mme_generated_v(isr_data->addr),
5226 gr_trapped_data_mme_pc_v(
5227 gk20a_readl(g, gr_trapped_data_mme_r())),
5228 gr_trapped_addr_datahigh_v(isr_data->addr),
5229 gr_trapped_addr_priv_v(isr_data->addr),
5230 gr_class_error, isr_data->chid);
5231
5232 nvgpu_err(g, "trapped data low 0x%08x",
5233 gk20a_readl(g, gr_trapped_data_lo_r()));
5234 if (gr_trapped_addr_datahigh_v(isr_data->addr))
5235 nvgpu_err(g, "trapped data high 0x%08x",
5236 gk20a_readl(g, gr_trapped_data_hi_r()));
5237
5238 return -EINVAL;
5239}
5240
5241static int gk20a_gr_handle_firmware_method(struct gk20a *g,
5242 struct gr_gk20a_isr_data *isr_data)
5243{
5244 gk20a_dbg_fn("");
5245
5246 gk20a_gr_set_error_notifier(g, isr_data,
5247 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
5248 nvgpu_err(g,
5249 "firmware method 0x%08x, offset 0x%08x for channel %u",
5250 isr_data->class_num, isr_data->offset,
5251 isr_data->chid);
5252 return -EINVAL;
5253}
5254
5255static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
5256 struct gr_gk20a_isr_data *isr_data)
5257{
5258 struct fifo_gk20a *f = &g->fifo;
5259 struct channel_gk20a *ch = &f->channel[isr_data->chid];
5260
5261 if (gk20a_is_channel_marked_as_tsg(ch)) {
5262 struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
5263
5264 gk20a_tsg_event_id_post_event(tsg,
5265 NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN);
5266 } else {
5267 gk20a_channel_event_id_post_event(ch,
5268 NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN);
5269 }
5270
5271 nvgpu_cond_broadcast(&ch->semaphore_wq);
5272
5273 return 0;
5274}
5275
5276#if defined(CONFIG_GK20A_CYCLE_STATS)
5277static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
5278 u32 offset)
5279{
5280 /* support only 24-bit 4-byte aligned offsets */
5281 bool valid = !(offset & 0xFF000003);
5282
5283 if (g->allow_all)
5284 return true;
5285
5286 /* whitelist check */
5287 valid = valid &&
5288 is_bar0_global_offset_whitelisted_gk20a(g, offset);
5289 /* resource size check in case there was a problem
5290 * with allocating the assumed size of bar0 */
5291 valid = valid && gk20a_io_valid_reg(g, offset);
5292 return valid;
5293}
5294#endif
5295
5296static int gk20a_gr_handle_notify_pending(struct gk20a *g,
5297 struct gr_gk20a_isr_data *isr_data)
5298{
5299 struct fifo_gk20a *f = &g->fifo;
5300 struct channel_gk20a *ch = &f->channel[isr_data->chid];
5301
5302#if defined(CONFIG_GK20A_CYCLE_STATS)
5303 void *virtual_address;
5304 u32 buffer_size;
5305 u32 offset;
5306 bool exit;
5307
5308 /* GL will never use payload 0 for cycle state */
5309 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
5310 return 0;
5311
5312 nvgpu_mutex_acquire(&ch->cyclestate.cyclestate_buffer_mutex);
5313
5314 virtual_address = ch->cyclestate.cyclestate_buffer;
5315 buffer_size = ch->cyclestate.cyclestate_buffer_size;
5316 offset = isr_data->data_lo;
5317 exit = false;
5318 while (!exit) {
5319 struct share_buffer_head *sh_hdr;
5320 u32 min_element_size;
5321
5322 /* validate offset */
5323 if (offset + sizeof(struct share_buffer_head) > buffer_size ||
5324 offset + sizeof(struct share_buffer_head) < offset) {
5325 nvgpu_err(g,
5326 "cyclestats buffer overrun at offset 0x%x",
5327 offset);
5328 break;
5329 }
5330
5331 sh_hdr = (struct share_buffer_head *)
5332 ((char *)virtual_address + offset);
5333
5334 min_element_size =
5335 (sh_hdr->operation == OP_END ?
5336 sizeof(struct share_buffer_head) :
5337 sizeof(struct gk20a_cyclestate_buffer_elem));
5338
5339 /* validate sh_hdr->size */
5340 if (sh_hdr->size < min_element_size ||
5341 offset + sh_hdr->size > buffer_size ||
5342 offset + sh_hdr->size < offset) {
5343 nvgpu_err(g,
5344 "bad cyclestate buffer header size at offset 0x%x",
5345 offset);
5346 sh_hdr->failed = true;
5347 break;
5348 }
5349
5350 switch (sh_hdr->operation) {
5351 case OP_END:
5352 exit = true;
5353 break;
5354
5355 case BAR0_READ32:
5356 case BAR0_WRITE32:
5357 {
5358 struct gk20a_cyclestate_buffer_elem *op_elem =
5359 (struct gk20a_cyclestate_buffer_elem *)sh_hdr;
5360 bool valid = is_valid_cyclestats_bar0_offset_gk20a(
5361 g, op_elem->offset_bar0);
5362 u32 raw_reg;
5363 u64 mask_orig;
5364 u64 v;
5365
5366 if (!valid) {
5367 nvgpu_err(g,
5368					"invalid cyclestats op offset: 0x%x",
5369 op_elem->offset_bar0);
5370
5371 sh_hdr->failed = exit = true;
5372 break;
5373 }
5374
5375
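				/* mask covering bits [first_bit, last_bit] of the register */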
5376 mask_orig =
5377 ((1ULL <<
5378 (op_elem->last_bit + 1))
5379 -1)&~((1ULL <<
5380 op_elem->first_bit)-1);
5381
5382 raw_reg =
5383 gk20a_readl(g,
5384 op_elem->offset_bar0);
5385
5386 switch (sh_hdr->operation) {
5387 case BAR0_READ32:
5388 op_elem->data =
5389 (raw_reg & mask_orig)
5390 >> op_elem->first_bit;
5391 break;
5392
5393 case BAR0_WRITE32:
5394 v = 0;
5395 if ((unsigned int)mask_orig !=
5396 (unsigned int)~0) {
5397 v = (unsigned int)
5398 (raw_reg & ~mask_orig);
5399 }
5400
5401 v |= ((op_elem->data
5402 << op_elem->first_bit)
5403 & mask_orig);
5404
5405 gk20a_writel(g,
5406 op_elem->offset_bar0,
5407 (unsigned int)v);
5408 break;
5409 default:
5410 /* nop ok?*/
5411 break;
5412 }
5413 }
5414 break;
5415
5416 default:
5417 /* no operation content case */
5418 exit = true;
5419 break;
5420 }
5421 sh_hdr->completed = true;
5422 offset += sh_hdr->size;
5423 }
5424 nvgpu_mutex_release(&ch->cyclestate.cyclestate_buffer_mutex);
5425#endif
5426 gk20a_dbg_fn("");
5427 nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
5428 return 0;
5429}
5430
5431/* Used by sw interrupt thread to translate current ctx to chid.
5432 * Also used by regops to translate current ctx to chid and tsgid.
5433 * For performance, we don't want to go through 128 channels every time.
5434 * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5435 * A small tlb is used here to cache translation.
5436 *
5437 * Returned channel must be freed with gk20a_channel_put() */
5438static struct channel_gk20a *gk20a_gr_get_channel_from_ctx(
5439 struct gk20a *g, u32 curr_ctx, int *curr_tsgid)
5440{
5441 struct fifo_gk20a *f = &g->fifo;
5442 struct gr_gk20a *gr = &g->gr;
5443 u32 chid = -1;
5444 int tsgid = NVGPU_INVALID_TSG_ID;
5445 u32 i;
5446 struct channel_gk20a *ret = NULL;
5447
5448 /* when contexts are unloaded from GR, the valid bit is reset
5449 * but the instance pointer information remains intact.
5450 * This might be called from gr_isr where contexts might be
5451 * unloaded. No need to check ctx_valid bit
5452 */
5453
5454 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
5455
5456 /* check cache first */
5457 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5458 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5459 chid = gr->chid_tlb[i].chid;
5460 tsgid = gr->chid_tlb[i].tsgid;
5461 ret = gk20a_channel_get(&f->channel[chid]);
5462 goto unlock;
5463 }
5464 }
5465
5466 /* slow path */
5467 for (chid = 0; chid < f->num_channels; chid++) {
5468 struct channel_gk20a *ch = &f->channel[chid];
5469 if (!gk20a_channel_get(ch))
5470 continue;
5471
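		/*
		 * curr_ctx carries the context's instance block pointer;
		 * compare it with this channel's inst block base shifted down
		 * by ram_in_base_shift.
		 */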
5472 if ((u32)(nvgpu_inst_block_addr(g, &ch->inst_block) >>
5473 ram_in_base_shift_v()) ==
5474 gr_fecs_current_ctx_ptr_v(curr_ctx)) {
5475 tsgid = ch->tsgid;
5476 /* found it */
5477 ret = ch;
5478 break;
5479 }
5480 gk20a_channel_put(ch);
5481 }
5482
5483 if (!ret)
5484 goto unlock;
5485
5486 /* add to free tlb entry */
5487 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5488 if (gr->chid_tlb[i].curr_ctx == 0) {
5489 gr->chid_tlb[i].curr_ctx = curr_ctx;
5490 gr->chid_tlb[i].chid = chid;
5491 gr->chid_tlb[i].tsgid = tsgid;
5492 goto unlock;
5493 }
5494 }
5495
5496 /* no free entry, flush one */
5497 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5498 gr->chid_tlb[gr->channel_tlb_flush_index].chid = chid;
5499 gr->chid_tlb[gr->channel_tlb_flush_index].tsgid = tsgid;
5500
5501 gr->channel_tlb_flush_index =
5502 (gr->channel_tlb_flush_index + 1) &
5503 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5504
5505unlock:
5506 nvgpu_spinlock_release(&gr->ch_tlb_lock);
5507 if (curr_tsgid)
5508 *curr_tsgid = tsgid;
5509 return ret;
5510}
5511
5512int gk20a_gr_lock_down_sm(struct gk20a *g,
5513 u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask,
5514 bool check_errors)
5515{
5516 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5517 u32 dbgr_control0;
5518
5519 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5520 "GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm);
5521
5522 /* assert stop trigger */
5523 dbgr_control0 =
5524 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
5525 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5526 gk20a_writel(g,
5527 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
5528
5529 return g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask,
5530 check_errors);
5531}
5532
5533bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5534{
5535 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5536
5537 /* check if an sm debugger is attached.
5538 * assumption: all SMs will have debug mode enabled/disabled
5539 * uniformly. */
5540 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5541 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5542 return true;
5543
5544 return false;
5545}
5546
5547int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
5548 bool *post_event, struct channel_gk20a *fault_ch,
5549 u32 *hww_global_esr)
5550{
5551 int ret = 0;
5552 bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
5553 bool disable_sm_exceptions = true;
5554 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5555 bool sm_debugger_attached;
5556 u32 global_esr, warp_esr, global_mask;
5557
5558 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5559
5560 sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
5561
5562 global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
5563 *hww_global_esr = global_esr;
5564 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
5565 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
5566
5567 if (!sm_debugger_attached) {
5568 nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
5569 global_esr, warp_esr);
5570 return -EFAULT;
5571 }
5572
5573 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5574 "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
5575
5576 gr_gk20a_elpg_protected_call(g,
5577 g->ops.gr.record_sm_error_state(g, gpc, tpc));
5578
5579 if (g->ops.gr.pre_process_sm_exception) {
5580 ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
5581 global_esr, warp_esr,
5582 sm_debugger_attached,
5583 fault_ch,
5584 &early_exit,
5585 &ignore_debugger);
5586 if (ret) {
5587 nvgpu_err(g, "could not pre-process sm error!");
5588 return ret;
5589 }
5590 }
5591
5592 if (early_exit) {
5593 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5594 "returning early");
5595 return ret;
5596 }
5597
5598 /*
5599 * Disable forwarding of tpc exceptions,
5600 * the debugger will reenable exceptions after servicing them.
5601 *
5602 * Do not disable exceptions if the only SM exception is BPT_INT
5603 */
5604 if ((global_esr == gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f())
5605 && (warp_esr == 0))
5606 disable_sm_exceptions = false;
5607
5608 if (!ignore_debugger && disable_sm_exceptions) {
5609 u32 tpc_exception_en = gk20a_readl(g,
5610 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
5611 offset);
5612 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5613 gk20a_writel(g,
5614 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
5615 tpc_exception_en);
5616 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM Exceptions disabled");
5617 }
5618
5619 /* if a debugger is present and an error has occurred, do a warp sync */
5620 if (!ignore_debugger &&
5621 ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5622 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5623 do_warp_sync = true;
5624 }
5625
5626 if (do_warp_sync) {
5627 ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
5628 global_mask, true);
5629 if (ret) {
5630 nvgpu_err(g, "sm did not lock down!");
5631 return ret;
5632 }
5633 }
5634
5635 if (ignore_debugger)
5636 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5637 "ignore_debugger set, skipping event posting");
5638 else
5639 *post_event |= true;
5640
5641 return ret;
5642}
5643
5644int gr_gk20a_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
5645 bool *post_event)
5646{
5647 int ret = 0;
5648 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
5649 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
5650 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
5651 u32 esr;
5652
5653 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5654
5655 esr = gk20a_readl(g,
5656 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset);
5657 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr);
5658
5659 gk20a_writel(g,
5660 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
5661 esr);
5662
5663 return ret;
5664}
5665
5666void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
5667 u32 *esr_sm_sel)
5668{
5669 *esr_sm_sel = 1;
5670}
5671
5672static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5673 bool *post_event, struct channel_gk20a *fault_ch,
5674 u32 *hww_global_esr)
5675{
5676 int ret = 0;
5677 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5678 u32 tpc_exception = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_r()
5679 + offset);
5680 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
5681
5682 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5683 "GPC%d TPC%d: pending exception 0x%x",
5684 gpc, tpc, tpc_exception);
5685
5686	/* check if an SM exception is pending */
5687 if (gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(tpc_exception) ==
5688 gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v()) {
5689 u32 esr_sm_sel, sm;
5690
5691 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5692 "GPC%d TPC%d: SM exception pending", gpc, tpc);
5693
5694 if (g->ops.gr.handle_tpc_sm_ecc_exception)
5695 g->ops.gr.handle_tpc_sm_ecc_exception(g, gpc, tpc,
5696 post_event, fault_ch, hww_global_esr);
5697
5698 g->ops.gr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel);
5699
5700 for (sm = 0; sm < sm_per_tpc; sm++) {
5701
5702 if (!(esr_sm_sel & (1 << sm)))
5703 continue;
5704
5705 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5706 "GPC%d TPC%d: SM%d exception pending",
5707 gpc, tpc, sm);
5708
5709 ret = g->ops.gr.handle_sm_exception(g,
5710 gpc, tpc, sm, post_event, fault_ch,
5711 hww_global_esr);
5712 /* clear the hwws, also causes tpc and gpc
5713 * exceptions to be cleared. Should be cleared
5714 * only if SM is locked down or empty.
5715 */
5716 g->ops.gr.clear_sm_hww(g,
5717 gpc, tpc, sm, *hww_global_esr);
5718
5719 }
5720
5721 }
5722
5723	/* check if a TEX exception is pending */
5724 if (gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(tpc_exception) ==
5725 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) {
5726 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5727 "GPC%d TPC%d: TEX exception pending", gpc, tpc);
5728 ret = g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
5729 }
5730
5731 if (g->ops.gr.handle_tpc_mpc_exception)
5732 ret = g->ops.gr.handle_tpc_mpc_exception(g,
5733 gpc, tpc, post_event);
5734
5735 return ret;
5736}
5737
5738static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5739 struct channel_gk20a *fault_ch, u32 *hww_global_esr)
5740{
5741 int ret = 0;
5742 u32 gpc_offset, gpc, tpc;
5743 struct gr_gk20a *gr = &g->gr;
5744 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5745 u32 gpc_exception;
5746
5747 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5748
5749 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
5750 if ((exception1 & (1 << gpc)) == 0)
5751 continue;
5752
5753 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5754 "GPC%d exception pending", gpc);
5755
5756 gpc_offset = gk20a_gr_gpc_offset(g, gpc);
5757
5758 gpc_exception = gk20a_readl(g, gr_gpc0_gpccs_gpc_exception_r()
5759 + gpc_offset);
5760
5761 /* check if any tpc has an exception */
5762 for (tpc = 0; tpc < gr->tpc_count; tpc++) {
5763 if ((gr_gpc0_gpccs_gpc_exception_tpc_v(gpc_exception) &
5764 (1 << tpc)) == 0)
5765 continue;
5766
5767 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5768 "GPC%d: TPC%d exception pending", gpc, tpc);
5769
5770 ret = gk20a_gr_handle_tpc_exception(g, gpc, tpc,
5771 post_event, fault_ch, hww_global_esr);
5772
5773 }
5774
5775 /* Handle GCC exception */
5776 if (gr_gpc0_gpccs_gpc_exception_gcc_v(gpc_exception) &&
5777 g->ops.gr.handle_gcc_exception) {
5778 int gcc_ret = 0;
5779 gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc,
5780 post_event, fault_ch, hww_global_esr);
5781 ret = ret ? ret : gcc_ret;
5782 }
5783
5784 /* Handle GPCCS exceptions */
5785 if (g->ops.gr.handle_gpc_gpccs_exception) {
5786 int ret_ecc = 0;
5787 ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
5788 gpc_exception);
5789 ret = ret ? ret : ret_ecc;
5790 }
5791
5792 /* Handle GPCMMU exceptions */
5793 if (g->ops.gr.handle_gpc_gpcmmu_exception) {
5794 int ret_mmu = 0;
5795
5796 ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc,
5797 gpc_exception);
5798 ret = ret ? ret : ret_mmu;
5799 }
5800
5801 }
5802
5803 return ret;
5804}
5805
5806static int gk20a_gr_post_bpt_events(struct gk20a *g, struct channel_gk20a *ch,
5807 u32 global_esr)
5808{
5809 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) {
5810 if (gk20a_is_channel_marked_as_tsg(ch)) {
5811 struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
5812
5813 gk20a_tsg_event_id_post_event(tsg,
5814 NVGPU_EVENT_ID_BPT_INT);
5815 } else {
5816 gk20a_channel_event_id_post_event(ch,
5817 NVGPU_EVENT_ID_BPT_INT);
5818 }
5819 }
5820 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) {
5821 if (gk20a_is_channel_marked_as_tsg(ch)) {
5822 struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
5823
5824 gk20a_tsg_event_id_post_event(tsg,
5825 NVGPU_EVENT_ID_BPT_PAUSE);
5826 } else {
5827 gk20a_channel_event_id_post_event(ch,
5828 NVGPU_EVENT_ID_BPT_PAUSE);
5829 }
5830 }
5831
5832 return 0;
5833}
5834
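/*
 * Stalling interrupt handler for the graphics engine.  Summary of the flow
 * implemented below: read and latch gr_intr, disable GPFIFO access while
 * servicing, snapshot the trapped method/context state into isr_data,
 * translate the current context to a channel (and TSG), then handle and
 * clear each pending interrupt bit in turn.  Exception handling may request
 * engine recovery; posting of BPT events is deliberately the last step.
 */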
5835int gk20a_gr_isr(struct gk20a *g)
5836{
5837 struct gr_gk20a_isr_data isr_data;
5838 u32 grfifo_ctl;
5839 u32 obj_table;
5840 int need_reset = 0;
5841 u32 gr_intr = gk20a_readl(g, gr_intr_r());
5842 struct channel_gk20a *ch = NULL;
5843 struct channel_gk20a *fault_ch = NULL;
5844 int tsgid = NVGPU_INVALID_TSG_ID;
5845 u32 gr_engine_id;
5846 u32 global_esr = 0;
5847
5848 gk20a_dbg_fn("");
5849 gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
5850
5851 if (!gr_intr)
5852 return 0;
5853
5854 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
5855 if (gr_engine_id != FIFO_INVAL_ENGINE_ID)
5856 gr_engine_id = BIT(gr_engine_id);
5857
5858 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5859 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5860 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5861
5862 gk20a_writel(g, gr_gpfifo_ctl_r(),
5863 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5864 gr_gpfifo_ctl_semaphore_access_f(0));
5865
5866 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5867 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5868 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5869 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5870 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5871 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5872 obj_table = (isr_data.sub_chan < 4) ? gk20a_readl(g,
5873 gr_fe_object_table_r(isr_data.sub_chan)) : 0;
5874 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5875
5876 ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, &tsgid);
5877 if (ch) {
5878 isr_data.chid = ch->chid;
5879 } else {
5880 isr_data.chid = FIFO_INVAL_CHANNEL_ID;
5881 nvgpu_err(g, "ch id is INVALID 0xffffffff");
5882 }
5883
5884 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5885 "channel %d: addr 0x%08x, "
5886 "data 0x%08x 0x%08x,"
5887 "ctx 0x%08x, offset 0x%08x, "
5888 "subchannel 0x%08x, class 0x%08x",
5889 isr_data.chid, isr_data.addr,
5890 isr_data.data_hi, isr_data.data_lo,
5891 isr_data.curr_ctx, isr_data.offset,
5892 isr_data.sub_chan, isr_data.class_num);
5893
5894 if (gr_intr & gr_intr_notify_pending_f()) {
5895 gk20a_gr_handle_notify_pending(g, &isr_data);
5896 gk20a_writel(g, gr_intr_r(),
5897 gr_intr_notify_reset_f());
5898 gr_intr &= ~gr_intr_notify_pending_f();
5899 }
5900
5901 if (gr_intr & gr_intr_semaphore_pending_f()) {
5902 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5903 gk20a_writel(g, gr_intr_r(),
5904 gr_intr_semaphore_reset_f());
5905 gr_intr &= ~gr_intr_semaphore_pending_f();
5906 }
5907
5908 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5909 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5910 &isr_data);
5911 gk20a_writel(g, gr_intr_r(),
5912 gr_intr_semaphore_reset_f());
5913 gr_intr &= ~gr_intr_semaphore_pending_f();
5914 }
5915
5916 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5917 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5918 &isr_data);
5919 gk20a_writel(g, gr_intr_r(),
5920 gr_intr_illegal_notify_reset_f());
5921 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5922 }
5923
5924 if (gr_intr & gr_intr_illegal_method_pending_f()) {
5925 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5926 gk20a_writel(g, gr_intr_r(),
5927 gr_intr_illegal_method_reset_f());
5928 gr_intr &= ~gr_intr_illegal_method_pending_f();
5929 }
5930
5931 if (gr_intr & gr_intr_illegal_class_pending_f()) {
5932 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5933 gk20a_writel(g, gr_intr_r(),
5934 gr_intr_illegal_class_reset_f());
5935 gr_intr &= ~gr_intr_illegal_class_pending_f();
5936 }
5937
5938 if (gr_intr & gr_intr_fecs_error_pending_f()) {
5939 need_reset |= g->ops.gr.handle_fecs_error(g, ch, &isr_data);
5940 gk20a_writel(g, gr_intr_r(),
5941 gr_intr_fecs_error_reset_f());
5942 gr_intr &= ~gr_intr_fecs_error_pending_f();
5943 }
5944
5945 if (gr_intr & gr_intr_class_error_pending_f()) {
5946 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5947 gk20a_writel(g, gr_intr_r(),
5948 gr_intr_class_error_reset_f());
5949 gr_intr &= ~gr_intr_class_error_pending_f();
5950 }
5951
5952 /* this one happens if someone tries to hit a non-whitelisted
5953 * register using set_falcon[4] */
5954 if (gr_intr & gr_intr_firmware_method_pending_f()) {
5955 need_reset |= gk20a_gr_handle_firmware_method(g, &isr_data);
5956 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5957 gk20a_writel(g, gr_intr_r(),
5958 gr_intr_firmware_method_reset_f());
5959 gr_intr &= ~gr_intr_firmware_method_pending_f();
5960 }
5961
5962 if (gr_intr & gr_intr_exception_pending_f()) {
5963 u32 exception = gk20a_readl(g, gr_exception_r());
5964
5965 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5966
5967 if (exception & gr_exception_fe_m()) {
5968 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5969 nvgpu_err(g, "fe warning %08x", fe);
5970 gk20a_writel(g, gr_fe_hww_esr_r(),
5971 gr_fe_hww_esr_reset_active_f());
5972 need_reset |= -EFAULT;
5973 }
5974
5975 if (exception & gr_exception_memfmt_m()) {
5976 u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r());
5977 nvgpu_err(g, "memfmt exception %08x", memfmt);
5978 gk20a_writel(g, gr_memfmt_hww_esr_r(),
5979 gr_memfmt_hww_esr_reset_active_f());
5980 need_reset |= -EFAULT;
5981 }
5982
5983 /* check if a gpc exception has occurred */
5984 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5985 bool post_event = false;
5986
5987 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5988 "GPC exception pending");
5989
5990 fault_ch = gk20a_fifo_channel_from_chid(g,
5991 isr_data.chid);
5992
5993			/* isr_data.chid can be ~0 and fault_ch can be NULL */
5994 /* check if any gpc has an exception */
5995 need_reset |= gk20a_gr_handle_gpc_exception(g,
5996 &post_event, fault_ch, &global_esr);
5997
5998 /* signal clients waiting on an event */
5999 if (g->ops.gr.sm_debugger_attached(g) &&
6000 post_event && fault_ch) {
6001 gk20a_dbg_gpu_post_events(fault_ch);
6002 }
6003 }
6004
6005 if (exception & gr_exception_ds_m()) {
6006 u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
6007 nvgpu_err(g, "ds exception %08x", ds);
6008 gk20a_writel(g, gr_ds_hww_esr_r(),
6009 gr_ds_hww_esr_reset_task_f());
6010 need_reset |= -EFAULT;
6011 }
6012
6013 if (exception & gr_exception_sked_m()) {
6014 u32 sked = gk20a_readl(g, gr_sked_hww_esr_r());
6015
6016 nvgpu_err(g, "sked exception %08x", sked);
6017 gk20a_writel(g, gr_sked_hww_esr_r(),
6018 gr_sked_hww_esr_reset_active_f());
6019 }
6020
6021 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
6022 gr_intr &= ~gr_intr_exception_pending_f();
6023
6024 if (need_reset) {
6025 nvgpu_err(g, "set gr exception notifier");
6026 gk20a_gr_set_error_notifier(g, &isr_data,
6027 NVGPU_CHANNEL_GR_EXCEPTION);
6028 }
6029 }
6030
6031 if (need_reset) {
6032 if (tsgid != NVGPU_INVALID_TSG_ID)
6033 gk20a_fifo_recover(g, gr_engine_id,
6034 tsgid, true, true, true);
6035 else if (ch)
6036 gk20a_fifo_recover(g, gr_engine_id,
6037 ch->chid, false, true, true);
6038 else
6039 gk20a_fifo_recover(g, gr_engine_id,
6040 0, false, false, true);
6041 }
6042
6043 if (gr_intr && !ch) {
6044 /* Clear interrupts for unused channel. This is
6045 probably an interrupt during gk20a_free_channel() */
6046 nvgpu_err(g,
6047 "unhandled gr interrupt 0x%08x for unreferenceable channel, clearing",
6048 gr_intr);
6049 gk20a_writel(g, gr_intr_r(), gr_intr);
6050 gr_intr = 0;
6051 }
6052
6053 gk20a_writel(g, gr_gpfifo_ctl_r(),
6054 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
6055 gr_gpfifo_ctl_semaphore_access_f(1));
6056
6057 if (gr_intr)
6058 nvgpu_err(g,
6059 "unhandled gr interrupt 0x%08x", gr_intr);
6060
6061 /* Posting of BPT events should be the last thing in this function */
6062 if (global_esr && fault_ch)
6063 gk20a_gr_post_bpt_events(g, fault_ch, global_esr);
6064
6065 if (ch)
6066 gk20a_channel_put(ch);
6067
6068 return 0;
6069}
6070
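/*
 * Non-stalling interrupt handler for the graphics engine.  Clears the trap
 * interrupt and returns a bitmask of gk20a_nonstall_ops_* values describing
 * the follow-up work (semaphore wakeup, event posting) for the caller to
 * perform.
 */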
6071int gk20a_gr_nonstall_isr(struct gk20a *g)
6072{
6073 int ops = 0;
6074 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
6075
6076 gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
6077
6078 if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
6079 /* Clear the interrupt */
6080 gk20a_writel(g, gr_intr_nonstall_r(),
6081 gr_intr_nonstall_trap_pending_f());
6082 ops |= (gk20a_nonstall_ops_wakeup_semaphore |
6083 gk20a_nonstall_ops_post_events);
6084 }
6085 return ops;
6086}
6087
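/*
 * The helpers below use gr_gk20a_submit_fecs_method_op() to issue FECS ucode
 * methods: each one fills in a struct fecs_method_op_gk20a with the method
 * address and data, the mailbox to use, and the conditions that signal
 * success or failure of the request.
 */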
6088int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
6089{
6090 BUG_ON(size == NULL);
6091 return gr_gk20a_submit_fecs_method_op(g,
6092 (struct fecs_method_op_gk20a) {
6093 .mailbox.id = 0,
6094 .mailbox.data = 0,
6095 .mailbox.clr = ~0,
6096 .method.data = 1,
6097 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
6098 .mailbox.ret = size,
6099 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
6100 .mailbox.ok = 0,
6101 .cond.fail = GR_IS_UCODE_OP_SKIP,
6102 .mailbox.fail = 0}, false);
6103}
6104
6105int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g,
6106 struct nvgpu_mem *inst_block)
6107{
6108 u32 data = fecs_current_ctx_data(g, inst_block);
6109
6110 return gr_gk20a_submit_fecs_method_op(g,
6111 (struct fecs_method_op_gk20a){
6112 .mailbox.id = 4,
6113 .mailbox.data = data,
6114 .mailbox.clr = ~0,
6115 .method.data = 1,
6116 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
6117 .mailbox.ret = NULL,
6118 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6119 .mailbox.ok = 1,
6120 .cond.fail = GR_IS_UCODE_OP_SKIP,
6121 .mailbox.fail = 0}, false);
6122}
6123
6124int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
6125{
6126 return gr_gk20a_submit_fecs_method_op(g,
6127 (struct fecs_method_op_gk20a) {
6128 .mailbox.id = 4,
6129 .mailbox.data = u64_lo32(pmu_va >> 8),
6130 .mailbox.clr = ~0,
6131 .method.data = 1,
6132 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
6133 .mailbox.ret = NULL,
6134 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6135 .mailbox.ok = 1,
6136 .cond.fail = GR_IS_UCODE_OP_SKIP,
6137 .mailbox.fail = 0}, false);
6138}
6139
6140int gk20a_gr_suspend(struct gk20a *g)
6141{
6142	int ret = 0;
6143
6144 gk20a_dbg_fn("");
6145
6146 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
6147 GR_IDLE_CHECK_DEFAULT);
6148 if (ret)
6149 return ret;
6150
6151 gk20a_writel(g, gr_gpfifo_ctl_r(),
6152 gr_gpfifo_ctl_access_disabled_f());
6153
6154 /* disable gr intr */
6155 gk20a_writel(g, gr_intr_r(), 0);
6156 gk20a_writel(g, gr_intr_en_r(), 0);
6157
6158 /* disable all exceptions */
6159 gk20a_writel(g, gr_exception_r(), 0);
6160 gk20a_writel(g, gr_exception_en_r(), 0);
6161 gk20a_writel(g, gr_exception1_r(), 0);
6162 gk20a_writel(g, gr_exception1_en_r(), 0);
6163 gk20a_writel(g, gr_exception2_r(), 0);
6164 gk20a_writel(g, gr_exception2_en_r(), 0);
6165
6166 gk20a_gr_flush_channel_tlb(&g->gr);
6167
6168 g->gr.initialized = false;
6169
6170 gk20a_dbg_fn("done");
6171 return ret;
6172}
6173
6174static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6175 u32 addr,
6176 bool is_quad, u32 quad,
6177 u32 *context_buffer,
6178 u32 context_buffer_size,
6179 u32 *priv_offset);
6180
6181static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
6182 u32 addr,
6183 u32 *priv_offset);
6184
6185/* This function will decode a priv address and return the partition type and numbers. */
6186static int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
6187 int *addr_type, /* enum ctxsw_addr_type */
6188 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
6189 u32 *broadcast_flags)
6190{
6191 u32 gpc_addr;
6192 u32 ppc_address;
6193 u32 ppc_broadcast_addr;
6194
6195 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6196
6197 /* setup defaults */
6198 ppc_address = 0;
6199 ppc_broadcast_addr = 0;
6200 *addr_type = CTXSW_ADDR_TYPE_SYS;
6201 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
6202 *gpc_num = 0;
6203 *tpc_num = 0;
6204 *ppc_num = 0;
6205 *be_num = 0;
6206
6207 if (pri_is_gpc_addr(g, addr)) {
6208 *addr_type = CTXSW_ADDR_TYPE_GPC;
6209 gpc_addr = pri_gpccs_addr_mask(addr);
6210 if (pri_is_gpc_addr_shared(g, addr)) {
6211 *addr_type = CTXSW_ADDR_TYPE_GPC;
6212 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
6213 } else
6214 *gpc_num = pri_get_gpc_num(g, addr);
6215
6216 if (pri_is_ppc_addr(g, gpc_addr)) {
6217 *addr_type = CTXSW_ADDR_TYPE_PPC;
6218 if (pri_is_ppc_addr_shared(g, gpc_addr)) {
6219 *broadcast_flags |= PRI_BROADCAST_FLAGS_PPC;
6220 return 0;
6221 }
6222 }
6223 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
6224 *addr_type = CTXSW_ADDR_TYPE_TPC;
6225 if (pri_is_tpc_addr_shared(g, gpc_addr)) {
6226 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
6227 return 0;
6228 }
6229 *tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6230 }
6231 return 0;
6232 } else if (pri_is_be_addr(g, addr)) {
6233 *addr_type = CTXSW_ADDR_TYPE_BE;
6234 if (pri_is_be_addr_shared(g, addr)) {
6235 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
6236 return 0;
6237 }
6238 *be_num = pri_get_be_num(g, addr);
6239 return 0;
6240 } else if (pri_is_ltc_addr(addr)) {
6241 *addr_type = CTXSW_ADDR_TYPE_LTCS;
6242 if (g->ops.gr.is_ltcs_ltss_addr(g, addr))
6243 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTCS;
6244 else if (g->ops.gr.is_ltcn_ltss_addr(g, addr))
6245 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTSS;
6246 return 0;
6247 } else if (pri_is_fbpa_addr(g, addr)) {
6248 *addr_type = CTXSW_ADDR_TYPE_FBPA;
6249 if (pri_is_fbpa_addr_shared(g, addr)) {
6250 *broadcast_flags |= PRI_BROADCAST_FLAGS_FBPA;
6251 return 0;
6252 }
6253 return 0;
6254 } else if (g->ops.gr.is_egpc_addr && g->ops.gr.is_egpc_addr(g, addr)) {
6255 return g->ops.gr.decode_egpc_addr(g,
6256 addr, addr_type, gpc_num,
6257 tpc_num, broadcast_flags);
6258 } else {
6259 *addr_type = CTXSW_ADDR_TYPE_SYS;
6260 return 0;
6261 }
6262 /* PPC!?!?!?! */
6263
6264 /*NOTREACHED*/
6265 return -EINVAL;
6266}
6267
6268static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
6269 u32 gpc_num,
6270 u32 *priv_addr_table, u32 *t)
6271{
6272 u32 ppc_num;
6273
6274 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6275
6276 for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
6277 priv_addr_table[(*t)++] = pri_ppc_addr(g, pri_ppccs_addr_mask(addr),
6278 gpc_num, ppc_num);
6279
6280 return 0;
6281}
6282
6283/*
6284 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
6285 * unicast addresses. This function will convert a BE unicast address to a BE
6286 * broadcast address and split a GPC/TPC broadcast address into a table of
6287 * GPC/TPC addresses. The addresses generated by this function can be
6288 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
6289 */
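/*
 * Illustrative example (hypothetical configuration): with gpc_count = 2 and
 * gpc_tpc_count[] = { 2, 2 }, a combined GPC+TPC broadcast address expands to
 * the four unicast addresses
 *	pri_tpc_addr(g, pri_tpccs_addr_mask(addr), gpc, tpc)
 * for gpc in {0, 1} and tpc in {0, 1}, while a BE unicast address collapses
 * to the single broadcast address pri_be_shared_addr(g, addr).
 */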
6290static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
6291 u32 addr,
6292 u32 *priv_addr_table,
6293 u32 *num_registers)
6294{
6295 int addr_type; /*enum ctxsw_addr_type */
6296 u32 gpc_num, tpc_num, ppc_num, be_num;
6297 u32 broadcast_flags;
6298 u32 t;
6299 int err;
6300 int fbpa_num;
6301
6302 t = 0;
6303 *num_registers = 0;
6304
6305 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6306
6307 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
6308 &gpc_num, &tpc_num, &ppc_num, &be_num,
6309 &broadcast_flags);
6310 gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
6311 if (err)
6312 return err;
6313
6314 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6315 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6316 /* The BE broadcast registers are included in the compressed PRI
6317 * table. Convert a BE unicast address to a broadcast address
6318 * so that we can look up the offset. */
6319 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
6320 !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
6321 priv_addr_table[t++] = pri_be_shared_addr(g, addr);
6322 else
6323 priv_addr_table[t++] = addr;
6324
6325 *num_registers = t;
6326 return 0;
6327 }
6328
6329 /* The GPC/TPC unicast registers are included in the compressed PRI
6330 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
6331 * that we can look up the offsets. */
6332 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
6333 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
6334
6335 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
6336 for (tpc_num = 0;
6337 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6338 tpc_num++)
6339 priv_addr_table[t++] =
6340 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6341 gpc_num, tpc_num);
6342
6343 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6344 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
6345 priv_addr_table, &t);
6346 if (err)
6347 return err;
6348 } else
6349 priv_addr_table[t++] =
6350 pri_gpc_addr(g, pri_gpccs_addr_mask(addr),
6351 gpc_num);
6352 }
6353 } else if (((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
6354 (addr_type == CTXSW_ADDR_TYPE_ETPC)) &&
6355 g->ops.gr.egpc_etpc_priv_addr_table) {
6356 gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type : EGPC/ETPC");
6357 g->ops.gr.egpc_etpc_priv_addr_table(g, addr, gpc_num,
6358 broadcast_flags, priv_addr_table, &t);
6359 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTSS) {
6360 g->ops.gr.split_lts_broadcast_addr(g, addr,
6361 priv_addr_table, &t);
6362 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTCS) {
6363 g->ops.gr.split_ltc_broadcast_addr(g, addr,
6364 priv_addr_table, &t);
6365 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_FBPA) {
6366 for (fbpa_num = 0;
6367 fbpa_num < nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
6368 fbpa_num++)
6369 priv_addr_table[t++] = pri_fbpa_addr(g,
6370 pri_fbpa_addr_mask(g, addr), fbpa_num);
6371 } else if (!(broadcast_flags & PRI_BROADCAST_FLAGS_GPC)) {
6372 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
6373 for (tpc_num = 0;
6374 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6375 tpc_num++)
6376 priv_addr_table[t++] =
6377 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6378 gpc_num, tpc_num);
6379 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
6380 err = gr_gk20a_split_ppc_broadcast_addr(g,
6381 addr, gpc_num, priv_addr_table, &t);
6382 else
6383 priv_addr_table[t++] = addr;
6384 }
6385
6386 *num_registers = t;
6387 return 0;
6388}
6389
6390int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
6391 u32 addr,
6392 u32 max_offsets,
6393 u32 *offsets, u32 *offset_addrs,
6394 u32 *num_offsets,
6395 bool is_quad, u32 quad)
6396{
6397 u32 i;
6398 u32 priv_offset = 0;
6399 u32 *priv_registers;
6400 u32 num_registers = 0;
6401 int err = 0;
6402 struct gr_gk20a *gr = &g->gr;
6403 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6404 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6405 sm_per_tpc;
6406
6407 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6408
6409 /* implementation is crossed-up if either of these happen */
6410 if (max_offsets > potential_offsets) {
6411 gk20a_dbg_fn("max_offsets > potential_offsets");
6412 return -EINVAL;
6413 }
6414
6415 if (!g->gr.ctx_vars.golden_image_initialized)
6416 return -ENODEV;
6417
6418 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6419 if (!priv_registers) {
6420 gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
6421		err = -ENOMEM;
6422 goto cleanup;
6423 }
6424 memset(offsets, 0, sizeof(u32) * max_offsets);
6425 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6426 *num_offsets = 0;
6427
6428 gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
6429
6430 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6431 gk20a_dbg_fn("max_offsets = %d, num_registers = %d",
6432 max_offsets, num_registers);
6433 err = -EINVAL;
6434 goto cleanup;
6435 }
6436
6437 if ((max_offsets == 1) && (num_registers > 1))
6438 num_registers = 1;
6439
6440 if (!g->gr.ctx_vars.local_golden_image) {
6441 gk20a_dbg_fn("no context switch header info to work with");
6442 err = -EINVAL;
6443 goto cleanup;
6444 }
6445
6446 for (i = 0; i < num_registers; i++) {
6447 err = gr_gk20a_find_priv_offset_in_buffer(g,
6448 priv_registers[i],
6449 is_quad, quad,
6450 g->gr.ctx_vars.local_golden_image,
6451 g->gr.ctx_vars.golden_image_size,
6452 &priv_offset);
6453 if (err) {
6454 gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
6455 addr); /*, grPriRegStr(addr)));*/
6456 goto cleanup;
6457 }
6458
6459 offsets[i] = priv_offset;
6460 offset_addrs[i] = priv_registers[i];
6461 }
6462
6463 *num_offsets = num_registers;
6464cleanup:
6465 if (!IS_ERR_OR_NULL(priv_registers))
6466 nvgpu_kfree(g, priv_registers);
6467
6468 return err;
6469}
6470
6471int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g,
6472 u32 addr,
6473 u32 max_offsets,
6474 u32 *offsets, u32 *offset_addrs,
6475 u32 *num_offsets)
6476{
6477 u32 i;
6478 u32 priv_offset = 0;
6479 u32 *priv_registers;
6480 u32 num_registers = 0;
6481 int err = 0;
6482 struct gr_gk20a *gr = &g->gr;
6483 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6484 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6485 sm_per_tpc;
6486
6487 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6488
6489 /* implementation is crossed-up if either of these happen */
6490 if (max_offsets > potential_offsets)
6491 return -EINVAL;
6492
6493 if (!g->gr.ctx_vars.golden_image_initialized)
6494 return -ENODEV;
6495
6496 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6497 if (ZERO_OR_NULL_PTR(priv_registers)) {
6498 gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
6499 return -ENOMEM;
6500 }
6501 memset(offsets, 0, sizeof(u32) * max_offsets);
6502 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6503 *num_offsets = 0;
6504
6505 gr_gk20a_create_priv_addr_table(g, addr, priv_registers, &num_registers);
6506
6507 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6508 err = -EINVAL;
6509 goto cleanup;
6510 }
6511
6512 if ((max_offsets == 1) && (num_registers > 1))
6513 num_registers = 1;
6514
6515 if (!g->gr.ctx_vars.local_golden_image) {
6516 gk20a_dbg_fn("no context switch header info to work with");
6517 err = -EINVAL;
6518 goto cleanup;
6519 }
6520
6521 for (i = 0; i < num_registers; i++) {
6522 err = gr_gk20a_find_priv_offset_in_pm_buffer(g,
6523 priv_registers[i],
6524 &priv_offset);
6525 if (err) {
6526 gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
6527 addr); /*, grPriRegStr(addr)));*/
6528 goto cleanup;
6529 }
6530
6531 offsets[i] = priv_offset;
6532 offset_addrs[i] = priv_registers[i];
6533 }
6534
6535 *num_offsets = num_registers;
6536cleanup:
6537 nvgpu_kfree(g, priv_registers);
6538
6539 return err;
6540}
6541
6542/* Setup some register tables. This looks hacky; our
6543 * register/offset functions are just that, functions.
6544 * So they can't be used as initializers... TBD: fix to
6545 * generate consts at least on an as-needed basis.
6546 */
6547static const u32 _num_ovr_perf_regs = 17;
6548static u32 _ovr_perf_regs[17] = { 0, };
6549/* Following are the blocks of registers that the ucode
6550   stores in the extended region. */
6551
6552void gk20a_gr_init_ovr_sm_dsm_perf(void)
6553{
6554 if (_ovr_perf_regs[0] != 0)
6555 return;
6556
6557 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
6558 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
6559 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
6560 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
6561 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
6562 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
6563 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
6564 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
6565 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
6566 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
6567 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
6568 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
6569 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
6570 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
6571 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
6572 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
6573 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
6574
6575}
6576
6577/* TBD: would like to handle this elsewhere, at a higher level.
6578 * these are currently constructed in a "test-then-write" style
6579 * which makes it impossible to know externally whether a ctx
6580 * write will actually occur. So later we should put a lazy,
6581 * map-and-hold system in the patch write state */
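/*
 * gr_gk20a_ctx_patch_smpc: if 'addr' matches one of the per-GPC/TPC SM DSM
 * perf override registers, route the write through the patch context (so
 * the ucode applies it on context load) and refresh the patch count and
 * patch buffer address in the main context image or, when present, in the
 * context header.
 */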
6582static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6583 struct channel_ctx_gk20a *ch_ctx,
6584 u32 addr, u32 data,
6585 struct nvgpu_mem *mem)
6586{
6587 u32 num_gpc = g->gr.gpc_count;
6588 u32 num_tpc;
6589 u32 tpc, gpc, reg;
6590 u32 chk_addr;
6591 u32 vaddr_lo;
6592 u32 vaddr_hi;
6593 u32 tmp;
6594 u32 num_ovr_perf_regs = 0;
6595 u32 *ovr_perf_regs = NULL;
6596 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6597 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6598 struct ctx_header_desc *ctx = &ch_ctx->ctx_header;
6599 struct nvgpu_mem *ctxheader = &ctx->mem;
6600
6601 g->ops.gr.init_ovr_sm_dsm_perf();
6602 g->ops.gr.init_sm_dsm_reg_info();
6603 g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs);
6604
6605 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6606
6607 for (reg = 0; reg < num_ovr_perf_regs; reg++) {
6608 for (gpc = 0; gpc < num_gpc; gpc++) {
6609 num_tpc = g->gr.gpc_tpc_count[gpc];
6610 for (tpc = 0; tpc < num_tpc; tpc++) {
6611 chk_addr = ((gpc_stride * gpc) +
6612 (tpc_in_gpc_stride * tpc) +
6613 ovr_perf_regs[reg]);
6614 if (chk_addr != addr)
6615 continue;
6616 /* reset the patch count from previous
6617				   runs, if ucode has already processed
6618 it */
6619 tmp = nvgpu_mem_rd(g, mem,
6620 ctxsw_prog_main_image_patch_count_o());
6621
6622 if (!tmp)
6623 ch_ctx->patch_ctx.data_count = 0;
6624
6625 gr_gk20a_ctx_patch_write(g, ch_ctx,
6626 addr, data, true);
6627
6628 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va);
6629 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va);
6630
6631 nvgpu_mem_wr(g, mem,
6632 ctxsw_prog_main_image_patch_count_o(),
6633 ch_ctx->patch_ctx.data_count);
6634 if (ctxheader->gpu_va) {
6635 /*
6636 * Main context can be gr_ctx or pm_ctx.
6637 * CPU access for relevant ctx is taken
6638 * care of in the calling function
6639 * __gr_gk20a_exec_ctx_ops. Need to take
6640 * care of cpu access to ctxheader here.
6641 */
6642 if (nvgpu_mem_begin(g, ctxheader))
6643 return -ENOMEM;
6644 nvgpu_mem_wr(g, ctxheader,
6645 ctxsw_prog_main_image_patch_adr_lo_o(),
6646 vaddr_lo);
6647 nvgpu_mem_wr(g, ctxheader,
6648 ctxsw_prog_main_image_patch_adr_hi_o(),
6649 vaddr_hi);
6650 nvgpu_mem_end(g, ctxheader);
6651 } else {
6652 nvgpu_mem_wr(g, mem,
6653 ctxsw_prog_main_image_patch_adr_lo_o(),
6654 vaddr_lo);
6655 nvgpu_mem_wr(g, mem,
6656 ctxsw_prog_main_image_patch_adr_hi_o(),
6657 vaddr_hi);
6658 }
6659
6660 /* we're not caching these on cpu side,
6661 but later watch for it */
6662 return 0;
6663 }
6664 }
6665 }
6666
6667 return 0;
6668}
6669
6670#define ILLEGAL_ID ((u32)~0)
6671
6672static inline bool check_main_image_header_magic(u8 *context)
6673{
6674 u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());
6675 gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
6676 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
6677}
6678static inline bool check_local_header_magic(u8 *context)
6679{
6680 u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());
6681 gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic);
6682 return magic == ctxsw_prog_local_magic_value_v_value_v();
6683
6684}
6685
6686/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
6687static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
6688{
6689 return 256;
6690}
6691
6692void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
6693 u32 **ovr_perf_regs)
6694{
6695 *num_ovr_perf_regs = _num_ovr_perf_regs;
6696 *ovr_perf_regs = _ovr_perf_regs;
6697}
6698
6699static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6700 u32 addr,
6701 bool is_quad, u32 quad,
6702 u32 *context_buffer,
6703 u32 context_buffer_size,
6704 u32 *priv_offset)
6705{
6706 u32 i, data32;
6707 u32 gpc_num, tpc_num;
6708 u32 num_gpcs, num_tpcs;
6709 u32 chk_addr;
6710 u32 ext_priv_offset, ext_priv_size;
6711 u8 *context;
6712 u32 offset_to_segment, offset_to_segment_end;
6713 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6714 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
6715 u32 num_ext_gpccs_ext_buffer_segments;
6716 u32 inter_seg_offset;
6717 u32 max_tpc_count;
6718 u32 *sm_dsm_perf_ctrl_regs = NULL;
6719 u32 num_sm_dsm_perf_ctrl_regs = 0;
6720 u32 *sm_dsm_perf_regs = NULL;
6721 u32 num_sm_dsm_perf_regs = 0;
6722 u32 buffer_segments_size = 0;
6723 u32 marker_size = 0;
6724 u32 control_register_stride = 0;
6725 u32 perf_register_stride = 0;
6726 struct gr_gk20a *gr = &g->gr;
6727 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
6728 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6729 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
6730 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6731 u32 tpc_gpc_mask = (tpc_in_gpc_stride - 1);
6732
6733 /* Only have TPC registers in extended region, so if not a TPC reg,
6734 then return error so caller can look elsewhere. */
6735 if (pri_is_gpc_addr(g, addr)) {
6736 u32 gpc_addr = 0;
6737 gpc_num = pri_get_gpc_num(g, addr);
6738 gpc_addr = pri_gpccs_addr_mask(addr);
6739 if (g->ops.gr.is_tpc_addr(g, gpc_addr))
6740 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6741 else
6742 return -EINVAL;
6743
6744 gk20a_dbg_info(" gpc = %d tpc = %d",
6745 gpc_num, tpc_num);
6746 } else if ((g->ops.gr.is_etpc_addr) &&
6747 g->ops.gr.is_etpc_addr(g, addr)) {
6748 g->ops.gr.get_egpc_etpc_num(g, addr, &gpc_num, &tpc_num);
6749 gpc_base = g->ops.gr.get_egpc_base(g);
6750 } else {
6751 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg,
6752 "does not exist in extended region");
6753 return -EINVAL;
6754 }
6755
6756 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
6757 /* note below is in words/num_registers */
6758 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
6759
6760 context = (u8 *)context_buffer;
6761 /* sanity check main header */
6762 if (!check_main_image_header_magic(context)) {
6763 nvgpu_err(g,
6764 "Invalid main header: magic value");
6765 return -EINVAL;
6766 }
6767 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
6768 if (gpc_num >= num_gpcs) {
6769 nvgpu_err(g,
6770 "GPC 0x%08x is greater than total count 0x%08x!",
6771 gpc_num, num_gpcs);
6772 return -EINVAL;
6773 }
6774
6775 data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());
6776 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
6777 if (0 == ext_priv_size) {
6778 gk20a_dbg_info(" No extended memory in context buffer");
6779 return -EINVAL;
6780 }
6781 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
6782
6783 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
6784 offset_to_segment_end = offset_to_segment +
6785 (ext_priv_size * buffer_segments_size);
6786
6787 /* check local header magic */
6788 context += ctxsw_prog_ucode_header_size_in_bytes();
6789 if (!check_local_header_magic(context)) {
6790 nvgpu_err(g,
6791 "Invalid local header: magic value");
6792 return -EINVAL;
6793 }
6794
6795 /*
6796 * See if the incoming register address is in the first table of
6797 * registers. We check this by decoding only the TPC addr portion.
6798 * If we get a hit on the TPC bit, we then double check the address
6799 * by computing it from the base gpc/tpc strides. Then make sure
6800 * it is a real match.
6801 */
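	/*
	 * Sketch of the check below: entry i is a candidate when
	 *	(addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)
	 * and it is accepted only if the full address rebuilt from
	 * gpc_base/gpc_stride and tpc_in_gpc_base/tpc_in_gpc_stride for this
	 * gpc_num/tpc_num equals the incoming addr.
	 */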
6802 g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
6803 &sm_dsm_perf_regs,
6804 &perf_register_stride);
6805
6806 g->ops.gr.init_sm_dsm_reg_info();
6807
6808 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
6809 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
6810 sm_dsm_perf_reg_id = i;
6811
6812 gk20a_dbg_info("register match: 0x%08x",
6813 sm_dsm_perf_regs[i]);
6814
6815 chk_addr = (gpc_base + gpc_stride * gpc_num) +
6816 tpc_in_gpc_base +
6817 (tpc_in_gpc_stride * tpc_num) +
6818 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask);
6819
6820 if (chk_addr != addr) {
6821 nvgpu_err(g,
6822					   "address mismatch: 0x%08x != 0x%08x",
6823 addr, chk_addr);
6824 return -EINVAL;
6825 }
6826 break;
6827 }
6828 }
6829
6830	/* Didn't find the register in the first supported group,
6831	 * so try the second group now. */
6832 g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
6833 &sm_dsm_perf_ctrl_regs,
6834 &control_register_stride);
6835
6836 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
6837 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
6838 if ((addr & tpc_gpc_mask) ==
6839 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
6840 sm_dsm_perf_ctrl_reg_id = i;
6841
6842 gk20a_dbg_info("register match: 0x%08x",
6843 sm_dsm_perf_ctrl_regs[i]);
6844
6845 chk_addr = (gpc_base + gpc_stride * gpc_num) +
6846 tpc_in_gpc_base +
6847 tpc_in_gpc_stride * tpc_num +
6848 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
6849 tpc_gpc_mask);
6850
6851 if (chk_addr != addr) {
6852 nvgpu_err(g,
6853					 "address mismatch: 0x%08x != 0x%08x",
6854 addr, chk_addr);
6855 return -EINVAL;
6856
6857 }
6858
6859 break;
6860 }
6861 }
6862 }
6863
6864 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
6865 (ILLEGAL_ID == sm_dsm_perf_reg_id))
6866 return -EINVAL;
6867
6868 /* Skip the FECS extended header, nothing there for us now. */
6869 offset_to_segment += buffer_segments_size;
6870
6871 /* skip through the GPCCS extended headers until we get to the data for
6872 * our GPC. The size of each gpc extended segment is enough to hold the
6873	 * max TPC count for the GPCs, in 256b chunks.
6874 */
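	/*
	 * Worked example with hypothetical values: if max_tpc_per_gpc_count
	 * is 2, then num_ext_gpccs_ext_buffer_segments = (2 + 1) / 2 = 1, so
	 * reaching the data for gpc_num = 1 means skipping one further
	 * buffer_segments_size past the FECS extended segment.
	 */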
6875
6876 max_tpc_count = gr->max_tpc_per_gpc_count;
6877
6878 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
6879
6880 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
6881 buffer_segments_size * gpc_num);
6882
6883 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
6884
6885 /* skip the head marker to start with */
6886 inter_seg_offset = marker_size;
6887
6888 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
6889 /* skip over control regs of TPC's before the one we want.
6890 * then skip to the register in this tpc */
6891 inter_seg_offset = inter_seg_offset +
6892 (tpc_num * control_register_stride) +
6893 sm_dsm_perf_ctrl_reg_id;
6894 } else {
6895 /* skip all the control registers */
6896 inter_seg_offset = inter_seg_offset +
6897 (num_tpcs * control_register_stride);
6898
6899 /* skip the marker between control and counter segments */
6900 inter_seg_offset += marker_size;
6901
6902 /* skip over counter regs of TPCs before the one we want */
6903 inter_seg_offset = inter_seg_offset +
6904 (tpc_num * perf_register_stride) *
6905 ctxsw_prog_extended_num_smpc_quadrants_v();
6906
6907 /* skip over the register for the quadrants we do not want.
6908 * then skip to the register in this tpc */
6909 inter_seg_offset = inter_seg_offset +
6910 (perf_register_stride * quad) +
6911 sm_dsm_perf_reg_id;
6912 }
6913
6914 /* set the offset to the segment offset plus the inter segment offset to
6915 * our register */
6916 offset_to_segment += (inter_seg_offset * 4);
6917
6918 /* last sanity check: did we somehow compute an offset outside the
6919 * extended buffer? */
6920 if (offset_to_segment > offset_to_segment_end) {
6921 nvgpu_err(g,
6922 "Overflow ctxsw buffer! 0x%08x > 0x%08x",
6923 offset_to_segment, offset_to_segment_end);
6924 return -EINVAL;
6925 }
6926
6927 *priv_offset = offset_to_segment;
6928
6929 return 0;
6930}
6931
6932
6933static int
6934gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
6935 int addr_type,/* enum ctxsw_addr_type */
6936 u32 pri_addr,
6937 u32 gpc_num, u32 num_tpcs,
6938 u32 num_ppcs, u32 ppc_mask,
6939 u32 *priv_offset)
6940{
6941 u32 i;
6942 u32 address, base_address;
6943 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
6944 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
6945 struct aiv_gk20a *reg;
6946 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
6947 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6948 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
6949 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
6950 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
6951 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6952
6953 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
6954
6955 if (!g->gr.ctx_vars.valid)
6956 return -EINVAL;
6957
6958 /* Process the SYS/BE segment. */
6959 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6960 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6961 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
6962 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
6963 address = reg->addr;
6964 sys_offset = reg->index;
6965
6966 if (pri_addr == address) {
6967 *priv_offset = sys_offset;
6968 return 0;
6969 }
6970 }
6971 }
6972
6973 /* Process the TPC segment. */
6974 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6975 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
6976 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
6977 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
6978 address = reg->addr;
6979 tpc_addr = pri_tpccs_addr_mask(address);
6980 base_address = gpc_base +
6981 (gpc_num * gpc_stride) +
6982 tpc_in_gpc_base +
6983 (tpc_num * tpc_in_gpc_stride);
6984 address = base_address + tpc_addr;
6985 /*
6986 * The data for the TPCs is interleaved in the context buffer.
6987 * Example with num_tpcs = 2
6988 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
6989 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
6990 */
6991 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
6992
6993 if (pri_addr == address) {
6994 *priv_offset = tpc_offset;
6995 return 0;
6996 }
6997 }
6998 }
6999 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7000 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7001 if (!(g->ops.gr.get_egpc_base))
7002 return -EINVAL;
7003
7004 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7005 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.etpc.count; i++) {
7006 reg = &g->gr.ctx_vars.ctxsw_regs.etpc.l[i];
7007 address = reg->addr;
7008 tpc_addr = pri_tpccs_addr_mask(address);
7009 base_address = g->ops.gr.get_egpc_base(g) +
7010 (gpc_num * gpc_stride) +
7011 tpc_in_gpc_base +
7012 (tpc_num * tpc_in_gpc_stride);
7013 address = base_address + tpc_addr;
7014 /*
7015 * The data for the TPCs is interleaved in the context buffer.
7016 * Example with num_tpcs = 2
7017 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7018 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7019 */
7020 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7021
7022 if (pri_addr == address) {
7023 *priv_offset = tpc_offset;
7024 nvgpu_log(g,
7025 gpu_dbg_fn | gpu_dbg_gpu_dbg,
7026						"egpc/etpc priv_offset=0x%08x",
7027 *priv_offset);
7028 return 0;
7029 }
7030 }
7031 }
7032 }
7033
7034
7035 /* Process the PPC segment. */
7036 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7037 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
7038 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
7039 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
7040 address = reg->addr;
7041 ppc_addr = pri_ppccs_addr_mask(address);
7042 base_address = gpc_base +
7043 (gpc_num * gpc_stride) +
7044 ppc_in_gpc_base +
7045 (ppc_num * ppc_in_gpc_stride);
7046 address = base_address + ppc_addr;
7047 /*
7048 * The data for the PPCs is interleaved in the context buffer.
7049 * Example with numPpcs = 2
7050 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7051 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7052 */
7053 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
7054
7055 if (pri_addr == address) {
7056 *priv_offset = ppc_offset;
7057 return 0;
7058 }
7059 }
7060 }
7061 }
7062
7063
7064 /* Process the GPC segment. */
7065 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7066 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
7067 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
7068
7069 address = reg->addr;
7070 gpc_addr = pri_gpccs_addr_mask(address);
7071 gpc_offset = reg->index;
7072
7073 base_address = gpc_base + (gpc_num * gpc_stride);
7074 address = base_address + gpc_addr;
7075
7076 if (pri_addr == address) {
7077 *priv_offset = gpc_offset;
7078 return 0;
7079 }
7080 }
7081 }
7082 return -EINVAL;
7083}
7084
7085static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7086 u8 *context,
7087 u32 *num_ppcs, u32 *ppc_mask,
7088 u32 *reg_ppc_count)
7089{
7090 u32 data32;
7091 u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
7092
7093 /*
7094 * if there is only 1 PES_PER_GPC, then we put the PES registers
7095 * in the GPC reglist, so we can't error out if ppc.count == 0
7096 */
7097 if ((!g->gr.ctx_vars.valid) ||
7098 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
7099 (num_pes_per_gpc > 1)))
7100 return -EINVAL;
7101
7102 data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());
7103
7104 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
7105 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
7106
7107 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
7108
7109 return 0;
7110}
7111
7112/*
7113 * This function will return the 32 bit offset for a priv register if it is
7114 * present in the context buffer. The context buffer is in CPU memory.
7115 */
7116static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7117 u32 addr,
7118 bool is_quad, u32 quad,
7119 u32 *context_buffer,
7120 u32 context_buffer_size,
7121 u32 *priv_offset)
7122{
7123 struct gr_gk20a *gr = &g->gr;
7124 u32 i, data32;
7125 int err;
7126 int addr_type; /*enum ctxsw_addr_type */
7127 u32 broadcast_flags;
7128 u32 gpc_num, tpc_num, ppc_num, be_num;
7129 u32 num_gpcs, num_tpcs, num_ppcs;
7130 u32 offset;
7131 u32 sys_priv_offset, gpc_priv_offset;
7132 u32 ppc_mask, reg_list_ppc_count;
7133 u8 *context;
7134 u32 offset_to_segment;
7135
7136 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
7137
7138 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
7139 &gpc_num, &tpc_num, &ppc_num, &be_num,
7140 &broadcast_flags);
7141 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg,
7142 "addr_type = %d, broadcast_flags: %08x",
7143 addr_type, broadcast_flags);
7144 if (err)
7145 return err;
7146
7147 context = (u8 *)context_buffer;
7148 if (!check_main_image_header_magic(context)) {
7149 nvgpu_err(g,
7150 "Invalid main header: magic value");
7151 return -EINVAL;
7152 }
7153 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7154
7155 /* Parse the FECS local header. */
7156 context += ctxsw_prog_ucode_header_size_in_bytes();
7157 if (!check_local_header_magic(context)) {
7158 nvgpu_err(g,
7159 "Invalid FECS local header: magic value");
7160 return -EINVAL;
7161 }
7162 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7163 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7164 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset);
7165
7166 /* If found in Ext buffer, ok.
7167 * If it failed and we expected to find it there (quad offset)
7168 * then return the error. Otherwise continue on.
7169 */
7170 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
7171 addr, is_quad, quad, context_buffer,
7172 context_buffer_size, priv_offset);
7173 if (!err || (err && is_quad)) {
7174 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg,
7175 "err = %d, is_quad = %s",
7176			  err, is_quad ? "true" : "false");
7177 return err;
7178 }
7179
7180 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7181 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7182 /* Find the offset in the FECS segment. */
7183 offset_to_segment = sys_priv_offset *
7184 ctxsw_prog_ucode_header_size_in_bytes();
7185
7186 err = gr_gk20a_process_context_buffer_priv_segment(g,
7187 addr_type, addr,
7188 0, 0, 0, 0,
7189 &offset);
7190 if (err)
7191 return err;
7192
7193 *priv_offset = (offset_to_segment + offset);
7194 return 0;
7195 }
7196
7197 if ((gpc_num + 1) > num_gpcs) {
7198 nvgpu_err(g,
7199 "GPC %d not in this context buffer.",
7200 gpc_num);
7201 return -EINVAL;
7202 }
7203
7204 /* Parse the GPCCS local header(s).*/
7205 for (i = 0; i < num_gpcs; i++) {
7206 context += ctxsw_prog_ucode_header_size_in_bytes();
7207 if (!check_local_header_magic(context)) {
7208 nvgpu_err(g,
7209 "Invalid GPCCS local header: magic value");
7210 return -EINVAL;
7211
7212 }
7213 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7214 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7215
7216 err = gr_gk20a_determine_ppc_configuration(g, context,
7217 &num_ppcs, &ppc_mask,
7218 &reg_list_ppc_count);
7219 if (err) {
7220 nvgpu_err(g, "determine ppc configuration failed");
7221 return err;
7222 }
7223
7224
7225 num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
7226
7227 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
7228 nvgpu_err(g,
7229 "GPC %d TPC %d not in this context buffer.",
7230 gpc_num, tpc_num);
7231 return -EINVAL;
7232 }
7233
7234 /* Find the offset in the GPCCS segment.*/
7235 if (i == gpc_num) {
7236 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7237				  "gpc_priv_offset 0x%08x",
7238 gpc_priv_offset);
7239 offset_to_segment = gpc_priv_offset *
7240 ctxsw_prog_ucode_header_size_in_bytes();
7241
7242 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7243 /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
7244 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7245 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7246 nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg,
7247					"egpc etpc offset_to_segment 0x%08x",
7248 offset_to_segment);
7249 offset_to_segment +=
7250 ((gr->ctx_vars.ctxsw_regs.tpc.count *
7251 num_tpcs) << 2);
7252 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7253 /* The ucode stores TPC data before PPC data.
7254 * Advance offset past TPC data to PPC data. */
7255 offset_to_segment +=
7256 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7257 gr->ctx_vars.ctxsw_regs.etpc.count) *
7258 num_tpcs) << 2);
7259 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7260 /* The ucode stores TPC/PPC data before GPC data.
7261 * Advance offset past TPC/PPC data to GPC data. */
7262 /* note 1 PES_PER_GPC case */
7263 u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
7264 GPU_LIT_NUM_PES_PER_GPC);
7265 if (num_pes_per_gpc > 1) {
7266 offset_to_segment +=
7267 ((((gr->ctx_vars.ctxsw_regs.tpc.count +
7268 gr->ctx_vars.ctxsw_regs.etpc.count) *
7269 num_tpcs) << 2) +
7270 ((reg_list_ppc_count * num_ppcs) << 2));
7271 } else {
7272 offset_to_segment +=
7273 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7274 gr->ctx_vars.ctxsw_regs.etpc.count) *
7275 num_tpcs) << 2);
7276 }
7277 } else {
7278 gk20a_dbg_fn("Unknown address type.");
7279 return -EINVAL;
7280 }
7281 err = gr_gk20a_process_context_buffer_priv_segment(g,
7282 addr_type, addr,
7283 i, num_tpcs,
7284 num_ppcs, ppc_mask,
7285 &offset);
7286 if (err)
7287 return -EINVAL;
7288
7289 *priv_offset = offset_to_segment + offset;
7290 return 0;
7291 }
7292 }
7293
7294 return -EINVAL;
7295}
7296
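/* Comparator used to keep the HWPM ctxsw buffer offset map ordered by
 * register address; it is passed to sort() when the map is built and to
 * bsearch() when a priv register address is looked up.
 */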
7297static int map_cmp(const void *a, const void *b)
7298{
7299 struct ctxsw_buf_offset_map_entry *e1 =
7300 (struct ctxsw_buf_offset_map_entry *)a;
7301 struct ctxsw_buf_offset_map_entry *e2 =
7302 (struct ctxsw_buf_offset_map_entry *)b;
7303
7304 if (e1->addr < e2->addr)
7305 return -1;
7306
7307 if (e1->addr > e2->addr)
7308 return 1;
7309 return 0;
7310}
7311
7312static int add_ctxsw_buffer_map_entries_pmsys(struct ctxsw_buf_offset_map_entry *map,
7313 struct aiv_list_gk20a *regs,
7314 u32 *count, u32 *offset,
7315 u32 max_cnt, u32 base, u32 mask)
7316{
7317 u32 idx;
7318 u32 cnt = *count;
7319 u32 off = *offset;
7320
7321 if ((cnt + regs->count) > max_cnt)
7322 return -EINVAL;
7323
7324 for (idx = 0; idx < regs->count; idx++) {
7325 if ((base + (regs->l[idx].addr & mask)) < 0xFFF)
7326 map[cnt].addr = base + (regs->l[idx].addr & mask)
7327 + NV_PCFG_BASE;
7328 else
7329 map[cnt].addr = base + (regs->l[idx].addr & mask);
7330 map[cnt++].offset = off;
7331 off += 4;
7332 }
7333 *count = cnt;
7334 *offset = off;
7335 return 0;
7336}
7337
7338static int add_ctxsw_buffer_map_entries_pmgpc(struct gk20a *g,
7339 struct ctxsw_buf_offset_map_entry *map,
7340 struct aiv_list_gk20a *regs,
7341 u32 *count, u32 *offset,
7342 u32 max_cnt, u32 base, u32 mask)
7343{
7344 u32 idx;
7345 u32 cnt = *count;
7346 u32 off = *offset;
7347
7348 if ((cnt + regs->count) > max_cnt)
7349 return -EINVAL;
7350
7351 	/* NOTE: The PPC offsets get added to the pm_gpc list if numPpc <= 1.
7352 	 * To handle the case of PPC registers being folded into the GPC list,
7353 	 * the code below checks for any PPC offsets and adds them using the
7354 	 * proper (PPC) mask.
7355 	 */
7356 for (idx = 0; idx < regs->count; idx++) {
7357 /* Check if the address is PPC address */
7358 if (pri_is_ppc_addr_shared(g, regs->l[idx].addr & mask)) {
7359 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g,
7360 GPU_LIT_PPC_IN_GPC_BASE);
7361 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
7362 GPU_LIT_PPC_IN_GPC_STRIDE);
7363 /* Use PPC mask instead of the GPC mask provided */
7364 u32 ppcmask = ppc_in_gpc_stride - 1;
7365
7366 map[cnt].addr = base + ppc_in_gpc_base
7367 + (regs->l[idx].addr & ppcmask);
7368 } else
7369 map[cnt].addr = base + (regs->l[idx].addr & mask);
7370 map[cnt++].offset = off;
7371 off += 4;
7372 }
7373 *count = cnt;
7374 *offset = off;
7375 return 0;
7376}
7377
7378static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map,
7379 struct aiv_list_gk20a *regs,
7380 u32 *count, u32 *offset,
7381 u32 max_cnt, u32 base, u32 mask)
7382{
7383 u32 idx;
7384 u32 cnt = *count;
7385 u32 off = *offset;
7386
7387 if ((cnt + regs->count) > max_cnt)
7388 return -EINVAL;
7389
7390 for (idx = 0; idx < regs->count; idx++) {
7391 map[cnt].addr = base + (regs->l[idx].addr & mask);
7392 map[cnt++].offset = off;
7393 off += 4;
7394 }
7395 *count = cnt;
7396 *offset = off;
7397 return 0;
7398}
7399
7400/* Helper function to add register entries to the register map for all
7401 * subunits
7402 */
7403static int add_ctxsw_buffer_map_entries_subunits(
7404 struct ctxsw_buf_offset_map_entry *map,
7405 struct aiv_list_gk20a *regs,
7406 u32 *count, u32 *offset,
7407 u32 max_cnt, u32 base,
7408 u32 num_units, u32 stride, u32 mask)
7409{
7410 u32 unit;
7411 u32 idx;
7412 u32 cnt = *count;
7413 u32 off = *offset;
7414
7415 if ((cnt + (regs->count * num_units)) > max_cnt)
7416 return -EINVAL;
7417
7418 /* Data is interleaved for units in ctxsw buffer */
7419 for (idx = 0; idx < regs->count; idx++) {
7420 for (unit = 0; unit < num_units; unit++) {
7421 map[cnt].addr = base + (regs->l[idx].addr & mask) +
7422 (unit * stride);
7423 map[cnt++].offset = off;
7424 off += 4;
7425 }
7426 }
7427 *count = cnt;
7428 *offset = off;
7429 return 0;
7430}
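
/* Illustrative sketch (editor's note, not original code): for a register
 * list { A, B } and two units with stride S, the helper above produces the
 * interleaved entries
 *
 *	{ .addr = base + A,     .offset = 0x0 }
 *	{ .addr = base + A + S, .offset = 0x4 }
 *	{ .addr = base + B,     .offset = 0x8 }
 *	{ .addr = base + B + S, .offset = 0xc }
 *
 * which matches the per-unit interleaving of the ctxsw buffer.
 */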
7431
7432static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g,
7433 struct ctxsw_buf_offset_map_entry *map,
7434 u32 *count, u32 *offset, u32 max_cnt)
7435{
7436 u32 num_gpcs = g->gr.gpc_count;
7437 u32 num_ppcs, num_tpcs, gpc_num, base;
7438 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7439 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7440 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7441 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7442 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7443 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7444
7445 for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) {
7446 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7447 base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base;
7448 if (add_ctxsw_buffer_map_entries_subunits(map,
7449 &g->gr.ctx_vars.ctxsw_regs.pm_tpc,
7450 count, offset, max_cnt, base, num_tpcs,
7451 tpc_in_gpc_stride,
7452 (tpc_in_gpc_stride - 1)))
7453 return -EINVAL;
7454
7455 num_ppcs = g->gr.gpc_ppc_count[gpc_num];
7456 base = gpc_base + (gpc_stride * gpc_num) + ppc_in_gpc_base;
7457 if (add_ctxsw_buffer_map_entries_subunits(map,
7458 &g->gr.ctx_vars.ctxsw_regs.pm_ppc,
7459 count, offset, max_cnt, base, num_ppcs,
7460 ppc_in_gpc_stride,
7461 (ppc_in_gpc_stride - 1)))
7462 return -EINVAL;
7463
7464 base = gpc_base + (gpc_stride * gpc_num);
7465 if (add_ctxsw_buffer_map_entries_pmgpc(g, map,
7466 &g->gr.ctx_vars.ctxsw_regs.pm_gpc,
7467 count, offset, max_cnt, base,
7468 (gpc_stride - 1)))
7469 return -EINVAL;
7470
7471 base = NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE * gpc_num;
7472 if (add_ctxsw_buffer_map_entries(map,
7473 &g->gr.ctx_vars.ctxsw_regs.pm_ucgpc,
7474 count, offset, max_cnt, base, ~0))
7475 return -EINVAL;
7476
7477 base = (NV_PERF_PMMGPC_CHIPLET_OFFSET * gpc_num);
7478 if (add_ctxsw_buffer_map_entries(map,
7479 &g->gr.ctx_vars.ctxsw_regs.perf_gpc,
7480 count, offset, max_cnt, base, ~0))
7481 return -EINVAL;
7482
7483 base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num);
7484 if (add_ctxsw_buffer_map_entries(map,
7485 &g->gr.ctx_vars.ctxsw_regs.gpc_router,
7486 count, offset, max_cnt, base, ~0))
7487 return -EINVAL;
7488
7489 *offset = ALIGN(*offset, 256);
7490 }
7491 return 0;
7492}
7493
7494/*
7495 * PM CTXSW BUFFER LAYOUT :
7496 *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE
7497 *| |
7498 *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words
7499 *|---------------------------------------------|
7500 *| |
7501 *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words
7502 *|---------------------------------------------|
7503 *| |
7504 *| LIST_compressed_nv_perf_ctx_reg_sysrouter|Space allocated: numRegs words
7505 *|---------------------------------------------|
7506 *| |
7507 *| LIST_compressed_nv_perf_ctx_reg_PMA |Space allocated: numRegs words
7508 *|---------------------------------------------|
7509 *| PADDING for 256 byte alignment |
7510 *|---------------------------------------------|<----256 byte aligned
7511 *| LIST_compressed_nv_perf_fbp_ctx_regs |
7512 *| |Space allocated: numRegs * n words (for n FB units)
7513 *|---------------------------------------------|
7514 *| LIST_compressed_nv_perf_fbprouter_ctx_regs |
7515 *| |Space allocated: numRegs * n words (for n FB units)
7516 *|---------------------------------------------|
7517 *| LIST_compressed_pm_fbpa_ctx_regs |
7518 *| |Space allocated: numRegs * n words (for n FB units)
7519 *|---------------------------------------------|
7520 *| LIST_compressed_pm_rop_ctx_regs |
7521 *|---------------------------------------------|
7522 *| LIST_compressed_pm_ltc_ctx_regs |
7523 *| LTC0 LTS0 |
7524 *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units)
7525 *| LTCn LTS0 |
7526 *| LTC0 LTS1 |
7527 *| LTC1 LTS1 |
7528 *| LTCn LTS1 |
7529 *| LTC0 LTSn |
7530 *| LTC1 LTSn |
7531 *| LTCn LTSn |
7532 *|---------------------------------------------|
7533 *| PADDING for 256 byte alignment |
7534 *|---------------------------------------------|<----256 byte aligned
7535 *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate
7536 *| REG0 TPC1 | all the GPC/TPC register lists
7537 *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned
7538 *| LIST_pm_ctx_reg_TPC REG1 TPC0 |
7539 *| * numTpcs REG1 TPC1 |
7540 *| LIST_pm_ctx_reg_PPC REG1 TPCn |
7541 *| * numPpcs REGn TPC0 |
7542 *| LIST_pm_ctx_reg_GPC REGn TPC1 |
7543 *| List_pm_ctx_reg_uc_GPC REGn TPCn |
7544 *| LIST_nv_perf_ctx_reg_GPC |
7545 *| ---- |--
7546 *| GPC1 . |
7547 *| . |<----
7548 *|---------------------------------------------|
7549 *= =
7550 *| GPCn |
7551 *= =
7552 *|---------------------------------------------|
7553 */
7554
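/* Worked example (editor's illustration, hypothetical counts): with 10
 * compressed pm SYS registers, 6 perf SYS registers, 4 SYS router registers
 * and 2 PMA registers, the first map entries consume (10 + 6 + 4 + 2) * 4 =
 * 88 bytes, after which the running offset is padded to 0x100 by
 * ALIGN(offset, 256) before the per-FBP lists begin. The same 256-byte
 * alignment is applied again before the per-GPC region and after each GPC,
 * matching the diagram above.
 */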
7555static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g)
7556{
7557 u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size;
7558 u32 hwpm_ctxsw_reg_count_max;
7559 u32 map_size;
7560 u32 i, count = 0;
7561 u32 offset = 0;
7562 struct ctxsw_buf_offset_map_entry *map;
7563 u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
7564 u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
7565 u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
7566 u32 num_ltc = g->ops.gr.get_max_ltc_per_fbp(g) * g->gr.num_fbps;
7567
7568 if (hwpm_ctxsw_buffer_size == 0) {
7569 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg,
7570 "no PM Ctxsw buffer memory in context buffer");
7571 return -EINVAL;
7572 }
7573
7574 hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2;
7575 map_size = hwpm_ctxsw_reg_count_max * sizeof(*map);
7576
7577 map = nvgpu_big_zalloc(g, map_size);
7578 if (!map)
7579 return -ENOMEM;
7580
7581 /* Add entries from _LIST_pm_ctx_reg_SYS */
7582 if (add_ctxsw_buffer_map_entries_pmsys(map, &g->gr.ctx_vars.ctxsw_regs.pm_sys,
7583 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0))
7584 goto cleanup;
7585
7586 /* Add entries from _LIST_nv_perf_ctx_reg_SYS */
7587 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys,
7588 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0))
7589 goto cleanup;
7590
7591 /* Add entries from _LIST_nv_perf_sysrouter_ctx_reg*/
7592 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys_router,
7593 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0))
7594 goto cleanup;
7595
7596 /* Add entries from _LIST_nv_perf_pma_ctx_reg*/
7597 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_pma,
7598 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0))
7599 goto cleanup;
7600
7601 offset = ALIGN(offset, 256);
7602
7603 /* Add entries from _LIST_nv_perf_fbp_ctx_regs */
7604 if (add_ctxsw_buffer_map_entries_subunits(map,
7605 &g->gr.ctx_vars.ctxsw_regs.fbp,
7606 &count, &offset,
7607 hwpm_ctxsw_reg_count_max, 0,
7608 g->gr.num_fbps, NV_PMM_FBP_STRIDE, ~0))
7609 goto cleanup;
7610
7611 /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */
7612 if (add_ctxsw_buffer_map_entries_subunits(map,
7613 &g->gr.ctx_vars.ctxsw_regs.fbp_router,
7614 &count, &offset,
7615 hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps,
7616 NV_PERF_PMM_FBP_ROUTER_STRIDE, ~0))
7617 goto cleanup;
7618
7619 /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */
7620 if (add_ctxsw_buffer_map_entries_subunits(map,
7621 &g->gr.ctx_vars.ctxsw_regs.pm_fbpa,
7622 &count, &offset,
7623 hwpm_ctxsw_reg_count_max, 0,
7624 num_fbpas, fbpa_stride, ~0))
7625 goto cleanup;
7626
7627 /* Add entries from _LIST_nv_pm_rop_ctx_regs */
7628 if (add_ctxsw_buffer_map_entries(map,
7629 &g->gr.ctx_vars.ctxsw_regs.pm_rop,
7630 &count, &offset,
7631 hwpm_ctxsw_reg_count_max, 0, ~0))
7632 goto cleanup;
7633
7634 /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */
7635 if (add_ctxsw_buffer_map_entries_subunits(map,
7636 &g->gr.ctx_vars.ctxsw_regs.pm_ltc,
7637 &count, &offset,
7638 hwpm_ctxsw_reg_count_max, 0,
7639 num_ltc, ltc_stride, ~0))
7640 goto cleanup;
7641
7642 offset = ALIGN(offset, 256);
7643
7644 /* Add GPC entries */
7645 if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset,
7646 hwpm_ctxsw_reg_count_max))
7647 goto cleanup;
7648
7649 if (offset > hwpm_ctxsw_buffer_size) {
7650 nvgpu_err(g, "offset > buffer size");
7651 goto cleanup;
7652 }
7653
7654 sort(map, count, sizeof(*map), map_cmp, NULL);
7655
7656 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map;
7657 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count;
7658
7659 gk20a_dbg_info("Reg Addr => HWPM Ctxt switch buffer offset");
7660
7661 for (i = 0; i < count; i++)
7662 gk20a_dbg_info("%08x => %08x", map[i].addr, map[i].offset);
7663
7664 return 0;
7665cleanup:
7666 nvgpu_err(g, "Failed to create HWPM buffer offset map");
7667 nvgpu_big_free(g, map);
7668 return -EINVAL;
7669}
7670
7671/*
7672 * This function will return the 32 bit offset for a priv register if it is
7673 * present in the PM context buffer.
7674 */
7675static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
7676 u32 addr,
7677 u32 *priv_offset)
7678{
7679 struct gr_gk20a *gr = &g->gr;
7680 int err = 0;
7681 u32 count;
7682 struct ctxsw_buf_offset_map_entry *map, *result, map_key;
7683
7684 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
7685
7686 /* Create map of pri address and pm offset if necessary */
7687 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) {
7688 err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g);
7689 if (err)
7690 return err;
7691 }
7692
7693 *priv_offset = 0;
7694
7695 map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map;
7696 count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count;
7697
7698 map_key.addr = addr;
7699 result = bsearch(&map_key, map, count, sizeof(*map), map_cmp);
7700
7701 	if (result) {
7702 		*priv_offset = result->offset;
7703 	} else {
7704 nvgpu_err(g, "Lookup failed for address 0x%x", addr);
7705 err = -EINVAL;
7706 }
7707 return err;
7708}
7709
7710bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch)
7711{
7712 int curr_gr_ctx, curr_gr_tsgid;
7713 struct gk20a *g = ch->g;
7714 struct channel_gk20a *curr_ch;
7715 bool ret = false;
7716
7717 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
7718
7719 /* when contexts are unloaded from GR, the valid bit is reset
7720 * but the instance pointer information remains intact. So the
7721 * valid bit must be checked to be absolutely certain that a
7722 * valid context is currently resident.
7723 */
7724 if (!gr_fecs_current_ctx_valid_v(curr_gr_ctx))
7725 		return false;
7726
7727 curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx,
7728 &curr_gr_tsgid);
7729
7730 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg,
7731 "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
7732 " ch->chid=%d",
7733 curr_ch ? curr_ch->chid : -1,
7734 curr_gr_tsgid,
7735 ch->tsgid,
7736 ch->chid);
7737
7738 if (!curr_ch)
7739 return false;
7740
7741 if (ch->chid == curr_ch->chid)
7742 ret = true;
7743
7744 if (gk20a_is_channel_marked_as_tsg(ch) && (ch->tsgid == curr_gr_tsgid))
7745 ret = true;
7746
7747 gk20a_channel_put(curr_ch);
7748 return ret;
7749}
7750
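/* Apply a batch of debugger register operations to a channel's context.
 * Two passes are made over ctx_ops: pass 0 applies the write ops, pass 1 the
 * read ops. When the channel is the currently resident context the registers
 * are accessed directly with gk20a_readl()/gk20a_writel(); otherwise each
 * priv address is translated to offsets within the GR (or HWPM) context
 * image and the saved image is patched or read via nvgpu_mem accessors.
 */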
7751int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7752 struct nvgpu_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
7753 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
7754 bool ch_is_curr_ctx)
7755{
7756 struct gk20a *g = ch->g;
7757 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
7758 bool gr_ctx_ready = false;
7759 bool pm_ctx_ready = false;
7760 struct nvgpu_mem *current_mem = NULL;
7761 u32 i, j, offset, v;
7762 struct gr_gk20a *gr = &g->gr;
7763 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
7764 u32 max_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
7765 sm_per_tpc;
7766 u32 *offsets = NULL;
7767 u32 *offset_addrs = NULL;
7768 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
7769 int err = 0, pass;
7770
7771 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
7772 num_ctx_wr_ops, num_ctx_rd_ops);
7773
7774 if (ch_is_curr_ctx) {
7775 for (pass = 0; pass < 2; pass++) {
7776 ctx_op_nr = 0;
7777 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
7778 /* only do ctx ops and only on the right pass */
7779 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
7780 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
7781 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
7782 continue;
7783
7784 /* if this is a quad access, setup for special access*/
7785 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)
7786 && g->ops.gr.access_smpc_reg)
7787 g->ops.gr.access_smpc_reg(g,
7788 ctx_ops[i].quad,
7789 ctx_ops[i].offset);
7790 offset = ctx_ops[i].offset;
7791
7792 if (pass == 0) { /* write pass */
7793 v = gk20a_readl(g, offset);
7794 v &= ~ctx_ops[i].and_n_mask_lo;
7795 v |= ctx_ops[i].value_lo;
7796 gk20a_writel(g, offset, v);
7797
7798 gk20a_dbg(gpu_dbg_gpu_dbg,
7799 "direct wr: offset=0x%x v=0x%x",
7800 offset, v);
7801
7802 if (ctx_ops[i].op == REGOP(WRITE_64)) {
7803 v = gk20a_readl(g, offset + 4);
7804 v &= ~ctx_ops[i].and_n_mask_hi;
7805 v |= ctx_ops[i].value_hi;
7806 gk20a_writel(g, offset + 4, v);
7807
7808 gk20a_dbg(gpu_dbg_gpu_dbg,
7809 "direct wr: offset=0x%x v=0x%x",
7810 offset + 4, v);
7811 }
7812
7813 } else { /* read pass */
7814 ctx_ops[i].value_lo =
7815 gk20a_readl(g, offset);
7816
7817 gk20a_dbg(gpu_dbg_gpu_dbg,
7818 "direct rd: offset=0x%x v=0x%x",
7819 offset, ctx_ops[i].value_lo);
7820
7821 if (ctx_ops[i].op == REGOP(READ_64)) {
7822 ctx_ops[i].value_hi =
7823 gk20a_readl(g, offset + 4);
7824
7825 gk20a_dbg(gpu_dbg_gpu_dbg,
7826 "direct rd: offset=0x%x v=0x%x",
7827 						   offset + 4, ctx_ops[i].value_hi);
7828 } else
7829 ctx_ops[i].value_hi = 0;
7830 }
7831 ctx_op_nr++;
7832 }
7833 }
7834 goto cleanup;
7835 }
7836
7837 /* they're the same size, so just use one alloc for both */
7838 offsets = nvgpu_kzalloc(g, 2 * sizeof(u32) * max_offsets);
7839 if (!offsets) {
7840 err = -ENOMEM;
7841 goto cleanup;
7842 }
7843 offset_addrs = offsets + max_offsets;
7844
7845 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
7846 if (err)
7847 goto cleanup;
7848
7849 g->ops.mm.l2_flush(g, true);
7850
7851 /* write to appropriate place in context image,
7852 * first have to figure out where that really is */
7853
7854 /* first pass is writes, second reads */
7855 for (pass = 0; pass < 2; pass++) {
7856 ctx_op_nr = 0;
7857 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
7858 u32 num_offsets;
7859
7860 /* only do ctx ops and only on the right pass */
7861 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
7862 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
7863 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
7864 continue;
7865
7866 err = gr_gk20a_get_ctx_buffer_offsets(g,
7867 ctx_ops[i].offset,
7868 max_offsets,
7869 offsets, offset_addrs,
7870 &num_offsets,
7871 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
7872 ctx_ops[i].quad);
7873 if (!err) {
7874 if (!gr_ctx_ready) {
7875 /* would have been a variant of
7876 * gr_gk20a_apply_instmem_overrides,
7877 * recoded in-place instead.
7878 */
7879 if (nvgpu_mem_begin(g, &ch_ctx->gr_ctx->mem)) {
7880 err = -ENOMEM;
7881 goto cleanup;
7882 }
7883 gr_ctx_ready = true;
7884 }
7885 current_mem = &ch_ctx->gr_ctx->mem;
7886 } else {
7887 err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
7888 ctx_ops[i].offset,
7889 max_offsets,
7890 offsets, offset_addrs,
7891 &num_offsets);
7892 if (err) {
7893 gk20a_dbg(gpu_dbg_gpu_dbg,
7894 "ctx op invalid offset: offset=0x%x",
7895 ctx_ops[i].offset);
7896 ctx_ops[i].status =
7897 NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
7898 continue;
7899 }
7900 if (!pm_ctx_ready) {
7901 /* Make sure ctx buffer was initialized */
7902 if (!ch_ctx->pm_ctx.mem.priv.pages) {
7903 nvgpu_err(g,
7904 "Invalid ctx buffer");
7905 err = -EINVAL;
7906 goto cleanup;
7907 }
7908 if (nvgpu_mem_begin(g, &ch_ctx->pm_ctx.mem)) {
7909 err = -ENOMEM;
7910 goto cleanup;
7911 }
7912 pm_ctx_ready = true;
7913 }
7914 current_mem = &ch_ctx->pm_ctx.mem;
7915 }
7916
7917 /* if this is a quad access, setup for special access*/
7918 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD) &&
7919 g->ops.gr.access_smpc_reg)
7920 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
7921 ctx_ops[i].offset);
7922
7923 for (j = 0; j < num_offsets; j++) {
7924 /* sanity check gr ctxt offsets,
7925 * don't write outside, worst case
7926 */
7927 if ((current_mem == &ch_ctx->gr_ctx->mem) &&
7928 (offsets[j] >= g->gr.ctx_vars.golden_image_size))
7929 continue;
7930 if (pass == 0) { /* write pass */
7931 v = nvgpu_mem_rd(g, current_mem, offsets[j]);
7932 v &= ~ctx_ops[i].and_n_mask_lo;
7933 v |= ctx_ops[i].value_lo;
7934 nvgpu_mem_wr(g, current_mem, offsets[j], v);
7935
7936 gk20a_dbg(gpu_dbg_gpu_dbg,
7937 "context wr: offset=0x%x v=0x%x",
7938 offsets[j], v);
7939
7940 if (ctx_ops[i].op == REGOP(WRITE_64)) {
7941 v = nvgpu_mem_rd(g, current_mem, offsets[j] + 4);
7942 v &= ~ctx_ops[i].and_n_mask_hi;
7943 v |= ctx_ops[i].value_hi;
7944 nvgpu_mem_wr(g, current_mem, offsets[j] + 4, v);
7945
7946 gk20a_dbg(gpu_dbg_gpu_dbg,
7947 "context wr: offset=0x%x v=0x%x",
7948 offsets[j] + 4, v);
7949 }
7950
7951 /* check to see if we need to add a special WAR
7952 for some of the SMPC perf regs */
7953 gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
7954 v, current_mem);
7955
7956 } else { /* read pass */
7957 ctx_ops[i].value_lo =
7958 nvgpu_mem_rd(g, current_mem, offsets[0]);
7959
7960 gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
7961 offsets[0], ctx_ops[i].value_lo);
7962
7963 if (ctx_ops[i].op == REGOP(READ_64)) {
7964 ctx_ops[i].value_hi =
7965 nvgpu_mem_rd(g, current_mem, offsets[0] + 4);
7966
7967 gk20a_dbg(gpu_dbg_gpu_dbg,
7968 "context rd: offset=0x%x v=0x%x",
7969 offsets[0] + 4, ctx_ops[i].value_hi);
7970 } else
7971 ctx_ops[i].value_hi = 0;
7972 }
7973 }
7974 ctx_op_nr++;
7975 }
7976 }
7977
7978 cleanup:
7979 if (offsets)
7980 nvgpu_kfree(g, offsets);
7981
7982 if (ch_ctx->patch_ctx.mem.cpu_va)
7983 gr_gk20a_ctx_patch_write_end(g, ch_ctx, gr_ctx_ready);
7984 if (gr_ctx_ready)
7985 nvgpu_mem_end(g, &ch_ctx->gr_ctx->mem);
7986 if (pm_ctx_ready)
7987 nvgpu_mem_end(g, &ch_ctx->pm_ctx.mem);
7988
7989 return err;
7990}
7991
7992int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7993 struct nvgpu_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
7994 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
7995{
7996 struct gk20a *g = ch->g;
7997 int err, tmp_err;
7998 bool ch_is_curr_ctx;
7999
8000 	/* Disable channel switching.
8001 	 * Once ctxsw is stopped, the hardware state can be inspected to
8002 	 * determine whether the context we're interested in is resident.
8003 	 */
8004 err = gr_gk20a_disable_ctxsw(g);
8005 if (err) {
8006 nvgpu_err(g, "unable to stop gr ctxsw");
8007 /* this should probably be ctx-fatal... */
8008 return err;
8009 }
8010
8011 ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
8012
8013 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
8014 ch_is_curr_ctx);
8015
8016 err = __gr_gk20a_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
8017 num_ctx_rd_ops, ch_is_curr_ctx);
8018
8019 tmp_err = gr_gk20a_enable_ctxsw(g);
8020 if (tmp_err) {
8021 nvgpu_err(g, "unable to restart ctxsw!");
8022 err = tmp_err;
8023 }
8024
8025 return err;
8026}
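
/* Usage sketch (editor's illustration; mirrors gr_gk20a_inval_icache() and
 * gr_gk20a_set_sm_debug_mode() further below): a single context register
 * read can be issued by filling one nvgpu_dbg_gpu_reg_op and passing it with
 * zero write ops and one read op, e.g.
 *
 *	struct nvgpu_dbg_gpu_reg_op op = {
 *		.op = REGOP(READ_32),
 *		.type = REGOP(TYPE_GR_CTX),
 *		.offset = gr_pri_gpc0_gcc_dbg_r(),
 *	};
 *	err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1);
 *
 * On success op.value_lo holds the value read from the resident context or
 * from the saved context image.
 */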
8027
8028void gr_gk20a_commit_global_pagepool(struct gk20a *g,
8029 struct channel_ctx_gk20a *ch_ctx,
8030 u64 addr, u32 size, bool patch)
8031{
8032 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
8033 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
8034
8035 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
8036 gr_scc_pagepool_total_pages_f(size) |
8037 gr_scc_pagepool_valid_true_f(), patch);
8038
8039 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
8040 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
8041
8042 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
8043 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
8044
8045 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
8046 gr_pd_pagepool_total_pages_f(size) |
8047 gr_pd_pagepool_valid_true_f(), patch);
8048}
8049
8050void gk20a_init_gr(struct gk20a *g)
8051{
8052 nvgpu_cond_init(&g->gr.init_wq);
8053}
8054
8055int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
8056 u32 global_esr_mask, bool check_errors)
8057{
8058 bool locked_down;
8059 bool no_error_pending;
8060 u32 delay = GR_IDLE_CHECK_DEFAULT;
8061 bool mmu_debug_mode_enabled = g->ops.fb.is_debug_mode_enabled(g);
8062 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8063 u32 dbgr_status0 = 0, dbgr_control0 = 0;
8064 u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
8065 struct nvgpu_timeout timeout;
8066 u32 warp_esr;
8067
8068 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
8069 "GPC%d TPC%d SM%d: locking down SM", gpc, tpc, sm);
8070
8071 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
8072 NVGPU_TIMER_CPU_TIMER);
8073
8074 /* wait for the sm to lock down */
8075 do {
8076 u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8077 gpc, tpc, sm);
8078 dbgr_status0 = gk20a_readl(g,
8079 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
8080
8081 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
8082
8083 locked_down =
8084 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
8085 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
8086 no_error_pending =
8087 check_errors &&
8088 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
8089 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
8090 ((global_esr & ~global_esr_mask) == 0);
8091
8092 if (locked_down || no_error_pending) {
8093 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
8094 "GPC%d TPC%d SM%d: locked down SM",
8095 gpc, tpc, sm);
8096 return 0;
8097 }
8098
8099 /* if an mmu fault is pending and mmu debug mode is not
8100 * enabled, the sm will never lock down. */
8101 if (!mmu_debug_mode_enabled &&
8102 (g->ops.mm.mmu_fault_pending(g))) {
8103 nvgpu_err(g,
8104 "GPC%d TPC%d: mmu fault pending,"
8105 " SM%d will never lock down!", gpc, tpc, sm);
8106 return -EFAULT;
8107 }
8108
8109 nvgpu_usleep_range(delay, delay * 2);
8110 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
8111 } while (!nvgpu_timeout_expired(&timeout));
8112
8113 dbgr_control0 = gk20a_readl(g,
8114 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8115
8116 /* 64 bit read */
8117 warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32;
8118 warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset);
8119
8120 /* 64 bit read */
8121 warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32;
8122 warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset);
8123
8124 /* 64 bit read */
8125 warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32;
8126 warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset);
8127
8128 nvgpu_err(g,
8129 "GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc);
8130 nvgpu_err(g,
8131 "STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx",
8132 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0,
8133 warps_valid, warps_paused, warps_trapped);
8134
8135 return -ETIMEDOUT;
8136}
8137
8138void gk20a_gr_suspend_single_sm(struct gk20a *g,
8139 u32 gpc, u32 tpc, u32 sm,
8140 u32 global_esr_mask, bool check_errors)
8141{
8142 int err;
8143 u32 dbgr_control0;
8144 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8145
8146 /* if an SM debugger isn't attached, skip suspend */
8147 if (!g->ops.gr.sm_debugger_attached(g)) {
8148 nvgpu_err(g,
8149 "SM debugger not attached, skipping suspend!");
8150 return;
8151 }
8152
8153 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8154 "suspending gpc:%d, tpc:%d, sm%d", gpc, tpc, sm);
8155
8156 /* assert stop trigger. */
8157 dbgr_control0 = gk20a_readl(g,
8158 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8159 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8160 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset,
8161 dbgr_control0);
8162
8163 err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm,
8164 global_esr_mask, check_errors);
8165 if (err) {
8166 nvgpu_err(g,
8167 "SuspendSm failed");
8168 return;
8169 }
8170}
8171
8172void gk20a_gr_suspend_all_sms(struct gk20a *g,
8173 u32 global_esr_mask, bool check_errors)
8174{
8175 struct gr_gk20a *gr = &g->gr;
8176 u32 gpc, tpc, sm;
8177 int err;
8178 u32 dbgr_control0;
8179 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8180
8181 /* if an SM debugger isn't attached, skip suspend */
8182 if (!g->ops.gr.sm_debugger_attached(g)) {
8183 nvgpu_err(g,
8184 "SM debugger not attached, skipping suspend!");
8185 return;
8186 }
8187
8188 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending all sms");
8189 /* assert stop trigger. uniformity assumption: all SMs will have
8190 * the same state in dbg_control0.
8191 */
8192 dbgr_control0 =
8193 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8194 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8195
8196 /* broadcast write */
8197 gk20a_writel(g,
8198 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8199
8200 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8201 for (tpc = 0; tpc < gr_gk20a_get_tpc_count(gr, gpc); tpc++) {
8202 for (sm = 0; sm < sm_per_tpc; sm++) {
8203 err = g->ops.gr.wait_for_sm_lock_down(g,
8204 gpc, tpc, sm,
8205 global_esr_mask, check_errors);
8206 if (err) {
8207 nvgpu_err(g, "SuspendAllSms failed");
8208 return;
8209 }
8210 }
8211 }
8212 }
8213}
8214
8215void gk20a_gr_resume_single_sm(struct gk20a *g,
8216 u32 gpc, u32 tpc, u32 sm)
8217{
8218 u32 dbgr_control0;
8219 u32 offset;
8220 /*
8221 * The following requires some clarification. Despite the fact that both
8222 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8223 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8224 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8225 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8226 * (_DISABLE) as well.
8227	 *
8228 * Advice from the arch group: Disable the stop trigger first, as a
8229 * separate operation, in order to ensure that the trigger has taken
8230 * effect, before enabling the run trigger.
8231 */
8232
8233 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8234
8235 	/* De-assert stop trigger */
8236 dbgr_control0 =
8237 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8238 dbgr_control0 = set_field(dbgr_control0,
8239 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(),
8240 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f());
8241 gk20a_writel(g,
8242 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8243
8244 /* Run trigger */
8245 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8246 gk20a_writel(g,
8247 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8248}
8249
8250void gk20a_gr_resume_all_sms(struct gk20a *g)
8251{
8252 u32 dbgr_control0;
8253 /*
8254 * The following requires some clarification. Despite the fact that both
8255 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8256 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8257 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8258 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8259 * (_DISABLE) as well.
8260	 *
8261 * Advice from the arch group: Disable the stop trigger first, as a
8262 * separate operation, in order to ensure that the trigger has taken
8263 * effect, before enabling the run trigger.
8264 */
8265
8266 	/* De-assert stop trigger */
8267 dbgr_control0 =
8268 gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r());
8269 dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8270 gk20a_writel(g,
8271 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8272
8273 /* Run trigger */
8274 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8275 gk20a_writel(g,
8276 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8277}
8278
8279int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
8280 struct channel_gk20a *ch, u64 sms, bool enable)
8281{
8282 struct nvgpu_dbg_gpu_reg_op *ops;
8283 unsigned int i = 0, sm_id;
8284 int err;
8285 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8286 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8287
8288 ops = nvgpu_kcalloc(g, g->gr.no_of_sm, sizeof(*ops));
8289 if (!ops)
8290 return -ENOMEM;
8291 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
8292 int gpc, tpc;
8293 u32 tpc_offset, gpc_offset, reg_offset, reg_mask, reg_val;
8294
8295 		if (!(sms & (1ULL << sm_id)))
8296 continue;
8297
8298 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8299 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8300
8301 tpc_offset = tpc_in_gpc_stride * tpc;
8302 gpc_offset = gpc_stride * gpc;
8303 reg_offset = tpc_offset + gpc_offset;
8304
8305 ops[i].op = REGOP(WRITE_32);
8306 ops[i].type = REGOP(TYPE_GR_CTX);
8307 ops[i].offset = gr_gpc0_tpc0_sm_dbgr_control0_r() + reg_offset;
8308
8309 reg_mask = 0;
8310 reg_val = 0;
8311 if (enable) {
8312 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8313 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f();
8314 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m();
8315 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f();
8316 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m();
8317 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f();
8318 } else {
8319 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8320 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f();
8321 }
8322
8323 ops[i].and_n_mask_lo = reg_mask;
8324 ops[i].value_lo = reg_val;
8325 i++;
8326 }
8327
8328 err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0);
8329 if (err)
8330 nvgpu_err(g, "Failed to access register");
8331 nvgpu_kfree(g, ops);
8332 return err;
8333}
8334
8335 /*
8336  * gr_gk20a_suspend_context()
8337  * This API should be called with the dbg_session lock held
8338  * and ctxsw disabled.
8339  * Returns a bool indicating whether the context was resident
8340  * or not.
8341  */
8342bool gr_gk20a_suspend_context(struct channel_gk20a *ch)
8343{
8344 struct gk20a *g = ch->g;
8345 bool ctx_resident = false;
8346
8347 if (gk20a_is_channel_ctx_resident(ch)) {
8348 g->ops.gr.suspend_all_sms(g, 0, false);
8349 ctx_resident = true;
8350 } else {
8351 gk20a_disable_channel_tsg(g, ch);
8352 }
8353
8354 return ctx_resident;
8355}
8356
8357bool gr_gk20a_resume_context(struct channel_gk20a *ch)
8358{
8359 struct gk20a *g = ch->g;
8360 bool ctx_resident = false;
8361
8362 if (gk20a_is_channel_ctx_resident(ch)) {
8363 g->ops.gr.resume_all_sms(g);
8364 ctx_resident = true;
8365 } else {
8366 gk20a_enable_channel_tsg(g, ch);
8367 }
8368
8369 return ctx_resident;
8370}
8371
8372int gr_gk20a_suspend_contexts(struct gk20a *g,
8373 struct dbg_session_gk20a *dbg_s,
8374 int *ctx_resident_ch_fd)
8375{
8376 int local_ctx_resident_ch_fd = -1;
8377 bool ctx_resident;
8378 struct channel_gk20a *ch;
8379 struct dbg_session_channel_data *ch_data;
8380 int err = 0;
8381
8382 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8383
8384 err = gr_gk20a_disable_ctxsw(g);
8385 if (err) {
8386 nvgpu_err(g, "unable to stop gr ctxsw");
8387 goto clean_up;
8388 }
8389
8390 nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
8391
8392 list_for_each_entry(ch_data, &dbg_s->ch_list, ch_entry) {
8393 ch = g->fifo.channel + ch_data->chid;
8394
8395 ctx_resident = gr_gk20a_suspend_context(ch);
8396 if (ctx_resident)
8397 local_ctx_resident_ch_fd = ch_data->channel_fd;
8398 }
8399
8400 nvgpu_mutex_release(&dbg_s->ch_list_lock);
8401
8402 err = gr_gk20a_enable_ctxsw(g);
8403 if (err)
8404 nvgpu_err(g, "unable to restart ctxsw!");
8405
8406 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8407
8408clean_up:
8409 nvgpu_mutex_release(&g->dbg_sessions_lock);
8410
8411 return err;
8412}
8413
8414int gr_gk20a_resume_contexts(struct gk20a *g,
8415 struct dbg_session_gk20a *dbg_s,
8416 int *ctx_resident_ch_fd)
8417{
8418 int local_ctx_resident_ch_fd = -1;
8419 bool ctx_resident;
8420 struct channel_gk20a *ch;
8421 int err = 0;
8422 struct dbg_session_channel_data *ch_data;
8423
8424 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8425
8426 err = gr_gk20a_disable_ctxsw(g);
8427 if (err) {
8428 nvgpu_err(g, "unable to stop gr ctxsw");
8429 goto clean_up;
8430 }
8431
8432 list_for_each_entry(ch_data, &dbg_s->ch_list, ch_entry) {
8433 ch = g->fifo.channel + ch_data->chid;
8434
8435 ctx_resident = gr_gk20a_resume_context(ch);
8436 if (ctx_resident)
8437 local_ctx_resident_ch_fd = ch_data->channel_fd;
8438 }
8439
8440 err = gr_gk20a_enable_ctxsw(g);
8441 if (err)
8442 nvgpu_err(g, "unable to restart ctxsw!");
8443
8444 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8445
8446clean_up:
8447 nvgpu_mutex_release(&g->dbg_sessions_lock);
8448
8449 return err;
8450}
8451
8452int gr_gk20a_inval_icache(struct gk20a *g, struct channel_gk20a *ch)
8453{
8454 int err = 0;
8455 u32 cache_ctrl, regval;
8456 struct nvgpu_dbg_gpu_reg_op ops;
8457
8458 ops.op = REGOP(READ_32);
8459 ops.type = REGOP(TYPE_GR_CTX);
8460 ops.status = REGOP(STATUS_SUCCESS);
8461 ops.value_hi = 0;
8462 ops.and_n_mask_lo = 0;
8463 ops.and_n_mask_hi = 0;
8464 ops.offset = gr_pri_gpc0_gcc_dbg_r();
8465
8466 err = gr_gk20a_exec_ctx_ops(ch, &ops, 1, 0, 1);
8467 if (err) {
8468 nvgpu_err(g, "Failed to read register");
8469 return err;
8470 }
8471
8472 regval = ops.value_lo;
8473
8474 ops.op = REGOP(WRITE_32);
8475 ops.value_lo = set_field(regval, gr_pri_gpcs_gcc_dbg_invalidate_m(), 1);
8476 err = gr_gk20a_exec_ctx_ops(ch, &ops, 1, 1, 0);
8477 if (err) {
8478 nvgpu_err(g, "Failed to write register");
8479 return err;
8480 }
8481
8482 ops.op = REGOP(READ_32);
8483 ops.offset = gr_pri_gpc0_tpc0_sm_cache_control_r();
8484 err = gr_gk20a_exec_ctx_ops(ch, &ops, 1, 0, 1);
8485 if (err) {
8486 nvgpu_err(g, "Failed to read register");
8487 return err;
8488 }
8489
8490 cache_ctrl = gk20a_readl(g, gr_pri_gpc0_tpc0_sm_cache_control_r());
8491 cache_ctrl = set_field(cache_ctrl, gr_pri_gpcs_tpcs_sm_cache_control_invalidate_cache_m(), 1);
8492 gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cache_control_r(), cache_ctrl);
8493
8494 return 0;
8495}
8496
8497int gr_gk20a_trigger_suspend(struct gk20a *g)
8498{
8499 int err = 0;
8500 u32 dbgr_control0;
8501
8502 /* assert stop trigger. uniformity assumption: all SMs will have
8503 * the same state in dbg_control0. */
8504 dbgr_control0 =
8505 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8506 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8507
8508 /* broadcast write */
8509 gk20a_writel(g,
8510 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8511
8512 return err;
8513}
8514
8515int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state)
8516{
8517 int err = 0;
8518 struct gr_gk20a *gr = &g->gr;
8519 u32 gpc, tpc, sm, sm_id;
8520 u32 global_mask;
8521
8522 /* Wait for the SMs to reach full stop. This condition is:
8523 * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE)
8524 * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp
8525 * masks.
8526 	 */
8527 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
8528
8529 /* Lock down all SMs */
8530 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8531
8532 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8533 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8534 sm = g->gr.sm_to_cluster[sm_id].sm_index;
8535
8536 err = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
8537 global_mask, false);
8538 if (err) {
8539 nvgpu_err(g, "sm did not lock down!");
8540 return err;
8541 }
8542 }
8543
8544 /* Read the warp status */
8545 g->ops.gr.bpt_reg_info(g, w_state);
8546
8547 return 0;
8548}
8549
8550int gr_gk20a_resume_from_pause(struct gk20a *g)
8551{
8552 int err = 0;
8553 u32 reg_val;
8554
8555 /* Clear the pause mask to tell the GPU we want to resume everyone */
8556 gk20a_writel(g,
8557 gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(), 0);
8558
8559 /* explicitly re-enable forwarding of SM interrupts upon any resume */
8560 reg_val = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
8561 reg_val |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
8562 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), reg_val);
8563
8564 /* Now resume all sms, write a 0 to the stop trigger
8565 * then a 1 to the run trigger */
8566 g->ops.gr.resume_all_sms(g);
8567
8568 return err;
8569}
8570
8571int gr_gk20a_clear_sm_errors(struct gk20a *g)
8572{
8573 int ret = 0;
8574 u32 gpc, tpc, sm;
8575 struct gr_gk20a *gr = &g->gr;
8576 u32 global_esr;
8577 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8578
8579 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8580
8581 /* check if any tpc has an exception */
8582 for (tpc = 0; tpc < gr->tpc_count; tpc++) {
8583
8584 for (sm = 0; sm < sm_per_tpc; sm++) {
8585 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8586 gpc, tpc, sm);
8587
8588 /* clearing hwws, also causes tpc and gpc
8589 * exceptions to be cleared
8590 */
8591 g->ops.gr.clear_sm_hww(g,
8592 gpc, tpc, sm, global_esr);
8593 }
8594 }
8595 }
8596
8597 return ret;
8598}
8599
8600u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g)
8601{
8602 struct gr_gk20a *gr = &g->gr;
8603 u32 sm_id, tpc_exception_en = 0;
8604 u32 offset, regval, tpc_offset, gpc_offset;
8605 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8606 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8607
8608 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8609
8610 tpc_offset = tpc_in_gpc_stride * g->gr.sm_to_cluster[sm_id].tpc_index;
8611 gpc_offset = gpc_stride * g->gr.sm_to_cluster[sm_id].gpc_index;
8612 offset = tpc_offset + gpc_offset;
8613
8614 regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
8615 offset);
8616 		/* Each bit represents the enable state of the corresponding SM; bit 0 corresponds to SM0 */
8617 tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id;
8618 }
8619
8620 return tpc_exception_en;
8621}
8622
8623u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
8624{
8625 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8626 u32 hww_warp_esr = gk20a_readl(g,
8627 gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
8628 return hww_warp_esr;
8629}
8630
8631u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
8632{
8633 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8634
8635 u32 hww_global_esr = gk20a_readl(g,
8636 gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
8637
8638 return hww_global_esr;
8639}
8640
8641u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g)
8642{
8643 /*
8644 * These three interrupts don't require locking down the SM. They can
8645 * be handled by usermode clients as they aren't fatal. Additionally,
8646 * usermode clients may wish to allow some warps to execute while others
8647 * are at breakpoints, as opposed to fatal errors where all warps should
8648 * halt.
8649 */
8650 u32 global_esr_mask =
8651 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
8652 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
8653 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
8654
8655 return global_esr_mask;
8656}