path: root/include/gk20a/gr_gk20a.c
author	Joshua Bakita <bakitajoshua@gmail.com>	2023-06-28 18:24:25 -0400
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-06-28 18:24:25 -0400
commit	01e6fac4d61fdd7fff5433942ec93fc2ea1e4df1 (patch)
tree	4ef34501728a087be24f4ba0af90f91486bf780b /include/gk20a/gr_gk20a.c
parent	306a03d18b305e4e573be3b2931978fa10679eb9 (diff)
Include nvgpu headers
These are needed to build on NVIDIA's Jetson boards for the time being. Only a couple structs are required, so it should be fairly easy to remove this dependency at some point in the future.
Diffstat (limited to 'include/gk20a/gr_gk20a.c')
-rw-r--r--	include/gk20a/gr_gk20a.c	8998
1 file changed, 8998 insertions, 0 deletions
diff --git a/include/gk20a/gr_gk20a.c b/include/gk20a/gr_gk20a.c
new file mode 100644
index 0000000..7bcf528
--- /dev/null
+++ b/include/gk20a/gr_gk20a.c
@@ -0,0 +1,8998 @@
1/*
2 * GK20A Graphics
3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/dma.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/gmmu.h>
28#include <nvgpu/timers.h>
29#include <nvgpu/nvgpu_common.h>
30#include <nvgpu/log.h>
31#include <nvgpu/bsearch.h>
32#include <nvgpu/sort.h>
33#include <nvgpu/bug.h>
34#include <nvgpu/firmware.h>
35#include <nvgpu/enabled.h>
36#include <nvgpu/debug.h>
37#include <nvgpu/barrier.h>
38#include <nvgpu/mm.h>
39#include <nvgpu/ctxsw_trace.h>
40#include <nvgpu/error_notifier.h>
41#include <nvgpu/ecc.h>
42#include <nvgpu/io.h>
43#include <nvgpu/utils.h>
44#include <nvgpu/channel.h>
45#include <nvgpu/unit.h>
46#include <nvgpu/power_features/pg.h>
47#include <nvgpu/power_features/cg.h>
48
49#include "gk20a.h"
50#include "gr_gk20a.h"
51#include "gk20a/fecs_trace_gk20a.h"
52#include "gr_ctx_gk20a.h"
53#include "gr_pri_gk20a.h"
54#include "regops_gk20a.h"
55#include "dbg_gpu_gk20a.h"
56
57#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
58#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
59#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
60#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
61#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
62#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
63#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
64#include <nvgpu/hw/gk20a/hw_pri_ringmaster_gk20a.h>
65#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
66#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
67
68#define BLK_SIZE (256)
69#define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200
70#define NV_PERF_PMMGPCROUTER_STRIDE 0x0200
71#define NV_PCFG_BASE 0x00088000
72#define NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE 0x0020
73#define FE_PWR_MODE_TIMEOUT_MAX 2000
74#define FE_PWR_MODE_TIMEOUT_DEFAULT 10
75#define CTXSW_MEM_SCRUBBING_TIMEOUT_MAX 1000
76#define CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT 10
77#define FECS_ARB_CMD_TIMEOUT_MAX 40
78#define FECS_ARB_CMD_TIMEOUT_DEFAULT 2
79
80static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
81
82static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g,
83 struct vm_gk20a *vm,
84 struct nvgpu_gr_ctx *gr_ctx);
85
86/* channel patch ctx buffer */
87static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
88 struct channel_gk20a *c);
89static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g,
90 struct vm_gk20a *vm,
91 struct nvgpu_gr_ctx *gr_ctx);
92
93/* golden ctx image */
94static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
95 struct channel_gk20a *c);
96
97int gr_gk20a_get_ctx_id(struct gk20a *g,
98 struct channel_gk20a *c,
99 u32 *ctx_id)
100{
101 struct tsg_gk20a *tsg;
102 struct nvgpu_gr_ctx *gr_ctx = NULL;
103 struct nvgpu_mem *mem = NULL;
104
105 tsg = tsg_gk20a_from_ch(c);
106 if (tsg == NULL) {
107 return -EINVAL;
108 }
109
110 gr_ctx = &tsg->gr_ctx;
111 mem = &gr_ctx->mem;
112
113 /* Channel gr_ctx buffer is gpu cacheable.
114 Flush and invalidate before cpu update. */
115 g->ops.mm.l2_flush(g, true);
116
117 *ctx_id = nvgpu_mem_rd(g, mem,
118 ctxsw_prog_main_image_context_id_o());
119 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "ctx_id: 0x%x", *ctx_id);
120
121 return 0;
122}
123
124void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
125{
126 unsigned int i;
127
128 nvgpu_err(g, "gr_fecs_os_r : %d",
129 gk20a_readl(g, gr_fecs_os_r()));
130 nvgpu_err(g, "gr_fecs_cpuctl_r : 0x%x",
131 gk20a_readl(g, gr_fecs_cpuctl_r()));
132 nvgpu_err(g, "gr_fecs_idlestate_r : 0x%x",
133 gk20a_readl(g, gr_fecs_idlestate_r()));
134 nvgpu_err(g, "gr_fecs_mailbox0_r : 0x%x",
135 gk20a_readl(g, gr_fecs_mailbox0_r()));
136 nvgpu_err(g, "gr_fecs_mailbox1_r : 0x%x",
137 gk20a_readl(g, gr_fecs_mailbox1_r()));
138 nvgpu_err(g, "gr_fecs_irqstat_r : 0x%x",
139 gk20a_readl(g, gr_fecs_irqstat_r()));
140 nvgpu_err(g, "gr_fecs_irqmode_r : 0x%x",
141 gk20a_readl(g, gr_fecs_irqmode_r()));
142 nvgpu_err(g, "gr_fecs_irqmask_r : 0x%x",
143 gk20a_readl(g, gr_fecs_irqmask_r()));
144 nvgpu_err(g, "gr_fecs_irqdest_r : 0x%x",
145 gk20a_readl(g, gr_fecs_irqdest_r()));
146 nvgpu_err(g, "gr_fecs_debug1_r : 0x%x",
147 gk20a_readl(g, gr_fecs_debug1_r()));
148 nvgpu_err(g, "gr_fecs_debuginfo_r : 0x%x",
149 gk20a_readl(g, gr_fecs_debuginfo_r()));
150 nvgpu_err(g, "gr_fecs_ctxsw_status_1_r : 0x%x",
151 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
152
153 for (i = 0; i < g->ops.gr.fecs_ctxsw_mailbox_size(); i++) {
154 nvgpu_err(g, "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
155 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
156 }
157
158 nvgpu_err(g, "gr_fecs_engctl_r : 0x%x",
159 gk20a_readl(g, gr_fecs_engctl_r()));
160 nvgpu_err(g, "gr_fecs_curctx_r : 0x%x",
161 gk20a_readl(g, gr_fecs_curctx_r()));
162 nvgpu_err(g, "gr_fecs_nxtctx_r : 0x%x",
163 gk20a_readl(g, gr_fecs_nxtctx_r()));
164
165 gk20a_writel(g, gr_fecs_icd_cmd_r(),
166 gr_fecs_icd_cmd_opc_rreg_f() |
167 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
168 nvgpu_err(g, "FECS_FALCON_REG_IMB : 0x%x",
169 gk20a_readl(g, gr_fecs_icd_rdata_r()));
170
171 gk20a_writel(g, gr_fecs_icd_cmd_r(),
172 gr_fecs_icd_cmd_opc_rreg_f() |
173 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
174 nvgpu_err(g, "FECS_FALCON_REG_DMB : 0x%x",
175 gk20a_readl(g, gr_fecs_icd_rdata_r()));
176
177 gk20a_writel(g, gr_fecs_icd_cmd_r(),
178 gr_fecs_icd_cmd_opc_rreg_f() |
179 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
180 nvgpu_err(g, "FECS_FALCON_REG_CSW : 0x%x",
181 gk20a_readl(g, gr_fecs_icd_rdata_r()));
182
183 gk20a_writel(g, gr_fecs_icd_cmd_r(),
184 gr_fecs_icd_cmd_opc_rreg_f() |
185 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
186 nvgpu_err(g, "FECS_FALCON_REG_CTX : 0x%x",
187 gk20a_readl(g, gr_fecs_icd_rdata_r()));
188
189 gk20a_writel(g, gr_fecs_icd_cmd_r(),
190 gr_fecs_icd_cmd_opc_rreg_f() |
191 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
192 nvgpu_err(g, "FECS_FALCON_REG_EXCI : 0x%x",
193 gk20a_readl(g, gr_fecs_icd_rdata_r()));
194
195 for (i = 0; i < 4; i++) {
196 gk20a_writel(g, gr_fecs_icd_cmd_r(),
197 gr_fecs_icd_cmd_opc_rreg_f() |
198 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
199 nvgpu_err(g, "FECS_FALCON_REG_PC : 0x%x",
200 gk20a_readl(g, gr_fecs_icd_rdata_r()));
201
202 gk20a_writel(g, gr_fecs_icd_cmd_r(),
203 gr_fecs_icd_cmd_opc_rreg_f() |
204 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
205 nvgpu_err(g, "FECS_FALCON_REG_SP : 0x%x",
206 gk20a_readl(g, gr_fecs_icd_rdata_r()));
207 }
208}
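/*
 * Each ICD access above follows the same two-step sequence: write an
 * opc_rreg command that selects a falcon-internal register, then read the
 * value back from ICD RDATA. A minimal helper sketch (hypothetical; the
 * driver open-codes the sequence instead) would be:
 */
static inline u32 example_gr_fecs_icd_rreg(struct gk20a *g, u32 falcon_reg)
{
	/* select the falcon-internal register to read */
	gk20a_writel(g, gr_fecs_icd_cmd_r(),
		gr_fecs_icd_cmd_opc_rreg_f() |
		gr_fecs_icd_cmd_idx_f(falcon_reg));
	/* the selected register's value is mirrored into ICD RDATA */
	return gk20a_readl(g, gr_fecs_icd_rdata_r());
}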
209
210static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
211{
212 u32 i, ucode_u32_size;
213 const u32 *ucode_u32_data;
214 u32 checksum;
215
216 nvgpu_log_fn(g, " ");
217
218 gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
219 gr_gpccs_dmemc_blk_f(0) |
220 gr_gpccs_dmemc_aincw_f(1)));
221
222 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
223 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
224
225 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
226 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
227 checksum += ucode_u32_data[i];
228 }
229
230 gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
231 gr_fecs_dmemc_blk_f(0) |
232 gr_fecs_dmemc_aincw_f(1)));
233
234 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
235 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
236
237 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
238 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
239 checksum += ucode_u32_data[i];
240 }
241 nvgpu_log_fn(g, "done");
242}
243
244static void gr_gk20a_load_falcon_imem(struct gk20a *g)
245{
246 u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
247 const u32 *ucode_u32_data;
248 u32 tag, i, pad_start, pad_end;
249 u32 checksum;
250
251 nvgpu_log_fn(g, " ");
252
253 cfg = gk20a_readl(g, gr_fecs_cfg_r());
254 fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
255
256 cfg = gk20a_readl(g, gr_gpc0_cfg_r());
257 gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
258
259 /* Use the broadcast address to access all of the GPCCS units. */
260 gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
261 gr_gpccs_imemc_blk_f(0) |
262 gr_gpccs_imemc_aincw_f(1)));
263
264 /* Setup the tags for the instruction memory. */
265 tag = 0;
266 gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
267
268 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
269 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
270
271 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
272 if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
273 tag++;
274 gk20a_writel(g, gr_gpccs_imemt_r(0),
275 gr_gpccs_imemt_tag_f(tag));
276 }
277 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
278 checksum += ucode_u32_data[i];
279 }
280
281 pad_start = i * 4U;
282 pad_end = pad_start + (256U - pad_start % 256U) + 256U;
283 for (i = pad_start;
284 (i < gpccs_imem_size * 256U) && (i < pad_end);
285 i += 4U) {
286 if ((i != 0U) && ((i % 256U) == 0U)) {
287 tag++;
288 gk20a_writel(g, gr_gpccs_imemt_r(0),
289 gr_gpccs_imemt_tag_f(tag));
290 }
291 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
292 }
293
294 gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
295 gr_fecs_imemc_blk_f(0) |
296 gr_fecs_imemc_aincw_f(1)));
297
298 /* Setup the tags for the instruction memory. */
299 tag = 0;
300 gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
301
302 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
303 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
304
305 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
306 if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
307 tag++;
308 gk20a_writel(g, gr_fecs_imemt_r(0),
309 gr_fecs_imemt_tag_f(tag));
310 }
311 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
312 checksum += ucode_u32_data[i];
313 }
314
315 pad_start = i * 4U;
316 pad_end = pad_start + (256U - pad_start % 256U) + 256U;
317 for (i = pad_start;
318 (i < fecs_imem_size * 256U) && i < pad_end;
319 i += 4U) {
320 if ((i != 0U) && ((i % 256U) == 0U)) {
321 tag++;
322 gk20a_writel(g, gr_fecs_imemt_r(0),
323 gr_fecs_imemt_tag_f(tag));
324 }
325 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
326 }
327}
328
329int gr_gk20a_wait_idle(struct gk20a *g, unsigned long duration_ms,
330 u32 expect_delay)
331{
332 u32 delay = expect_delay;
333 bool ctxsw_active;
334 bool gr_busy;
335 u32 gr_engine_id;
336 u32 engine_status;
337 bool ctx_status_invalid;
338 struct nvgpu_timeout timeout;
339
340 nvgpu_log_fn(g, " ");
341
342 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
343
344 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
345
346 do {
347 /* fmodel: host gets fifo_engine_status(gr) from gr
348 only when gr_status is read */
349 (void) gk20a_readl(g, gr_status_r());
350
351 engine_status = gk20a_readl(g,
352 fifo_engine_status_r(gr_engine_id));
353
354 ctxsw_active = engine_status &
355 fifo_engine_status_ctxsw_in_progress_f();
356
357 ctx_status_invalid =
358 (fifo_engine_status_ctx_status_v(engine_status) ==
359 fifo_engine_status_ctx_status_invalid_v());
360
361 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
362 gr_engine_status_value_busy_f();
363
364 if (ctx_status_invalid || (!gr_busy && !ctxsw_active)) {
365 nvgpu_log_fn(g, "done");
366 return 0;
367 }
368
369 nvgpu_usleep_range(delay, delay * 2);
370 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
371
372 } while (nvgpu_timeout_expired(&timeout) == 0);
373
374 nvgpu_err(g,
375 "timeout, ctxsw busy : %d, gr busy : %d",
376 ctxsw_active, gr_busy);
377
378 return -EAGAIN;
379}
380
381int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long duration_ms,
382 u32 expect_delay)
383{
384 u32 val;
385 u32 delay = expect_delay;
386 struct nvgpu_timeout timeout;
387
388 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
389 return 0;
390 }
391
392 nvgpu_log_fn(g, " ");
393
394 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
395
396 do {
397 val = gk20a_readl(g, gr_status_r());
398
399 if (gr_status_fe_method_lower_v(val) == 0U) {
400 nvgpu_log_fn(g, "done");
401 return 0;
402 }
403
404 nvgpu_usleep_range(delay, delay * 2);
405 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
406 } while (nvgpu_timeout_expired(&timeout) == 0);
407
408 nvgpu_err(g,
409 "timeout, fe busy : %x", val);
410
411 return -EAGAIN;
412}
413
414int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
415 u32 *mailbox_ret, u32 opc_success,
416 u32 mailbox_ok, u32 opc_fail,
417 u32 mailbox_fail, bool sleepduringwait)
418{
419 struct nvgpu_timeout timeout;
420 u32 delay = GR_FECS_POLL_INTERVAL;
421 u32 check = WAIT_UCODE_LOOP;
422 u32 reg;
423
424 nvgpu_log_fn(g, " ");
425
426 if (sleepduringwait) {
427 delay = GR_IDLE_CHECK_DEFAULT;
428 }
429
430 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
431 NVGPU_TIMER_CPU_TIMER);
432
433 while (check == WAIT_UCODE_LOOP) {
434 if (nvgpu_timeout_expired(&timeout)) {
435 check = WAIT_UCODE_TIMEOUT;
436 }
437
438 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
439
440 if (mailbox_ret) {
441 *mailbox_ret = reg;
442 }
443
444 switch (opc_success) {
445 case GR_IS_UCODE_OP_EQUAL:
446 if (reg == mailbox_ok) {
447 check = WAIT_UCODE_OK;
448 }
449 break;
450 case GR_IS_UCODE_OP_NOT_EQUAL:
451 if (reg != mailbox_ok) {
452 check = WAIT_UCODE_OK;
453 }
454 break;
455 case GR_IS_UCODE_OP_AND:
456 if (reg & mailbox_ok) {
457 check = WAIT_UCODE_OK;
458 }
459 break;
460 case GR_IS_UCODE_OP_LESSER:
461 if (reg < mailbox_ok) {
462 check = WAIT_UCODE_OK;
463 }
464 break;
465 case GR_IS_UCODE_OP_LESSER_EQUAL:
466 if (reg <= mailbox_ok) {
467 check = WAIT_UCODE_OK;
468 }
469 break;
470 case GR_IS_UCODE_OP_SKIP:
471 /* do no success check */
472 break;
473 default:
474 nvgpu_err(g,
475 "invalid success opcode 0x%x", opc_success);
476
477 check = WAIT_UCODE_ERROR;
478 break;
479 }
480
481 switch (opc_fail) {
482 case GR_IS_UCODE_OP_EQUAL:
483 if (reg == mailbox_fail) {
484 check = WAIT_UCODE_ERROR;
485 }
486 break;
487 case GR_IS_UCODE_OP_NOT_EQUAL:
488 if (reg != mailbox_fail) {
489 check = WAIT_UCODE_ERROR;
490 }
491 break;
492 case GR_IS_UCODE_OP_AND:
493 if (reg & mailbox_fail) {
494 check = WAIT_UCODE_ERROR;
495 }
496 break;
497 case GR_IS_UCODE_OP_LESSER:
498 if (reg < mailbox_fail) {
499 check = WAIT_UCODE_ERROR;
500 }
501 break;
502 case GR_IS_UCODE_OP_LESSER_EQUAL:
503 if (reg <= mailbox_fail) {
504 check = WAIT_UCODE_ERROR;
505 }
506 break;
507 case GR_IS_UCODE_OP_SKIP:
508 /* do no check on fail */
509 break;
510 default:
511 nvgpu_err(g,
512 "invalid fail opcode 0x%x", opc_fail);
513 check = WAIT_UCODE_ERROR;
514 break;
515 }
516
517 if (sleepduringwait) {
518 nvgpu_usleep_range(delay, delay * 2);
519 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
520 } else {
521 nvgpu_udelay(delay);
522 }
523 }
524
525 if (check == WAIT_UCODE_TIMEOUT) {
526 nvgpu_err(g,
527 "timeout waiting on mailbox=%d value=0x%08x",
528 mailbox_id, reg);
529 gk20a_fecs_dump_falcon_stats(g);
530 gk20a_gr_debug_dump(g);
531 return -1;
532 } else if (check == WAIT_UCODE_ERROR) {
533 nvgpu_err(g,
534 "ucode method failed on mailbox=%d value=0x%08x",
535 mailbox_id, reg);
536 gk20a_fecs_dump_falcon_stats(g);
537 return -1;
538 }
539
540 nvgpu_log_fn(g, "done");
541 return 0;
542}
543
544int gr_gk20a_submit_fecs_method_op_locked(struct gk20a *g,
545 struct fecs_method_op_gk20a op,
546 bool sleepduringwait)
547{
548 int ret;
549
550 if (op.mailbox.id != 0) {
551 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
552 op.mailbox.data);
553 }
554
555 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
556 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
557
558 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
559 gk20a_writel(g, gr_fecs_method_push_r(),
560 gr_fecs_method_push_adr_f(op.method.addr));
561
562 /* op.mailbox.id == 4 cases require waiting for completion on
563 * mailbox 0, so the id is remapped below */
564 if (op.mailbox.id == 4) {
565 op.mailbox.id = 0;
566 }
567
568 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
569 op.cond.ok, op.mailbox.ok,
570 op.cond.fail, op.mailbox.fail,
571 sleepduringwait);
572 if (ret) {
573 nvgpu_err(g, "fecs method: data=0x%08x push adr=0x%08x",
574 op.method.data, op.method.addr);
575 }
576
577 return ret;
578}
579
580/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
581 * We should replace most, if not all, fecs method calls with this instead. */
582int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
583 struct fecs_method_op_gk20a op,
584 bool sleepduringwait)
585{
586 struct gr_gk20a *gr = &g->gr;
587 int ret;
588
589 nvgpu_mutex_acquire(&gr->fecs_mutex);
590
591 ret = gr_gk20a_submit_fecs_method_op_locked(g, op, sleepduringwait);
592
593 nvgpu_mutex_release(&gr->fecs_mutex);
594
595 return ret;
596}
597
598/* Sideband mailbox writes are done a bit differently */
599int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g,
600 struct fecs_method_op_gk20a op)
601{
602 struct gr_gk20a *gr = &g->gr;
603 int ret;
604
605 nvgpu_mutex_acquire(&gr->fecs_mutex);
606
607 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(op.mailbox.id),
608 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
609
610 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
611 gk20a_writel(g, gr_fecs_method_push_r(),
612 gr_fecs_method_push_adr_f(op.method.addr));
613
614 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
615 op.cond.ok, op.mailbox.ok,
616 op.cond.fail, op.mailbox.fail,
617 false);
618 if (ret) {
619 nvgpu_err(g, "fecs method: data=0x%08x push adr=0x%08x",
620 op.method.data, op.method.addr);
621 }
622
623 nvgpu_mutex_release(&gr->fecs_mutex);
624
625 return ret;
626}
627
628static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
629{
630 return gr_gk20a_submit_fecs_method_op(g,
631 (struct fecs_method_op_gk20a) {
632 .method.addr = fecs_method,
633 .method.data = ~0,
634 .mailbox = { .id = 1, /*sideband?*/
635 .data = ~0, .clr = ~0, .ret = ret,
636 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
637 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
638 .cond.ok = GR_IS_UCODE_OP_EQUAL,
639 .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
640}
641
642/**
643 * Stop processing (stall) context switches at FECS.
644 * If FECS is sent the stop_ctxsw method, ELPG entry/exit cannot happen
645 * and may time out. This can manifest as different error signatures
646 * depending on when the stop_ctxsw method is sent with respect to the
647 * PMU ELPG sequence: it could appear as a PMU halt, an abort, or
648 * possibly an external error.
649 */
650int gr_gk20a_disable_ctxsw(struct gk20a *g)
651{
652 int err = 0;
653
654 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
655
656 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
657 g->ctxsw_disable_count++;
658 if (g->ctxsw_disable_count == 1) {
659 err = nvgpu_pg_elpg_disable(g);
660 if (err != 0) {
661 nvgpu_err(g, "failed to disable elpg. not safe to "
662 "stop_ctxsw");
663 /* stop ctxsw command is not sent */
664 g->ctxsw_disable_count--;
665 } else {
666 err = gr_gk20a_ctrl_ctxsw(g,
667 gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
668 if (err != 0) {
669 nvgpu_err(g, "failed to stop fecs ctxsw");
670 /* stop ctxsw failed */
671 g->ctxsw_disable_count--;
672 }
673 }
674 } else {
675 nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
676 g->ctxsw_disable_count);
677 }
678 nvgpu_mutex_release(&g->ctxsw_disable_lock);
679
680 return err;
681}
682
683/* Start processing (continue) context switches at FECS */
684int gr_gk20a_enable_ctxsw(struct gk20a *g)
685{
686 int err = 0;
687
688 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
689
690 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
691
692 if (g->ctxsw_disable_count == 0) {
693 goto ctxsw_already_enabled;
694 }
695 g->ctxsw_disable_count--;
696 WARN_ON(g->ctxsw_disable_count < 0);
697 if (g->ctxsw_disable_count == 0) {
698 err = gr_gk20a_ctrl_ctxsw(g,
699 gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
700 if (err != 0) {
701 nvgpu_err(g, "failed to start fecs ctxsw");
702 } else {
703 if (nvgpu_pg_elpg_enable(g) != 0) {
704 nvgpu_err(g, "failed to enable elpg "
705 "after start_ctxsw");
706 }
707 }
708 } else {
709 nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
710 g->ctxsw_disable_count);
711 }
712ctxsw_already_enabled:
713 nvgpu_mutex_release(&g->ctxsw_disable_lock);
714
715 return err;
716}
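/*
 * gr_gk20a_disable_ctxsw()/gr_gk20a_enable_ctxsw() are reference counted
 * via ctxsw_disable_count and are meant to be used in matched pairs, e.g.
 * (usage sketch, error handling at the caller's discretion):
 *
 *	err = gr_gk20a_disable_ctxsw(g);
 *	if (err != 0) {
 *		return err;
 *	}
 *	... inspect engine state with context switching stalled ...
 *	err = gr_gk20a_enable_ctxsw(g);
 */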
717
718int gr_gk20a_halt_pipe(struct gk20a *g)
719{
720 return gr_gk20a_submit_fecs_method_op(g,
721 (struct fecs_method_op_gk20a) {
722 .method.addr =
723 gr_fecs_method_push_adr_halt_pipeline_v(),
724 .method.data = ~0,
725 .mailbox = { .id = 1, /*sideband?*/
726 .data = ~0, .clr = ~0, .ret = NULL,
727 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
728 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
729 .cond.ok = GR_IS_UCODE_OP_EQUAL,
730 .cond.fail = GR_IS_UCODE_OP_EQUAL }, false);
731}
732
733
734int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
735{
736 u32 addr_lo;
737 u32 addr_hi;
738
739 nvgpu_log_fn(c->g, " ");
740
741 addr_lo = u64_lo32(gpu_va) >> 12;
742 addr_hi = u64_hi32(gpu_va);
743
744 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(),
745 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
746 ram_in_gr_wfi_ptr_lo_f(addr_lo));
747
748 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(),
749 ram_in_gr_wfi_ptr_hi_f(addr_hi));
750
751 return 0;
752}
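/*
 * Worked example of the address split above: for gpu_va = 0x123456000ULL,
 * addr_lo = u64_lo32(gpu_va) >> 12 = 0x23456 and addr_hi = u64_hi32(gpu_va)
 * = 0x1, i.e. the 4 KB-aligned low bits and the upper 32 bits are written
 * to the instance block separately.
 */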
753
754/*
755 * Context state can be written directly, or "patched" at times. So that code
756 * can be used in either situation it is written using a series of
757 * _ctx_patch_write(..., patch) statements. However any necessary map overhead
758 * should be minimized; thus, bundle the sequence of these writes together, and
759 * set them up and close with _ctx_patch_write_begin/_ctx_patch_write_end.
760 */
761
762int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
763 struct nvgpu_gr_ctx *gr_ctx,
764 bool update_patch_count)
765{
766 if (update_patch_count) {
767 /* reset patch count if ucode has already processed it */
768 gr_ctx->patch_ctx.data_count = nvgpu_mem_rd(g,
769 &gr_ctx->mem,
770 ctxsw_prog_main_image_patch_count_o());
771 nvgpu_log(g, gpu_dbg_info, "patch count reset to %d",
772 gr_ctx->patch_ctx.data_count);
773 }
774 return 0;
775}
776
777void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
778 struct nvgpu_gr_ctx *gr_ctx,
779 bool update_patch_count)
780{
781 /* Write context count to context image if it is mapped */
782 if (update_patch_count) {
783 nvgpu_mem_wr(g, &gr_ctx->mem,
784 ctxsw_prog_main_image_patch_count_o(),
785 gr_ctx->patch_ctx.data_count);
786 nvgpu_log(g, gpu_dbg_info, "write patch count %d",
787 gr_ctx->patch_ctx.data_count);
788 }
789}
790
791void gr_gk20a_ctx_patch_write(struct gk20a *g,
792 struct nvgpu_gr_ctx *gr_ctx,
793 u32 addr, u32 data, bool patch)
794{
795 if (patch) {
796 u32 patch_slot = gr_ctx->patch_ctx.data_count *
797 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
798 if (patch_slot > (PATCH_CTX_ENTRIES_FROM_SIZE(
799 gr_ctx->patch_ctx.mem.size) -
800 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY)) {
801 nvgpu_err(g, "failed to access patch_slot %d",
802 patch_slot);
803 return;
804 }
805 nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot, addr);
806 nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot + 1, data);
807 gr_ctx->patch_ctx.data_count++;
808 nvgpu_log(g, gpu_dbg_info,
809 "patch addr = 0x%x data = 0x%x data_count %d",
810 addr, data, gr_ctx->patch_ctx.data_count);
811 } else {
812 gk20a_writel(g, addr, data);
813 }
814}
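/*
 * A minimal sketch of the bundled patch-write sequence described in the
 * comment above gr_gk20a_ctx_patch_write_begin() (hypothetical caller;
 * "addr"/"data" stand in for a real register/value pair):
 */
static inline void example_patch_one_reg(struct gk20a *g,
		struct nvgpu_gr_ctx *gr_ctx, u32 addr, u32 data)
{
	if (gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false) != 0) {
		return;
	}
	/* with patch == true the write lands in the patch buffer, not PRI */
	gr_gk20a_ctx_patch_write(g, gr_ctx, addr, data, true);
	gr_gk20a_ctx_patch_write_end(g, gr_ctx, false);
}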
815
816static u32 fecs_current_ctx_data(struct gk20a *g, struct nvgpu_mem *inst_block)
817{
818 u64 ptr = nvgpu_inst_block_addr(g, inst_block) >>
819 ram_in_base_shift_v();
820 u32 aperture = nvgpu_aperture_mask(g, inst_block,
821 gr_fecs_current_ctx_target_sys_mem_ncoh_f(),
822 gr_fecs_current_ctx_target_sys_mem_coh_f(),
823 gr_fecs_current_ctx_target_vid_mem_f());
824
825 return gr_fecs_current_ctx_ptr_f(u64_lo32(ptr)) | aperture |
826 gr_fecs_current_ctx_valid_f(1);
827}
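/*
 * The value built above packs three things into one register write: the
 * instance-block address shifted right by ram_in_base_shift_v(), the
 * aperture (vidmem vs. coherent/non-coherent sysmem) and the "valid" bit,
 * matching the layout expected by gr_fecs_current_ctx and the bind-pointer
 * method below.
 */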
828
829int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
830 struct channel_gk20a *c)
831{
832 u32 inst_base_ptr = u64_lo32(nvgpu_inst_block_addr(g, &c->inst_block)
833 >> ram_in_base_shift_v());
834 u32 data = fecs_current_ctx_data(g, &c->inst_block);
835 u32 ret;
836
837 nvgpu_log_info(g, "bind channel %d inst ptr 0x%08x",
838 c->chid, inst_base_ptr);
839
840 ret = gr_gk20a_submit_fecs_method_op(g,
841 (struct fecs_method_op_gk20a) {
842 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
843 .method.data = data,
844 .mailbox = { .id = 0, .data = 0,
845 .clr = 0x30,
846 .ret = NULL,
847 .ok = 0x10,
848 .fail = 0x20, },
849 .cond.ok = GR_IS_UCODE_OP_AND,
850 .cond.fail = GR_IS_UCODE_OP_AND}, true);
851 if (ret) {
852 nvgpu_err(g,
853 "bind channel instance failed");
854 }
855
856 return ret;
857}
858
859void gr_gk20a_write_zcull_ptr(struct gk20a *g,
860 struct nvgpu_mem *mem, u64 gpu_va)
861{
862 u32 va = u64_lo32(gpu_va >> 8);
863
864 nvgpu_mem_wr(g, mem,
865 ctxsw_prog_main_image_zcull_ptr_o(), va);
866}
867
868void gr_gk20a_write_pm_ptr(struct gk20a *g,
869 struct nvgpu_mem *mem, u64 gpu_va)
870{
871 u32 va = u64_lo32(gpu_va >> 8);
872
873 nvgpu_mem_wr(g, mem,
874 ctxsw_prog_main_image_pm_ptr_o(), va);
875}
876
877static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c)
878{
879 struct tsg_gk20a *tsg;
880 struct nvgpu_gr_ctx *gr_ctx = NULL;
881 struct nvgpu_mem *mem = NULL;
882 struct nvgpu_mem *ctxheader = &c->ctx_header;
883 int ret = 0;
884
885 nvgpu_log_fn(g, " ");
886
887 tsg = tsg_gk20a_from_ch(c);
888 if (tsg == NULL) {
889 return -EINVAL;
890 }
891
892 gr_ctx = &tsg->gr_ctx;
893 mem = &gr_ctx->mem;
894
895 if (gr_ctx->zcull_ctx.gpu_va == 0 &&
896 gr_ctx->zcull_ctx.ctx_sw_mode ==
897 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
898 return -EINVAL;
899 }
900
901 ret = gk20a_disable_channel_tsg(g, c);
902 if (ret) {
903 nvgpu_err(g, "failed to disable channel/TSG");
904 return ret;
905 }
906 ret = gk20a_fifo_preempt(g, c);
907 if (ret) {
908 gk20a_enable_channel_tsg(g, c);
909 nvgpu_err(g, "failed to preempt channel/TSG");
910 return ret;
911 }
912
913 nvgpu_mem_wr(g, mem,
914 ctxsw_prog_main_image_zcull_o(),
915 gr_ctx->zcull_ctx.ctx_sw_mode);
916
917 if (ctxheader->gpu_va) {
918 g->ops.gr.write_zcull_ptr(g, ctxheader,
919 gr_ctx->zcull_ctx.gpu_va);
920 } else {
921 g->ops.gr.write_zcull_ptr(g, mem, gr_ctx->zcull_ctx.gpu_va);
922 }
923
924 gk20a_enable_channel_tsg(g, c);
925
926 return ret;
927}
928
929u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc)
930{
931 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
932 u32 gpc_offset = gpc_stride * gpc;
933
934 return gpc_offset;
935}
936
937u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc)
938{
939 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
940 GPU_LIT_TPC_IN_GPC_STRIDE);
941 u32 tpc_offset = tpc_in_gpc_stride * tpc;
942
943 return tpc_offset;
944}
945
946int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
947 struct channel_gk20a *c, bool patch)
948{
949 struct gr_gk20a *gr = &g->gr;
950 struct tsg_gk20a *tsg;
951 struct nvgpu_gr_ctx *gr_ctx = NULL;
952 u64 addr;
953 u32 size;
954
955 nvgpu_log_fn(g, " ");
956
957 tsg = tsg_gk20a_from_ch(c);
958 if (tsg == NULL) {
959 return -EINVAL;
960 }
961
962 gr_ctx = &tsg->gr_ctx;
963 if (patch) {
964 int err;
965 err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);
966 if (err != 0) {
967 return err;
968 }
969 }
970
971 /* global pagepool buffer */
972 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
973 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
974 (u64_hi32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
975 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
976
977 size = gr->global_ctx_buffer[PAGEPOOL].mem.size /
978 gr_scc_pagepool_total_pages_byte_granularity_v();
979
980 if (size == g->ops.gr.pagepool_default_size(g)) {
981 size = gr_scc_pagepool_total_pages_hwmax_v();
982 }
983
984 nvgpu_log_info(g, "pagepool buffer addr : 0x%016llx, size : %d",
985 addr, size);
986
987 g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, patch);
988
989 /* global bundle cb */
990 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
991 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
992 (u64_hi32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
993 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
994
995 size = gr->bundle_cb_default_size;
996
997 nvgpu_log_info(g, "bundle cb addr : 0x%016llx, size : %d",
998 addr, size);
999
1000 g->ops.gr.commit_global_bundle_cb(g, gr_ctx, addr, size, patch);
1001
1002 /* global attrib cb */
1003 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
1004 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
1005 (u64_hi32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
1006 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
1007
1008 nvgpu_log_info(g, "attrib cb addr : 0x%016llx", addr);
1009 g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, patch);
1010 g->ops.gr.commit_global_cb_manager(g, c, patch);
1011
1012 if (patch) {
1013 gr_gk20a_ctx_patch_write_end(g, gr_ctx, false);
1014 }
1015
1016 return 0;
1017}
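/*
 * Worked example of the base-address packing used above: with
 * gr_scc_pagepool_base_addr_39_8_align_bits_v() == 8, a pagepool VA of
 * 0x123456700ULL is stored as (0x23456700 >> 8) | (0x1 << 24) = 0x01234567,
 * i.e. a 40-bit, 256-byte-aligned address folded into 32 bits.
 */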
1018
1019int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c)
1020{
1021 struct gr_gk20a *gr = &g->gr;
1022 struct nvgpu_gr_ctx *gr_ctx = NULL;
1023 u32 gpm_pd_cfg;
1024 u32 pd_ab_dist_cfg0;
1025 u32 ds_debug;
1026 u32 mpc_vtg_debug;
1027 u32 pe_vaf;
1028 u32 pe_vsc_vpc;
1029
1030 nvgpu_log_fn(g, " ");
1031
1032 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1033 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1034 ds_debug = gk20a_readl(g, gr_ds_debug_r());
1035 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1036
1037 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1038 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1039 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1040
1041 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1042 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1043 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1044 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1045 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1046 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1047
1048 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
1049 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false);
1050 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false);
1051 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
1052 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false);
1053 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
1054 } else {
1055 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1056 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1057 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1058 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1059
1060 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
1061 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
1062 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false);
1063 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
1064 }
1065
1066 return 0;
1067}
1068
1069/*
1070 * Return map tiles count for given index
1071 * Return 0 if index is out-of-bounds
1072 */
1073static u32 gr_gk20a_get_map_tile_count(struct gr_gk20a *gr, u32 index)
1074{
1075 if (index >= gr->map_tile_count) {
1076 return 0;
1077 }
1078
1079 return gr->map_tiles[index];
1080}
1081
1082int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1083{
1084 u32 norm_entries, norm_shift;
1085 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1086 u32 map0, map1, map2, map3, map4, map5;
1087
1088 if (gr->map_tiles == NULL) {
1089 return -1;
1090 }
1091
1092 nvgpu_log_fn(g, " ");
1093
1094 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1095 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1096 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1097
1098 map0 = gr_crstr_gpc_map0_tile0_f(gr_gk20a_get_map_tile_count(gr, 0)) |
1099 gr_crstr_gpc_map0_tile1_f(gr_gk20a_get_map_tile_count(gr, 1)) |
1100 gr_crstr_gpc_map0_tile2_f(gr_gk20a_get_map_tile_count(gr, 2)) |
1101 gr_crstr_gpc_map0_tile3_f(gr_gk20a_get_map_tile_count(gr, 3)) |
1102 gr_crstr_gpc_map0_tile4_f(gr_gk20a_get_map_tile_count(gr, 4)) |
1103 gr_crstr_gpc_map0_tile5_f(gr_gk20a_get_map_tile_count(gr, 5));
1104
1105 map1 = gr_crstr_gpc_map1_tile6_f(gr_gk20a_get_map_tile_count(gr, 6)) |
1106 gr_crstr_gpc_map1_tile7_f(gr_gk20a_get_map_tile_count(gr, 7)) |
1107 gr_crstr_gpc_map1_tile8_f(gr_gk20a_get_map_tile_count(gr, 8)) |
1108 gr_crstr_gpc_map1_tile9_f(gr_gk20a_get_map_tile_count(gr, 9)) |
1109 gr_crstr_gpc_map1_tile10_f(gr_gk20a_get_map_tile_count(gr, 10)) |
1110 gr_crstr_gpc_map1_tile11_f(gr_gk20a_get_map_tile_count(gr, 11));
1111
1112 map2 = gr_crstr_gpc_map2_tile12_f(gr_gk20a_get_map_tile_count(gr, 12)) |
1113 gr_crstr_gpc_map2_tile13_f(gr_gk20a_get_map_tile_count(gr, 13)) |
1114 gr_crstr_gpc_map2_tile14_f(gr_gk20a_get_map_tile_count(gr, 14)) |
1115 gr_crstr_gpc_map2_tile15_f(gr_gk20a_get_map_tile_count(gr, 15)) |
1116 gr_crstr_gpc_map2_tile16_f(gr_gk20a_get_map_tile_count(gr, 16)) |
1117 gr_crstr_gpc_map2_tile17_f(gr_gk20a_get_map_tile_count(gr, 17));
1118
1119 map3 = gr_crstr_gpc_map3_tile18_f(gr_gk20a_get_map_tile_count(gr, 18)) |
1120 gr_crstr_gpc_map3_tile19_f(gr_gk20a_get_map_tile_count(gr, 19)) |
1121 gr_crstr_gpc_map3_tile20_f(gr_gk20a_get_map_tile_count(gr, 20)) |
1122 gr_crstr_gpc_map3_tile21_f(gr_gk20a_get_map_tile_count(gr, 21)) |
1123 gr_crstr_gpc_map3_tile22_f(gr_gk20a_get_map_tile_count(gr, 22)) |
1124 gr_crstr_gpc_map3_tile23_f(gr_gk20a_get_map_tile_count(gr, 23));
1125
1126 map4 = gr_crstr_gpc_map4_tile24_f(gr_gk20a_get_map_tile_count(gr, 24)) |
1127 gr_crstr_gpc_map4_tile25_f(gr_gk20a_get_map_tile_count(gr, 25)) |
1128 gr_crstr_gpc_map4_tile26_f(gr_gk20a_get_map_tile_count(gr, 26)) |
1129 gr_crstr_gpc_map4_tile27_f(gr_gk20a_get_map_tile_count(gr, 27)) |
1130 gr_crstr_gpc_map4_tile28_f(gr_gk20a_get_map_tile_count(gr, 28)) |
1131 gr_crstr_gpc_map4_tile29_f(gr_gk20a_get_map_tile_count(gr, 29));
1132
1133 map5 = gr_crstr_gpc_map5_tile30_f(gr_gk20a_get_map_tile_count(gr, 30)) |
1134 gr_crstr_gpc_map5_tile31_f(gr_gk20a_get_map_tile_count(gr, 31)) |
1135 gr_crstr_gpc_map5_tile32_f(0) |
1136 gr_crstr_gpc_map5_tile33_f(0) |
1137 gr_crstr_gpc_map5_tile34_f(0) |
1138 gr_crstr_gpc_map5_tile35_f(0);
1139
1140 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1141 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1142 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1143 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1144 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1145 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1146
1147 switch (gr->tpc_count) {
1148 case 1:
1149 norm_shift = 4;
1150 break;
1151 case 2:
1152 case 3:
1153 norm_shift = 3;
1154 break;
1155 case 4:
1156 case 5:
1157 case 6:
1158 case 7:
1159 norm_shift = 2;
1160 break;
1161 case 8:
1162 case 9:
1163 case 10:
1164 case 11:
1165 case 12:
1166 case 13:
1167 case 14:
1168 case 15:
1169 norm_shift = 1;
1170 break;
1171 default:
1172 norm_shift = 0;
1173 break;
1174 }
1175
1176 norm_entries = gr->tpc_count << norm_shift;
1177 coeff5_mod = (1 << 5) % norm_entries;
1178 coeff6_mod = (1 << 6) % norm_entries;
1179 coeff7_mod = (1 << 7) % norm_entries;
1180 coeff8_mod = (1 << 8) % norm_entries;
1181 coeff9_mod = (1 << 9) % norm_entries;
1182 coeff10_mod = (1 << 10) % norm_entries;
1183 coeff11_mod = (1 << 11) % norm_entries;
1184
1185 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1186 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1187 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1188 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1189 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1190 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1191
1192 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1193 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1194 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1195 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1196 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1197 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1198 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1199
1200 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1201 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1202 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1203 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1204 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1205 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1206
1207 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1208 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1209 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1210
1211 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1212 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1213 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1214 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1215 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1216 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1217
1218 return 0;
1219}
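/*
 * Worked example of the normalization above: with tpc_count == 3,
 * norm_shift == 3, so norm_entries = 3 << 3 = 24 and each coefficient is a
 * power of two reduced mod 24: coeff5_mod = 32 % 24 = 8,
 * coeff6_mod = 64 % 24 = 16, coeff7_mod = 128 % 24 = 8, and so on,
 * alternating between 8 and 16 for the remaining terms.
 */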
1220
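/*
 * count_bits() uses Kernighan's method: each "temp &= temp - 1" clears the
 * lowest set bit, so e.g. 0xb -> 0xa -> 0x8 -> 0x0 gives a count of 3.
 */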
1221static inline u32 count_bits(u32 mask)
1222{
1223 u32 temp = mask;
1224 u32 count;
1225 for (count = 0; temp != 0; count++) {
1226 temp &= temp - 1;
1227 }
1228
1229 return count;
1230}
1231
1232int gr_gk20a_init_sm_id_table(struct gk20a *g)
1233{
1234 u32 gpc, tpc;
1235 u32 sm_id = 0;
1236
1237 for (tpc = 0; tpc < g->gr.max_tpc_per_gpc_count; tpc++) {
1238 for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
1239
1240 if (tpc < g->gr.gpc_tpc_count[gpc]) {
1241 g->gr.sm_to_cluster[sm_id].tpc_index = tpc;
1242 g->gr.sm_to_cluster[sm_id].gpc_index = gpc;
1243 g->gr.sm_to_cluster[sm_id].sm_index = 0;
1244 g->gr.sm_to_cluster[sm_id].global_tpc_index =
1245 sm_id;
1246 sm_id++;
1247 }
1248 }
1249 }
1250 g->gr.no_of_sm = sm_id;
1251 return 0;
1252}
1253
1254/*
1255 * Return number of TPCs in a GPC
1256 * Return 0 if GPC index is invalid i.e. GPC is disabled
1257 */
1258u32 gr_gk20a_get_tpc_count(struct gr_gk20a *gr, u32 gpc_index)
1259{
1260 if (gpc_index >= gr->gpc_count) {
1261 return 0;
1262 }
1263
1264 return gr->gpc_tpc_count[gpc_index];
1265}
1266
1267int gr_gk20a_init_fs_state(struct gk20a *g)
1268{
1269 struct gr_gk20a *gr = &g->gr;
1270 u32 tpc_index, gpc_index;
1271 u32 sm_id = 0, gpc_id = 0;
1272 u32 tpc_per_gpc;
1273 u32 fuse_tpc_mask;
1274 u32 reg_index;
1275 int err;
1276
1277 nvgpu_log_fn(g, " ");
1278
1279 if (g->ops.gr.init_sm_id_table) {
1280 err = g->ops.gr.init_sm_id_table(g);
1281 if (err != 0) {
1282 return err;
1283 }
1284
1285 /* Is table empty ? */
1286 if (g->gr.no_of_sm == 0) {
1287 return -EINVAL;
1288 }
1289 }
1290
1291 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
1292 tpc_index = g->gr.sm_to_cluster[sm_id].tpc_index;
1293 gpc_index = g->gr.sm_to_cluster[sm_id].gpc_index;
1294
1295 g->ops.gr.program_sm_id_numbering(g, gpc_index, tpc_index, sm_id);
1296
1297 if (g->ops.gr.program_active_tpc_counts) {
1298 g->ops.gr.program_active_tpc_counts(g, gpc_index);
1299 }
1300 }
1301
1302 for (reg_index = 0, gpc_id = 0;
1303 reg_index < gr_pd_num_tpc_per_gpc__size_1_v();
1304 reg_index++, gpc_id += 8) {
1305
1306 tpc_per_gpc =
1307 gr_pd_num_tpc_per_gpc_count0_f(gr_gk20a_get_tpc_count(gr, gpc_id + 0)) |
1308 gr_pd_num_tpc_per_gpc_count1_f(gr_gk20a_get_tpc_count(gr, gpc_id + 1)) |
1309 gr_pd_num_tpc_per_gpc_count2_f(gr_gk20a_get_tpc_count(gr, gpc_id + 2)) |
1310 gr_pd_num_tpc_per_gpc_count3_f(gr_gk20a_get_tpc_count(gr, gpc_id + 3)) |
1311 gr_pd_num_tpc_per_gpc_count4_f(gr_gk20a_get_tpc_count(gr, gpc_id + 4)) |
1312 gr_pd_num_tpc_per_gpc_count5_f(gr_gk20a_get_tpc_count(gr, gpc_id + 5)) |
1313 gr_pd_num_tpc_per_gpc_count6_f(gr_gk20a_get_tpc_count(gr, gpc_id + 6)) |
1314 gr_pd_num_tpc_per_gpc_count7_f(gr_gk20a_get_tpc_count(gr, gpc_id + 7));
1315
1316 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1317 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1318 }
1319
1320 /* gr__setup_pd_mapping stubbed for gk20a */
1321 g->ops.gr.setup_rop_mapping(g, gr);
1322 if (g->ops.gr.setup_alpha_beta_tables) {
1323 g->ops.gr.setup_alpha_beta_tables(g, gr);
1324 }
1325
1326 for (gpc_index = 0;
1327 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1328 gpc_index += 4) {
1329
1330 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1331 (gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) != 0U) ||
1332 (gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) != 0U) ||
1333 (gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) != 0U) ||
1334 (gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]) != 0U));
1335 }
1336
1337 fuse_tpc_mask = g->ops.gr.get_gpc_tpc_mask(g, 0);
1338 if ((g->tpc_fs_mask_user != 0U) &&
1339 (fuse_tpc_mask == BIT32(gr->max_tpc_count) - 1U)) {
1340 u32 val = g->tpc_fs_mask_user;
1341 val &= (0x1U << gr->max_tpc_count) - 1U;
1342 gk20a_writel(g, gr_cwd_fs_r(),
1343 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1344 gr_cwd_fs_num_tpcs_f(hweight32(val)));
1345 } else {
1346 gk20a_writel(g, gr_cwd_fs_r(),
1347 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1348 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1349 }
1350
1351 gk20a_writel(g, gr_bes_zrop_settings_r(),
1352 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1353 gk20a_writel(g, gr_bes_crop_settings_r(),
1354 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1355
1356 return 0;
1357}
1358
1359int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1360{
1361 struct gk20a *g = c->g;
1362 int ret;
1363
1364 nvgpu_log_fn(g, " ");
1365
1366 ret = gr_gk20a_submit_fecs_method_op(g,
1367 (struct fecs_method_op_gk20a) {
1368 .method.addr = save_type,
1369 .method.data = fecs_current_ctx_data(g, &c->inst_block),
1370 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1371 .ok = 1, .fail = 2,
1372 },
1373 .cond.ok = GR_IS_UCODE_OP_AND,
1374 .cond.fail = GR_IS_UCODE_OP_AND,
1375 }, true);
1376
1377 if (ret) {
1378 nvgpu_err(g, "save context image failed");
1379 }
1380
1381 return ret;
1382}
1383
1384u32 gk20a_init_sw_bundle(struct gk20a *g)
1385{
1386 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1387 u32 last_bundle_data = 0;
1388 u32 err = 0;
1389 unsigned int i;
1390
1391 /* disable fe_go_idle */
1392 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1393 gr_fe_go_idle_timeout_count_disabled_f());
1394 /* enable pipe mode override */
1395 gk20a_writel(g, gr_pipe_bundle_config_r(),
1396 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1397
1398 /* load bundle init */
1399 for (i = 0; i < sw_bundle_init->count; i++) {
1400 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1401 gk20a_writel(g, gr_pipe_bundle_data_r(),
1402 sw_bundle_init->l[i].value);
1403 last_bundle_data = sw_bundle_init->l[i].value;
1404 }
1405
1406 gk20a_writel(g, gr_pipe_bundle_address_r(),
1407 sw_bundle_init->l[i].addr);
1408
1409 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1410 GR_GO_IDLE_BUNDLE) {
1411 err = gr_gk20a_wait_idle(g,
1412 gk20a_get_gr_idle_timeout(g),
1413 GR_IDLE_CHECK_DEFAULT);
1414 if (err != 0U) {
1415 goto error;
1416 }
1417 }
1418
1419 err = gr_gk20a_wait_fe_idle(g, gk20a_get_gr_idle_timeout(g),
1420 GR_IDLE_CHECK_DEFAULT);
1421 if (err != 0U) {
1422 goto error;
1423 }
1424 }
1425
1426 if ((err == 0U) && (g->ops.gr.init_sw_veid_bundle != NULL)) {
1427 err = g->ops.gr.init_sw_veid_bundle(g);
1428 if (err != 0U) {
1429 goto error;
1430 }
1431 }
1432
1433 if (g->ops.gr.init_sw_bundle64) {
1434 err = g->ops.gr.init_sw_bundle64(g);
1435 if (err != 0U) {
1436 goto error;
1437 }
1438 }
1439
1440 /* disable pipe mode override */
1441 gk20a_writel(g, gr_pipe_bundle_config_r(),
1442 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1443
1444 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1445 GR_IDLE_CHECK_DEFAULT);
1446
1447 /* restore fe_go_idle */
1448 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1449 gr_fe_go_idle_timeout_count_prod_f());
1450
1451 return err;
1452
1453error:
1454 /* in case of error skip waiting for GR idle - just restore state */
1455 gk20a_writel(g, gr_pipe_bundle_config_r(),
1456 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1457
1458 /* restore fe_go_idle */
1459 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1460 gr_fe_go_idle_timeout_count_prod_f());
1461
1462 return err;
1463}
1464
1465/* init global golden image from a fresh gr_ctx in channel ctx.
1466 save a copy in local_golden_image in ctx_vars */
1467static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1468 struct channel_gk20a *c)
1469{
1470 struct gr_gk20a *gr = &g->gr;
1471 struct tsg_gk20a *tsg;
1472 struct nvgpu_gr_ctx *gr_ctx = NULL;
1473 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1474 u32 ctx_header_words;
1475 u32 i;
1476 u32 data;
1477 struct nvgpu_mem *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
1478 struct nvgpu_mem *gr_mem;
1479 u32 err = 0;
1480 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
1481 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
1482 u32 last_method_data = 0;
1483
1484 nvgpu_log_fn(g, " ");
1485
1486 tsg = tsg_gk20a_from_ch(c);
1487 if (tsg == NULL) {
1488 return -EINVAL;
1489 }
1490
1491 gr_ctx = &tsg->gr_ctx;
1492 gr_mem = &gr_ctx->mem;
1493
1494 /* golden ctx is global to all channels. Although only the first
1495 channel initializes golden image, driver needs to prevent multiple
1496 channels from initializing golden ctx at the same time */
1497 nvgpu_mutex_acquire(&gr->ctx_mutex);
1498
1499 if (gr->ctx_vars.golden_image_initialized) {
1500 goto clean_up;
1501 }
1502 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1503 struct nvgpu_timeout timeout;
1504
1505 nvgpu_timeout_init(g, &timeout,
1506 FE_PWR_MODE_TIMEOUT_MAX /
1507 FE_PWR_MODE_TIMEOUT_DEFAULT,
1508 NVGPU_TIMER_RETRY_TIMER);
1509 gk20a_writel(g, gr_fe_pwr_mode_r(),
1510 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_force_on_f());
1511 do {
1512 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1513 if (req == gr_fe_pwr_mode_req_done_v()) {
1514 break;
1515 }
1516 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1517 } while (nvgpu_timeout_expired_msg(&timeout,
1518 "timeout forcing FE on") == 0);
1519 }
1520
1521
1522 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1523 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1524 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1525 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1526 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1527 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1528 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1529 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
1530 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
1531 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
1532 (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1533 nvgpu_udelay(10);
1534
1535 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1536 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1537 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1538 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1539 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1540 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1541 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1542 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
1543 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
1544 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
1545 (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1546 nvgpu_udelay(10);
1547
1548 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1549 struct nvgpu_timeout timeout;
1550
1551 nvgpu_timeout_init(g, &timeout,
1552 FE_PWR_MODE_TIMEOUT_MAX /
1553 FE_PWR_MODE_TIMEOUT_DEFAULT,
1554 NVGPU_TIMER_RETRY_TIMER);
1555 gk20a_writel(g, gr_fe_pwr_mode_r(),
1556 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_auto_f());
1557
1558 do {
1559 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1560 if (req == gr_fe_pwr_mode_req_done_v()) {
1561 break;
1562 }
1563 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1564 } while (nvgpu_timeout_expired_msg(&timeout,
1565 "timeout setting FE power to auto") == 0);
1566 }
1567
1568 /* clear scc ram */
1569 gk20a_writel(g, gr_scc_init_r(),
1570 gr_scc_init_ram_trigger_f());
1571
1572 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1573 if (err != 0U) {
1574 goto clean_up;
1575 }
1576
1577 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1578 GR_IDLE_CHECK_DEFAULT);
1579
1580 /* load ctx init */
1581 for (i = 0; i < sw_ctx_load->count; i++) {
1582 gk20a_writel(g, sw_ctx_load->l[i].addr,
1583 sw_ctx_load->l[i].value);
1584 }
1585
1586 if (g->ops.gr.init_preemption_state) {
1587 g->ops.gr.init_preemption_state(g);
1588 }
1589
1590 if (g->ops.clock_gating.blcg_gr_load_gating_prod) {
1591 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
1592 }
1593
1594 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1595 GR_IDLE_CHECK_DEFAULT);
1596 if (err != 0U) {
1597 goto clean_up;
1598 }
1599
1600 /* disable fe_go_idle */
1601 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1602 gr_fe_go_idle_timeout_count_disabled_f());
1603
1604 err = g->ops.gr.commit_global_ctx_buffers(g, c, false);
1605 if (err != 0U) {
1606 goto clean_up;
1607 }
1608
1609 /* override a few ctx state registers */
1610 g->ops.gr.commit_global_timeslice(g, c);
1611
1612 /* floorsweep anything left */
1613 err = g->ops.gr.init_fs_state(g);
1614 if (err != 0U) {
1615 goto clean_up;
1616 }
1617
1618 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1619 GR_IDLE_CHECK_DEFAULT);
1620 if (err != 0U) {
1621 goto restore_fe_go_idle;
1622 }
1623
1624 err = gk20a_init_sw_bundle(g);
1625 if (err != 0U) {
1626 goto clean_up;
1627 }
1628
1629restore_fe_go_idle:
1630 /* restore fe_go_idle */
1631 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1632 gr_fe_go_idle_timeout_count_prod_f());
1633
1634 if ((err != 0U) || (gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1635 GR_IDLE_CHECK_DEFAULT) != 0)) {
1636 goto clean_up;
1637 }
1638
1639 /* load method init */
1640 if (sw_method_init->count) {
1641 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1642 sw_method_init->l[0].value);
1643 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1644 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1645 sw_method_init->l[0].addr);
1646 last_method_data = sw_method_init->l[0].value;
1647 }
1648 for (i = 1; i < sw_method_init->count; i++) {
1649 if (sw_method_init->l[i].value != last_method_data) {
1650 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1651 sw_method_init->l[i].value);
1652 last_method_data = sw_method_init->l[i].value;
1653 }
1654 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1655 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1656 sw_method_init->l[i].addr);
1657 }
1658
1659 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1660 GR_IDLE_CHECK_DEFAULT);
1661 if (err != 0U) {
1662 goto clean_up;
1663 }
1664
1665 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1666 ctx_header_words >>= 2;
1667
1668 g->ops.mm.l2_flush(g, true);
1669
1670 for (i = 0; i < ctx_header_words; i++) {
1671 data = nvgpu_mem_rd32(g, gr_mem, i);
1672 nvgpu_mem_wr32(g, gold_mem, i, data);
1673 }
1674 nvgpu_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(),
1675 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1676
1677 g->ops.gr.write_zcull_ptr(g, gold_mem, 0);
1678
1679 err = g->ops.gr.commit_inst(c, gr_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1680 if (err != 0U) {
1681 goto clean_up;
1682 }
1683
1684 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1685
1686
1687
1688 if (gr->ctx_vars.local_golden_image == NULL) {
1689
1690 gr->ctx_vars.local_golden_image =
1691 nvgpu_vzalloc(g, gr->ctx_vars.golden_image_size);
1692
1693 if (gr->ctx_vars.local_golden_image == NULL) {
1694 err = -ENOMEM;
1695 goto clean_up;
1696 }
1697 nvgpu_mem_rd_n(g, gold_mem, 0,
1698 gr->ctx_vars.local_golden_image,
1699 gr->ctx_vars.golden_image_size);
1700
1701 }
1702
1703 err = g->ops.gr.commit_inst(c, gr_mem->gpu_va);
1704 if (err != 0U) {
1705 goto clean_up;
1706 }
1707
1708 gr->ctx_vars.golden_image_initialized = true;
1709
1710 gk20a_writel(g, gr_fecs_current_ctx_r(),
1711 gr_fecs_current_ctx_valid_false_f());
1712
1713clean_up:
1714 if (err != 0U) {
1715 nvgpu_err(g, "fail");
1716 } else {
1717 nvgpu_log_fn(g, "done");
1718 }
1719
1720 nvgpu_mutex_release(&gr->ctx_mutex);
1721 return err;
1722}
1723
1724int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1725 struct channel_gk20a *c,
1726 bool enable_smpc_ctxsw)
1727{
1728 struct tsg_gk20a *tsg;
1729 struct nvgpu_gr_ctx *gr_ctx = NULL;
1730 struct nvgpu_mem *mem = NULL;
1731 u32 data;
1732 int ret;
1733
1734 nvgpu_log_fn(g, " ");
1735
1736 tsg = tsg_gk20a_from_ch(c);
1737 if (tsg == NULL) {
1738 return -EINVAL;
1739 }
1740
1741 gr_ctx = &tsg->gr_ctx;
1742 mem = &gr_ctx->mem;
1743 if (!nvgpu_mem_is_valid(mem)) {
1744 nvgpu_err(g, "no graphics context allocated");
1745 return -EFAULT;
1746 }
1747
1748 ret = gk20a_disable_channel_tsg(g, c);
1749 if (ret) {
1750 nvgpu_err(g, "failed to disable channel/TSG");
1751 goto out;
1752 }
1753 ret = gk20a_fifo_preempt(g, c);
1754 if (ret) {
1755 gk20a_enable_channel_tsg(g, c);
1756 nvgpu_err(g, "failed to preempt channel/TSG");
1757 goto out;
1758 }
1759
1760 /* Channel gr_ctx buffer is gpu cacheable.
1761 Flush and invalidate before cpu update. */
1762 g->ops.mm.l2_flush(g, true);
1763
1764 data = nvgpu_mem_rd(g, mem,
1765 ctxsw_prog_main_image_pm_o());
1766
1767 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1768 data |= enable_smpc_ctxsw ?
1769 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1770 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1771
1772 nvgpu_mem_wr(g, mem,
1773 ctxsw_prog_main_image_pm_o(), data);
1774
1775out:
1776 gk20a_enable_channel_tsg(g, c);
1777 return ret;
1778}
1779
1780int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1781 struct channel_gk20a *c,
1782 u64 gpu_va,
1783 u32 mode)
1784{
1785 struct tsg_gk20a *tsg;
1786 struct nvgpu_mem *gr_mem = NULL;
1787 struct nvgpu_gr_ctx *gr_ctx;
1788 struct pm_ctx_desc *pm_ctx;
1789 u32 data;
1790 u64 virt_addr = 0;
1791 struct nvgpu_mem *ctxheader = &c->ctx_header;
1792 int ret;
1793
1794 nvgpu_log_fn(g, " ");
1795
1796 tsg = tsg_gk20a_from_ch(c);
1797 if (tsg == NULL) {
1798 return -EINVAL;
1799 }
1800
1801 gr_ctx = &tsg->gr_ctx;
1802 pm_ctx = &gr_ctx->pm_ctx;
1803 gr_mem = &gr_ctx->mem;
1804 if (!nvgpu_mem_is_valid(gr_mem)) {
1805 nvgpu_err(g, "no graphics context allocated");
1806 return -EFAULT;
1807 }
1808
1809 if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
1810 (g->ops.gr.get_hw_accessor_stream_out_mode == NULL)) {
1811 nvgpu_err(g, "Mode-E hwpm context switch mode is not supported");
1812 return -EINVAL;
1813 }
1814
1815 switch (mode) {
1816 case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
1817 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) {
1818 return 0;
1819 }
1820 break;
1821 case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
1822 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
1823 return 0;
1824 }
1825 break;
1826 case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
1827 if (pm_ctx->pm_mode == g->ops.gr.get_hw_accessor_stream_out_mode()) {
1828 return 0;
1829 }
1830 break;
1831 default:
1832 nvgpu_err(g, "invalid hwpm context switch mode");
1833 return -EINVAL;
1834 }
1835
1836 ret = gk20a_disable_channel_tsg(g, c);
1837 if (ret) {
1838 nvgpu_err(g, "failed to disable channel/TSG");
1839 return ret;
1840 }
1841
1842 ret = gk20a_fifo_preempt(g, c);
1843 if (ret) {
1844 gk20a_enable_channel_tsg(g, c);
1845 nvgpu_err(g, "failed to preempt channel/TSG");
1846 return ret;
1847 }
1848
1849	/* Channel gr_ctx buffer is GPU cacheable.
1850	 * Flush and invalidate before CPU update. */
1851 g->ops.mm.l2_flush(g, true);
1852
1853 if (mode != NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW) {
1854 /* Allocate buffer if necessary */
1855 if (pm_ctx->mem.gpu_va == 0) {
1856 ret = nvgpu_dma_alloc_sys(g,
1857 g->gr.ctx_vars.pm_ctxsw_image_size,
1858 &pm_ctx->mem);
1859 if (ret) {
1860 c->g->ops.fifo.enable_channel(c);
1861 nvgpu_err(g,
1862 "failed to allocate pm ctxt buffer");
1863 return ret;
1864 }
1865
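			/*
			 * Map the PM ctxsw buffer at the fixed GPU VA supplied
			 * by the caller; the same address is written into the
			 * ctxsw header via write_pm_ptr() further down.
			 */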
1866 pm_ctx->mem.gpu_va = nvgpu_gmmu_map_fixed(c->vm,
1867 &pm_ctx->mem,
1868 gpu_va,
1869 pm_ctx->mem.size,
1870 NVGPU_VM_MAP_CACHEABLE,
1871 gk20a_mem_flag_none, true,
1872 pm_ctx->mem.aperture);
1873 if (pm_ctx->mem.gpu_va == 0ULL) {
1874 nvgpu_err(g,
1875 "failed to map pm ctxt buffer");
1876 nvgpu_dma_free(g, &pm_ctx->mem);
1877 c->g->ops.fifo.enable_channel(c);
1878 return -ENOMEM;
1879 }
1880 }
1881
1882 if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
1883 (g->ops.gr.init_hwpm_pmm_register != NULL)) {
1884 g->ops.gr.init_hwpm_pmm_register(g);
1885 }
1886 }
1887
1888 data = nvgpu_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o());
1889 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1890
1891 switch (mode) {
1892 case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
1893 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();
1894 virt_addr = pm_ctx->mem.gpu_va;
1895 break;
1896 case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
1897 pm_ctx->pm_mode = g->ops.gr.get_hw_accessor_stream_out_mode();
1898 virt_addr = pm_ctx->mem.gpu_va;
1899 break;
1900 case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
1901 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1902 virt_addr = 0;
1903 }
1904
1905 data |= pm_ctx->pm_mode;
1906
1907 nvgpu_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data);
1908
1909 if (ctxheader->gpu_va) {
1910 struct channel_gk20a *ch;
1911
1912 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
1913 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
1914 g->ops.gr.write_pm_ptr(g, &ch->ctx_header, virt_addr);
1915 }
1916 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
1917 } else {
1918 g->ops.gr.write_pm_ptr(g, gr_mem, virt_addr);
1919 }
1920
1921 /* enable channel */
1922 gk20a_enable_channel_tsg(g, c);
1923
1924 return 0;
1925}
1926
1927void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
1928 struct nvgpu_mem *mem)
1929{
1930 nvgpu_mem_wr(g, mem,
1931 ctxsw_prog_main_image_num_save_ops_o(), 0);
1932 nvgpu_mem_wr(g, mem,
1933 ctxsw_prog_main_image_num_restore_ops_o(), 0);
1934}
1935
1936/* load saved fresh copy of golden image into channel gr_ctx */
1937int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1938 struct channel_gk20a *c)
1939{
1940 struct gr_gk20a *gr = &g->gr;
1941 struct tsg_gk20a *tsg;
1942 struct nvgpu_gr_ctx *gr_ctx;
1943 u32 virt_addr_lo;
1944 u32 virt_addr_hi;
1945 u64 virt_addr = 0;
1946 u32 v, data;
1947 int ret = 0;
1948 struct nvgpu_mem *mem;
1949
1950 nvgpu_log_fn(g, " ");
1951
1952 tsg = tsg_gk20a_from_ch(c);
1953 if (tsg == NULL) {
1954 return -EINVAL;
1955 }
1956
1957 gr_ctx = &tsg->gr_ctx;
1958 mem = &gr_ctx->mem;
1959 if (gr->ctx_vars.local_golden_image == NULL) {
1960 return -EINVAL;
1961 }
1962
1963	/* Channel gr_ctx buffer is GPU cacheable.
1964	 * Flush and invalidate before CPU update. */
1965 g->ops.mm.l2_flush(g, true);
1966
1967 nvgpu_mem_wr_n(g, mem, 0,
1968 gr->ctx_vars.local_golden_image,
1969 gr->ctx_vars.golden_image_size);
1970
1971 if (g->ops.gr.init_ctxsw_hdr_data) {
1972 g->ops.gr.init_ctxsw_hdr_data(g, mem);
1973 }
1974
1975 if ((g->ops.gr.enable_cde_in_fecs != NULL) && c->cde) {
1976 g->ops.gr.enable_cde_in_fecs(g, mem);
1977 }
1978
1979 /* set priv access map */
1980 virt_addr_lo =
1981 u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1982 virt_addr_hi =
1983 u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1984
1985 if (g->allow_all) {
1986 data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f();
1987 } else {
1988 data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();
1989 }
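	/*
	 * allow_all selects the permissive access-map mode in which priv
	 * accesses are not restricted by the map; otherwise FECS consults the
	 * PRIV_ACCESS_MAP buffer whose address is written just below.
	 */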
1990
1991 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(),
1992 data);
1993
1994 nvgpu_mem_wr(g, mem,
1995 ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
1996 virt_addr_lo);
1997 nvgpu_mem_wr(g, mem,
1998 ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
1999 virt_addr_hi);
2000
2001 /* disable verif features */
2002 v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o());
2003 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
2004 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
2005 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v);
2006
2007 if (g->ops.gr.update_ctxsw_preemption_mode) {
2008 g->ops.gr.update_ctxsw_preemption_mode(g, c, mem);
2009 }
2010
2011 if (g->ops.gr.update_boosted_ctx) {
2012 g->ops.gr.update_boosted_ctx(g, mem, gr_ctx);
2013 }
2014
2015 virt_addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
2016 virt_addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
2017
2018 nvgpu_log(g, gpu_dbg_info, "write patch count = %d",
2019 gr_ctx->patch_ctx.data_count);
2020 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(),
2021 gr_ctx->patch_ctx.data_count);
2022
2023 nvgpu_mem_wr(g, mem,
2024 ctxsw_prog_main_image_patch_adr_lo_o(),
2025 virt_addr_lo);
2026 nvgpu_mem_wr(g, mem,
2027 ctxsw_prog_main_image_patch_adr_hi_o(),
2028 virt_addr_hi);
2029
2030 /* Update main header region of the context buffer with the info needed
2031 * for PM context switching, including mode and possibly a pointer to
2032 * the PM backing store.
2033 */
2034 if (gr_ctx->pm_ctx.pm_mode != ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
2035 if (gr_ctx->pm_ctx.mem.gpu_va == 0) {
2036 nvgpu_err(g,
2037 "context switched pm with no pm buffer!");
2038 return -EFAULT;
2039 }
2040
2041 virt_addr = gr_ctx->pm_ctx.mem.gpu_va;
2042 } else {
2043 virt_addr = 0;
2044 }
2045
2046 data = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_pm_o());
2047 data = data & ~ctxsw_prog_main_image_pm_mode_m();
2048 data |= gr_ctx->pm_ctx.pm_mode;
2049
2050 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data);
2051
2052 g->ops.gr.write_pm_ptr(g, mem, virt_addr);
2053
2054 return ret;
2055}
2056
2057static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
2058{
2059 nvgpu_log_fn(g, " ");
2060
2061 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
2062 gr_fecs_ctxsw_mailbox_clear_value_f(~0));
2063
2064 gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
2065 gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
2066
2067 gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
2068 gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
2069
2070 nvgpu_log_fn(g, "done");
2071}
2072
2073static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
2074{
2075 struct mm_gk20a *mm = &g->mm;
2076 struct vm_gk20a *vm = mm->pmu.vm;
2077 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2078 int err;
2079
2080 err = g->ops.mm.alloc_inst_block(g, &ucode_info->inst_blk_desc);
2081 if (err != 0) {
2082 return err;
2083 }
2084
2085 g->ops.mm.init_inst_block(&ucode_info->inst_blk_desc, vm, 0);
2086
2087 /* Map ucode surface to GMMU */
2088 ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm,
2089 &ucode_info->surface_desc,
2090 ucode_info->surface_desc.size,
2091 0, /* flags */
2092 gk20a_mem_flag_read_only,
2093 false,
2094 ucode_info->surface_desc.aperture);
2095 if (ucode_info->surface_desc.gpu_va == 0ULL) {
2096 nvgpu_err(g, "failed to update gmmu ptes");
2097 return -ENOMEM;
2098 }
2099
2100 return 0;
2101}
2102
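/*
 * Place one ucode segment at the current offset in the surface and advance
 * the running offset to the next BLK_SIZE (256 B) boundary so the following
 * segment starts block-aligned.
 */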
2103static void gr_gk20a_init_ctxsw_ucode_segment(
2104 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
2105{
2106 p_seg->offset = *offset;
2107 p_seg->size = size;
2108 *offset = ALIGN(*offset + size, BLK_SIZE);
2109}
2110
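/*
 * Lay out the boot, code and data segments for one falcon (FECS or GPCCS)
 * back to back in the shared ucode surface, recording the boot entry point
 * and IMEM offset from the bootloader descriptor.
 */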
2111static void gr_gk20a_init_ctxsw_ucode_segments(
2112 struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
2113 struct gk20a_ctxsw_bootloader_desc *bootdesc,
2114 u32 code_size, u32 data_size)
2115{
2116 u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
2117 segments->boot_entry = bootdesc->entry_point;
2118 segments->boot_imem_offset = bootdesc->imem_offset;
2119 gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
2120 gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
2121 gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
2122}
2123
2124static int gr_gk20a_copy_ctxsw_ucode_segments(
2125 struct gk20a *g,
2126 struct nvgpu_mem *dst,
2127 struct gk20a_ctxsw_ucode_segments *segments,
2128 u32 *bootimage,
2129 u32 *code, u32 *data)
2130{
2131 unsigned int i;
2132
2133 nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage,
2134 segments->boot.size);
2135 nvgpu_mem_wr_n(g, dst, segments->code.offset, code,
2136 segments->code.size);
2137 nvgpu_mem_wr_n(g, dst, segments->data.offset, data,
2138 segments->data.size);
2139
2140 /* compute a "checksum" for the boot binary to detect its version */
2141 segments->boot_signature = 0;
2142 for (i = 0; i < segments->boot.size / sizeof(u32); i++) {
2143 segments->boot_signature += bootimage[i];
2144 }
2145
2146 return 0;
2147}
2148
2149int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2150{
2151 struct mm_gk20a *mm = &g->mm;
2152 struct vm_gk20a *vm = mm->pmu.vm;
2153 struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
2154 struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
2155 struct nvgpu_firmware *fecs_fw;
2156 struct nvgpu_firmware *gpccs_fw;
2157 u32 *fecs_boot_image;
2158 u32 *gpccs_boot_image;
2159 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2160 u32 ucode_size;
2161 int err = 0;
2162
2163 fecs_fw = nvgpu_request_firmware(g, GK20A_FECS_UCODE_IMAGE, 0);
2164 if (fecs_fw == NULL) {
2165 nvgpu_err(g, "failed to load fecs ucode!!");
2166 return -ENOENT;
2167 }
2168
2169 fecs_boot_desc = (void *)fecs_fw->data;
2170 fecs_boot_image = (void *)(fecs_fw->data +
2171 sizeof(struct gk20a_ctxsw_bootloader_desc));
2172
2173 gpccs_fw = nvgpu_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE, 0);
2174 if (gpccs_fw == NULL) {
2175 nvgpu_release_firmware(g, fecs_fw);
2176 nvgpu_err(g, "failed to load gpccs ucode!!");
2177 return -ENOENT;
2178 }
2179
2180 gpccs_boot_desc = (void *)gpccs_fw->data;
2181 gpccs_boot_image = (void *)(gpccs_fw->data +
2182 sizeof(struct gk20a_ctxsw_bootloader_desc));
2183
2184 ucode_size = 0;
2185 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
2186 fecs_boot_desc,
2187 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
2188 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
2189 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
2190 gpccs_boot_desc,
2191 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
2192 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
2193
2194 err = nvgpu_dma_alloc_sys(g, ucode_size, &ucode_info->surface_desc);
2195 if (err != 0) {
2196 goto clean_up;
2197 }
2198
2199 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2200 &ucode_info->fecs,
2201 fecs_boot_image,
2202 g->gr.ctx_vars.ucode.fecs.inst.l,
2203 g->gr.ctx_vars.ucode.fecs.data.l);
2204
2205 nvgpu_release_firmware(g, fecs_fw);
2206 fecs_fw = NULL;
2207
2208 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2209 &ucode_info->gpccs,
2210 gpccs_boot_image,
2211 g->gr.ctx_vars.ucode.gpccs.inst.l,
2212 g->gr.ctx_vars.ucode.gpccs.data.l);
2213
2214 nvgpu_release_firmware(g, gpccs_fw);
2215 gpccs_fw = NULL;
2216
2217 err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
2218 if (err != 0) {
2219 goto clean_up;
2220 }
2221
2222 return 0;
2223
2224clean_up:
2225 if (ucode_info->surface_desc.gpu_va) {
2226 nvgpu_gmmu_unmap(vm, &ucode_info->surface_desc,
2227 ucode_info->surface_desc.gpu_va);
2228 }
2229 nvgpu_dma_free(g, &ucode_info->surface_desc);
2230
2231 nvgpu_release_firmware(g, gpccs_fw);
2232 gpccs_fw = NULL;
2233 nvgpu_release_firmware(g, fecs_fw);
2234 fecs_fw = NULL;
2235
2236 return err;
2237}
2238
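/*
 * Poll until the FECS arbiter has completed its last command and reports
 * idle in ctxsw_status_1; a timeout is logged but not treated as fatal.
 */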
2239static void gr_gk20a_wait_for_fecs_arb_idle(struct gk20a *g)
2240{
2241 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2242 u32 val;
2243
2244 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2245 while ((gr_fecs_arb_ctx_cmd_cmd_v(val) != 0U) && (retries != 0)) {
2246 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2247 retries--;
2248 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2249 }
2250
2251 if (retries == 0) {
2252 nvgpu_err(g, "arbiter cmd timeout, fecs arb ctx cmd: 0x%08x",
2253 gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
2254 }
2255
2256 retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2257 while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2258 gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
2259 (retries != 0)) {
2260 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2261 retries--;
2262 }
2263 if (retries == 0) {
2264 nvgpu_err(g,
2265 "arbiter idle timeout, fecs ctxsw status: 0x%08x",
2266 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2267 }
2268}
2269
2270void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
2271{
2272 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2273 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2274 u64 inst_ptr;
2275
2276 while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2277 gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
2278 (retries != 0)) {
2279 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2280 retries--;
2281 }
2282 if (retries == 0) {
2283 nvgpu_err(g,
2284 "arbiter idle timeout, status: %08x",
2285 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2286 }
2287
2288 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2289
2290 inst_ptr = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc);
2291 gk20a_writel(g, gr_fecs_new_ctx_r(),
2292 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2293 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2294 gr_fecs_new_ctx_target_sys_mem_ncoh_f(),
2295 gr_fecs_new_ctx_target_sys_mem_coh_f(),
2296 gr_fecs_new_ctx_target_vid_mem_f()) |
2297 gr_fecs_new_ctx_valid_m());
2298
2299 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2300 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2301 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2302 gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(),
2303 gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(),
2304 gr_fecs_arb_ctx_ptr_target_vid_mem_f()));
2305
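	/*
	 * Kick the arbiter; command 0x7 appears to bind the context/instance
	 * block programmed above.
	 */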
2306 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2307
2308 /* Wait for arbiter command to complete */
2309 gr_gk20a_wait_for_fecs_arb_idle(g);
2310
2311 gk20a_writel(g, gr_fecs_current_ctx_r(),
2312 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2313 gr_fecs_current_ctx_target_m() |
2314 gr_fecs_current_ctx_valid_m());
2315 /* Send command to arbiter to flush */
2316 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2317
2318 gr_gk20a_wait_for_fecs_arb_idle(g);
2319
2320}
2321
2322void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
2323 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2324{
2325 u32 addr_code32;
2326 u32 addr_data32;
2327
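	/*
	 * Code and data addresses handed to the falcon are in 256-byte units,
	 * hence the >> 8 on the byte offsets below.
	 */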
2328 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2329 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2330
2331 /*
2332 * Copy falcon bootloader header into dmem at offset 0.
2333 * Configure dmem port 0 for auto-incrementing writes starting at dmem
2334 * offset 0.
2335 */
2336 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2337 gr_fecs_dmemc_offs_f(0) |
2338 gr_fecs_dmemc_blk_f(0) |
2339 gr_fecs_dmemc_aincw_f(1));
2340
2341 /* Write out the actual data */
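	/*
	 * The DMEM parameter block layout depends on the bootloader version,
	 * identified by the boot signature: the *_WITH_RESERVED variants get
	 * four extra zero words written before the common fields, hence the
	 * fallthrough below.
	 */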
2342 switch (segments->boot_signature) {
2343 case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED:
2344 case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
2345 case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
2346 case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
2347 case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
2348 case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
2349 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2350 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2351 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2352 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2353 /* fallthrough */
2354 case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
2355 case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
2356 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
2357 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
2358 case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
2359 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2360 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2361 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2362 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2363 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
2364 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2365 addr_code32);
2366 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2367 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2368 segments->code.size);
2369 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2370 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2371 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2372 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2373 addr_data32);
2374 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2375 segments->data.size);
2376 break;
2377 case FALCON_UCODE_SIG_T12X_FECS_OLDER:
2378 case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
2379 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2380 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2381 addr_code32);
2382 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2383 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2384 segments->code.size);
2385 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2386 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2387 addr_data32);
2388 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2389 segments->data.size);
2390 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2391 addr_code32);
2392 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2393 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2394 break;
2395 default:
2396 nvgpu_err(g,
2397 "unknown falcon ucode boot signature 0x%08x"
2398 " with reg_offset 0x%08x",
2399 segments->boot_signature, reg_offset);
2400 BUG();
2401 }
2402}
2403
2404void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
2405 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2406{
2407 u32 addr_load32;
2408 u32 blocks;
2409 u32 b;
2410 u32 dst;
2411
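	/*
	 * addr_load32 is the boot segment base in 256-byte units; blocks is
	 * the boot image size rounded up to whole 256-byte IMEM blocks.
	 */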
2412 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2413 blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2414
2415 /*
2416 * Set the base FB address for the DMA transfer. Subtract off the 256
2417 * byte IMEM block offset such that the relative FB and IMEM offsets
2418 * match, allowing the IMEM tags to be properly created.
2419 */
2420
2421 dst = segments->boot_imem_offset;
2422 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2423 (addr_load32 - (dst >> 8)));
2424
2425 for (b = 0; b < blocks; b++) {
2426 /* Setup destination IMEM offset */
2427 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2428 dst + (b << 8));
2429
2430 /* Setup source offset (relative to BASE) */
2431 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2432 dst + (b << 8));
2433
2434 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2435 gr_fecs_dmatrfcmd_imem_f(0x01) |
2436 gr_fecs_dmatrfcmd_write_f(0x00) |
2437 gr_fecs_dmatrfcmd_size_f(0x06) |
2438 gr_fecs_dmatrfcmd_ctxdma_f(0));
2439 }
2440
2441 /* Specify the falcon boot vector */
2442 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2443 gr_fecs_bootvec_vec_f(segments->boot_entry));
2444}
2445
2446static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2447{
2448 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2449 u64 addr_base = ucode_info->surface_desc.gpu_va;
2450
2451 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2452
2453 gr_gk20a_load_falcon_bind_instblk(g);
2454
2455 g->ops.gr.falcon_load_ucode(g, addr_base,
2456 &g->ctxsw_ucode_info.fecs, 0);
2457
2458 g->ops.gr.falcon_load_ucode(g, addr_base,
2459 &g->ctxsw_ucode_info.gpccs,
2460 gr_gpcs_gpccs_falcon_hwcfg_r() -
2461 gr_fecs_falcon_hwcfg_r());
2462}
2463
2464int gr_gk20a_load_ctxsw_ucode(struct gk20a *g)
2465{
2466 int err;
2467
2468 nvgpu_log_fn(g, " ");
2469
2470 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
2471 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2472 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2473 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2474 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2475 }
2476
2477 /*
2478 * In case bootloader is not supported, revert to the old way of
2479 * loading gr ucode, without the faster bootstrap routine.
2480 */
2481 if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
2482 gr_gk20a_load_falcon_dmem(g);
2483 gr_gk20a_load_falcon_imem(g);
2484 gr_gk20a_start_falcon_ucode(g);
2485 } else {
2486 if (!g->gr.skip_ucode_init) {
2487 err = gr_gk20a_init_ctxsw_ucode(g);
2488
2489 if (err != 0) {
2490 return err;
2491 }
2492 }
2493 gr_gk20a_load_falcon_with_bootloader(g);
2494 g->gr.skip_ucode_init = true;
2495 }
2496 nvgpu_log_fn(g, "done");
2497 return 0;
2498}
2499
2500int gr_gk20a_set_fecs_watchdog_timeout(struct gk20a *g)
2501{
2502 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2503 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2504 gk20a_writel(g, gr_fecs_method_push_r(),
2505 gr_fecs_method_push_adr_set_watchdog_timeout_f());
2506
2507 return 0;
2508}
2509
2510static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g)
2511{
2512 u32 ret;
2513
2514 nvgpu_log_fn(g, " ");
2515
2516 ret = gr_gk20a_ctx_wait_ucode(g, 0, NULL,
2517 GR_IS_UCODE_OP_EQUAL,
2518 eUcodeHandshakeInitComplete,
2519 GR_IS_UCODE_OP_SKIP, 0, false);
2520 if (ret) {
2521 nvgpu_err(g, "falcon ucode init timeout");
2522 return ret;
2523 }
2524
2525 if (nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP) ||
2526 nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
2527 gk20a_writel(g, gr_fecs_current_ctx_r(),
2528 gr_fecs_current_ctx_valid_false_f());
2529 }
2530
2531 ret = g->ops.gr.set_fecs_watchdog_timeout(g);
2532 if (ret) {
2533 nvgpu_err(g, "fail to set watchdog timeout");
2534 return ret;
2535 }
2536
2537 nvgpu_log_fn(g, "done");
2538 return 0;
2539}
2540
2541int gr_gk20a_init_ctx_state(struct gk20a *g)
2542{
2543 u32 ret;
2544 struct fecs_method_op_gk20a op = {
2545 .mailbox = { .id = 0, .data = 0,
2546 .clr = ~0, .ok = 0, .fail = 0},
2547 .method.data = 0,
2548 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2549 .cond.fail = GR_IS_UCODE_OP_SKIP,
2550 };
2551
2552 nvgpu_log_fn(g, " ");
2553	/* query ctxsw image sizes, if the golden context has not been created yet */
2554 if (!g->gr.ctx_vars.golden_image_initialized) {
2555 op.method.addr =
2556 gr_fecs_method_push_adr_discover_image_size_v();
2557 op.mailbox.ret = &g->gr.ctx_vars.golden_image_size;
2558 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2559 if (ret) {
2560 nvgpu_err(g,
2561 "query golden image size failed");
2562 return ret;
2563 }
2564 op.method.addr =
2565 gr_fecs_method_push_adr_discover_zcull_image_size_v();
2566 op.mailbox.ret = &g->gr.ctx_vars.zcull_ctxsw_image_size;
2567 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2568 if (ret) {
2569 nvgpu_err(g,
2570 "query zcull ctx image size failed");
2571 return ret;
2572 }
2573 op.method.addr =
2574 gr_fecs_method_push_adr_discover_pm_image_size_v();
2575 op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size;
2576 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2577 if (ret) {
2578 nvgpu_err(g,
2579 "query pm ctx image size failed");
2580 return ret;
2581 }
2582 g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2583#ifdef CONFIG_GK20A_CTXSW_TRACE
2584 g->gr.ctx_vars.fecs_trace_buffer_size =
2585 gk20a_fecs_trace_buffer_size(g);
2586#endif
2587 }
2588
2589 nvgpu_log_fn(g, "done");
2590 return 0;
2591}
2592
2593void gk20a_gr_destroy_ctx_buffer(struct gk20a *g,
2594 struct gr_ctx_buffer_desc *desc)
2595{
2596 if (desc == NULL) {
2597 return;
2598 }
2599 nvgpu_dma_free(g, &desc->mem);
2600 desc->destroy = NULL;
2601}
2602
2603int gk20a_gr_alloc_ctx_buffer(struct gk20a *g,
2604 struct gr_ctx_buffer_desc *desc,
2605 size_t size)
2606{
2607 int err = 0;
2608
2609 nvgpu_log_fn(g, " ");
2610
2611 if (nvgpu_mem_is_valid(&desc->mem)) {
2612 return 0;
2613 }
2614
2615 err = nvgpu_dma_alloc_sys(g, size, &desc->mem);
2616 if (err != 0) {
2617 return err;
2618 }
2619
2620 desc->destroy = gk20a_gr_destroy_ctx_buffer;
2621
2622 return err;
2623}
2624
2625static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2626{
2627 struct gr_gk20a *gr = &g->gr;
2628 u32 i;
2629
2630 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2631 /* destroy exists iff buffer is allocated */
2632 if (gr->global_ctx_buffer[i].destroy) {
2633 gr->global_ctx_buffer[i].destroy(g,
2634 &gr->global_ctx_buffer[i]);
2635 }
2636 }
2637
2638 nvgpu_log_fn(g, "done");
2639}
2640
2641int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2642{
2643 struct gr_gk20a *gr = &g->gr;
2644 int attr_buffer_size, err;
2645
2646 u32 cb_buffer_size = gr->bundle_cb_default_size *
2647 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2648
2649 u32 pagepool_buffer_size = g->ops.gr.pagepool_default_size(g) *
2650 gr_scc_pagepool_total_pages_byte_granularity_v();
2651
2652 nvgpu_log_fn(g, " ");
2653
2654 attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2655
2656 nvgpu_log_info(g, "cb_buffer_size : %d", cb_buffer_size);
2657
2658 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[CIRCULAR],
2659 cb_buffer_size);
2660 if (err != 0) {
2661 goto clean_up;
2662 }
2663
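	/*
	 * Where the platform provides secure_alloc, also allocate VPR
	 * (protected) copies of the circular, pagepool and attribute buffers;
	 * VPR channels are mapped against these instead of the normal copies.
	 */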
2664 if (g->ops.secure_alloc) {
2665 err = g->ops.secure_alloc(g,
2666 &gr->global_ctx_buffer[CIRCULAR_VPR],
2667 cb_buffer_size);
2668 if (err != 0) {
2669 goto clean_up;
2670 }
2671 }
2672
2673 nvgpu_log_info(g, "pagepool_buffer_size : %d", pagepool_buffer_size);
2674
2675 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[PAGEPOOL],
2676 pagepool_buffer_size);
2677 if (err != 0) {
2678 goto clean_up;
2679 }
2680
2681 if (g->ops.secure_alloc) {
2682 err = g->ops.secure_alloc(g,
2683 &gr->global_ctx_buffer[PAGEPOOL_VPR],
2684 pagepool_buffer_size);
2685 if (err != 0) {
2686 goto clean_up;
2687 }
2688 }
2689
2690 nvgpu_log_info(g, "attr_buffer_size : %d", attr_buffer_size);
2691
2692 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[ATTRIBUTE],
2693 attr_buffer_size);
2694 if (err != 0) {
2695 goto clean_up;
2696 }
2697
2698 if (g->ops.secure_alloc) {
2699 err = g->ops.secure_alloc(g,
2700 &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2701 attr_buffer_size);
2702 if (err != 0) {
2703 goto clean_up;
2704 }
2705 }
2706
2707 nvgpu_log_info(g, "golden_image_size : %d",
2708 gr->ctx_vars.golden_image_size);
2709
2710 err = gk20a_gr_alloc_ctx_buffer(g,
2711 &gr->global_ctx_buffer[GOLDEN_CTX],
2712 gr->ctx_vars.golden_image_size);
2713 if (err != 0) {
2714 goto clean_up;
2715 }
2716
2717 nvgpu_log_info(g, "priv_access_map_size : %d",
2718 gr->ctx_vars.priv_access_map_size);
2719
2720 err = gk20a_gr_alloc_ctx_buffer(g,
2721 &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2722 gr->ctx_vars.priv_access_map_size);
2723
2724 if (err != 0) {
2725 goto clean_up;
2726 }
2727
2728#ifdef CONFIG_GK20A_CTXSW_TRACE
2729 nvgpu_log_info(g, "fecs_trace_buffer_size : %d",
2730 gr->ctx_vars.fecs_trace_buffer_size);
2731
2732 err = nvgpu_dma_alloc_sys(g,
2733 gr->ctx_vars.fecs_trace_buffer_size,
2734 &gr->global_ctx_buffer[FECS_TRACE_BUFFER].mem);
2735 if (err != 0) {
2736 goto clean_up;
2737 }
2738
2739 gr->global_ctx_buffer[FECS_TRACE_BUFFER].destroy =
2740 gk20a_gr_destroy_ctx_buffer;
2741#endif
2742
2743 nvgpu_log_fn(g, "done");
2744 return 0;
2745
2746 clean_up:
2747 nvgpu_err(g, "fail");
2748 gr_gk20a_free_global_ctx_buffers(g);
2749 return -ENOMEM;
2750}
2751
2752static void gr_gk20a_unmap_global_ctx_buffers(struct gk20a *g,
2753 struct vm_gk20a *vm,
2754 struct nvgpu_gr_ctx *gr_ctx)
2755{
2756 u64 *g_bfr_va = gr_ctx->global_ctx_buffer_va;
2757 u64 *g_bfr_size = gr_ctx->global_ctx_buffer_size;
2758 int *g_bfr_index = gr_ctx->global_ctx_buffer_index;
2759 u32 i;
2760
2761 nvgpu_log_fn(g, " ");
2762
2763 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2764 if (g_bfr_index[i]) {
2765 struct nvgpu_mem *mem;
2766
2767 /*
2768 * Translate from VA index to buffer index to determine
2769 * the correct struct nvgpu_mem to use. Handles the VPR
2770 * vs non-VPR difference in context images.
2771 */
2772 mem = &g->gr.global_ctx_buffer[g_bfr_index[i]].mem;
2773
2774 nvgpu_gmmu_unmap(vm, mem, g_bfr_va[i]);
2775 }
2776 }
2777
2778 memset(g_bfr_va, 0, sizeof(gr_ctx->global_ctx_buffer_va));
2779 memset(g_bfr_size, 0, sizeof(gr_ctx->global_ctx_buffer_size));
2780 memset(g_bfr_index, 0, sizeof(gr_ctx->global_ctx_buffer_index));
2781
2782 gr_ctx->global_ctx_buffer_mapped = false;
2783}
2784
2785int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2786 struct channel_gk20a *c)
2787{
2788 struct tsg_gk20a *tsg;
2789 struct vm_gk20a *ch_vm = c->vm;
2790 u64 *g_bfr_va;
2791 u64 *g_bfr_size;
2792 int *g_bfr_index;
2793 struct gr_gk20a *gr = &g->gr;
2794 struct nvgpu_mem *mem;
2795 u64 gpu_va;
2796
2797 nvgpu_log_fn(g, " ");
2798
2799 tsg = tsg_gk20a_from_ch(c);
2800 if (tsg == NULL) {
2801 return -EINVAL;
2802 }
2803
2804 g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va;
2805 g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size;
2806 g_bfr_index = tsg->gr_ctx.global_ctx_buffer_index;
2807
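	/*
	 * For each global buffer: pick the VPR copy when the channel is VPR
	 * and one exists, map it into the channel VM, and record the VA, size
	 * and backing-buffer index in the TSG's gr_ctx so it can be unmapped
	 * later.
	 */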
2808 /* Circular Buffer */
2809 if (c->vpr &&
2810 nvgpu_mem_is_valid(&gr->global_ctx_buffer[CIRCULAR_VPR].mem)) {
2811 mem = &gr->global_ctx_buffer[CIRCULAR_VPR].mem;
2812 g_bfr_index[CIRCULAR_VA] = CIRCULAR_VPR;
2813 } else {
2814 mem = &gr->global_ctx_buffer[CIRCULAR].mem;
2815 g_bfr_index[CIRCULAR_VA] = CIRCULAR;
2816 }
2817
2818 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2819 NVGPU_VM_MAP_CACHEABLE,
2820 gk20a_mem_flag_none, true, mem->aperture);
2821 if (gpu_va == 0ULL) {
2822 goto clean_up;
2823 }
2824 g_bfr_va[CIRCULAR_VA] = gpu_va;
2825 g_bfr_size[CIRCULAR_VA] = mem->size;
2826
2827 /* Attribute Buffer */
2828 if (c->vpr &&
2829 nvgpu_mem_is_valid(&gr->global_ctx_buffer[ATTRIBUTE_VPR].mem)) {
2830 mem = &gr->global_ctx_buffer[ATTRIBUTE_VPR].mem;
2831 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE_VPR;
2832 } else {
2833 mem = &gr->global_ctx_buffer[ATTRIBUTE].mem;
2834 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE;
2835 }
2836
2837 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2838 NVGPU_VM_MAP_CACHEABLE,
2839 gk20a_mem_flag_none, false, mem->aperture);
2840 if (gpu_va == 0ULL) {
2841 goto clean_up;
2842 }
2843 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2844 g_bfr_size[ATTRIBUTE_VA] = mem->size;
2845
2846 /* Page Pool */
2847 if (c->vpr &&
2848 nvgpu_mem_is_valid(&gr->global_ctx_buffer[PAGEPOOL_VPR].mem)) {
2849 mem = &gr->global_ctx_buffer[PAGEPOOL_VPR].mem;
2850 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL_VPR;
2851 } else {
2852 mem = &gr->global_ctx_buffer[PAGEPOOL].mem;
2853 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL;
2854 }
2855
2856 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2857 NVGPU_VM_MAP_CACHEABLE,
2858 gk20a_mem_flag_none, true, mem->aperture);
2859 if (gpu_va == 0ULL) {
2860 goto clean_up;
2861 }
2862 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2863 g_bfr_size[PAGEPOOL_VA] = mem->size;
2864
2865 /* Golden Image */
2866 mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
2867 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2868 gk20a_mem_flag_none, true, mem->aperture);
2869 if (gpu_va == 0ULL) {
2870 goto clean_up;
2871 }
2872 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2873 g_bfr_size[GOLDEN_CTX_VA] = mem->size;
2874 g_bfr_index[GOLDEN_CTX_VA] = GOLDEN_CTX;
2875
2876 /* Priv register Access Map */
2877 mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
2878 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2879 gk20a_mem_flag_none, true, mem->aperture);
2880 if (gpu_va == 0ULL) {
2881 goto clean_up;
2882 }
2883 g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2884 g_bfr_size[PRIV_ACCESS_MAP_VA] = mem->size;
2885 g_bfr_index[PRIV_ACCESS_MAP_VA] = PRIV_ACCESS_MAP;
2886
2887 tsg->gr_ctx.global_ctx_buffer_mapped = true;
2888
2889#ifdef CONFIG_GK20A_CTXSW_TRACE
2890 /* FECS trace buffer */
2891 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
2892 mem = &gr->global_ctx_buffer[FECS_TRACE_BUFFER].mem;
2893 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2894 gk20a_mem_flag_none, true, mem->aperture);
2895 if (!gpu_va)
2896 goto clean_up;
2897 g_bfr_va[FECS_TRACE_BUFFER_VA] = gpu_va;
2898 g_bfr_size[FECS_TRACE_BUFFER_VA] = mem->size;
2899 g_bfr_index[FECS_TRACE_BUFFER_VA] = FECS_TRACE_BUFFER;
2900 }
2901#endif
2902
2903 return 0;
2904
2905clean_up:
2906 gr_gk20a_unmap_global_ctx_buffers(g, ch_vm, &tsg->gr_ctx);
2907
2908 return -ENOMEM;
2909}
2910
2911int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
2912 struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,
2913 u32 class,
2914 u32 padding)
2915{
2916 struct gr_gk20a *gr = &g->gr;
2917 int err = 0;
2918
2919 nvgpu_log_fn(g, " ");
2920
2921 if (gr->ctx_vars.buffer_size == 0) {
2922 return 0;
2923 }
2924
2925 /* alloc channel gr ctx buffer */
2926 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2927 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2928
2929 err = nvgpu_dma_alloc(g, gr->ctx_vars.buffer_total_size, &gr_ctx->mem);
2930 if (err != 0) {
2931 return err;
2932 }
2933
2934 gr_ctx->mem.gpu_va = nvgpu_gmmu_map(vm,
2935 &gr_ctx->mem,
2936 gr_ctx->mem.size,
2937 0, /* not GPU-cacheable */
2938 gk20a_mem_flag_none, true,
2939 gr_ctx->mem.aperture);
2940 if (gr_ctx->mem.gpu_va == 0ULL) {
2941 goto err_free_mem;
2942 }
2943
2944 return 0;
2945
2946 err_free_mem:
2947 nvgpu_dma_free(g, &gr_ctx->mem);
2948
2949 return err;
2950}
2951
2952static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
2953 struct tsg_gk20a *tsg, u32 class, u32 padding)
2954{
2955 struct nvgpu_gr_ctx *gr_ctx = &tsg->gr_ctx;
2956 int err;
2957
2958 if (tsg->vm == NULL) {
2959 nvgpu_err(tsg->g, "No address space bound");
2960 return -ENOMEM;
2961 }
2962
2963 err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, class, padding);
2964 if (err != 0) {
2965 return err;
2966 }
2967
2968 gr_ctx->tsgid = tsg->tsgid;
2969
2970 return 0;
2971}
2972
2973void gr_gk20a_free_gr_ctx(struct gk20a *g,
2974 struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx)
2975{
2976 nvgpu_log_fn(g, " ");
2977
2978 if (gr_ctx->mem.gpu_va) {
2979 gr_gk20a_unmap_global_ctx_buffers(g, vm, gr_ctx);
2980 gr_gk20a_free_channel_patch_ctx(g, vm, gr_ctx);
2981 gr_gk20a_free_channel_pm_ctx(g, vm, gr_ctx);
2982
2983 if ((g->ops.gr.dump_ctxsw_stats != NULL) &&
2984 g->gr.ctx_vars.dump_ctxsw_stats_on_channel_close) {
2985 g->ops.gr.dump_ctxsw_stats(g, vm, gr_ctx);
2986 }
2987
2988 nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer);
2989 nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer);
2990 nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer);
2991 nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer);
2992 nvgpu_dma_unmap_free(vm, &gr_ctx->mem);
2993
2994 memset(gr_ctx, 0, sizeof(*gr_ctx));
2995 }
2996}
2997
2998void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg)
2999{
3000 struct gk20a *g = tsg->g;
3001
3002 if (tsg->vm == NULL) {
3003 nvgpu_err(g, "No address space bound");
3004 return;
3005 }
3006 tsg->g->ops.gr.free_gr_ctx(g, tsg->vm, &tsg->gr_ctx);
3007}
3008
3009u32 gr_gk20a_get_patch_slots(struct gk20a *g)
3010{
3011 return PATCH_CTX_SLOTS_PER_PAGE;
3012}
3013
3014static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
3015 struct channel_gk20a *c)
3016{
3017 struct tsg_gk20a *tsg;
3018 struct patch_desc *patch_ctx;
3019 struct vm_gk20a *ch_vm = c->vm;
3020 u32 alloc_size;
3021 int err = 0;
3022
3023 nvgpu_log_fn(g, " ");
3024
3025 tsg = tsg_gk20a_from_ch(c);
3026 if (tsg == NULL) {
3027 return -EINVAL;
3028 }
3029
3030 patch_ctx = &tsg->gr_ctx.patch_ctx;
3031 alloc_size = g->ops.gr.get_patch_slots(g) *
3032 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
3033
3034 nvgpu_log(g, gpu_dbg_info, "patch buffer size in entries: %d",
3035 alloc_size);
3036
3037 err = nvgpu_dma_alloc_map_sys(ch_vm,
3038 alloc_size * sizeof(u32), &patch_ctx->mem);
3039 if (err != 0) {
3040 return err;
3041 }
3042
3043 nvgpu_log_fn(g, "done");
3044 return 0;
3045}
3046
3047static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g,
3048 struct vm_gk20a *vm,
3049 struct nvgpu_gr_ctx *gr_ctx)
3050{
3051 struct patch_desc *patch_ctx = &gr_ctx->patch_ctx;
3052
3053 nvgpu_log_fn(g, " ");
3054
3055 if (patch_ctx->mem.gpu_va) {
3056 nvgpu_gmmu_unmap(vm, &patch_ctx->mem,
3057 patch_ctx->mem.gpu_va);
3058 }
3059
3060 nvgpu_dma_free(g, &patch_ctx->mem);
3061 patch_ctx->data_count = 0;
3062}
3063
3064static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g,
3065 struct vm_gk20a *vm,
3066 struct nvgpu_gr_ctx *gr_ctx)
3067{
3068 struct pm_ctx_desc *pm_ctx = &gr_ctx->pm_ctx;
3069
3070 nvgpu_log_fn(g, " ");
3071
3072 if (pm_ctx->mem.gpu_va) {
3073 nvgpu_gmmu_unmap(vm, &pm_ctx->mem, pm_ctx->mem.gpu_va);
3074
3075 nvgpu_dma_free(g, &pm_ctx->mem);
3076 }
3077}
3078
3079int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
3080{
3081 struct gk20a *g = c->g;
3082 struct nvgpu_gr_ctx *gr_ctx;
3083 struct tsg_gk20a *tsg = NULL;
3084 int err = 0;
3085
3086 nvgpu_log_fn(g, " ");
3087
3088	/* an address space needs to have been bound at this point. */
3089 if (!gk20a_channel_as_bound(c) && (c->vm == NULL)) {
3090 nvgpu_err(g,
3091 "not bound to address space at time"
3092 " of grctx allocation");
3093 return -EINVAL;
3094 }
3095
3096 if (!g->ops.gr.is_valid_class(g, class_num)) {
3097 nvgpu_err(g,
3098 "invalid obj class 0x%x", class_num);
3099 err = -EINVAL;
3100 goto out;
3101 }
3102 c->obj_class = class_num;
3103
3104 tsg = tsg_gk20a_from_ch(c);
3105 if (tsg == NULL) {
3106 return -EINVAL;
3107 }
3108
3109 gr_ctx = &tsg->gr_ctx;
3110
3111 if (!nvgpu_mem_is_valid(&gr_ctx->mem)) {
3112 tsg->vm = c->vm;
3113 nvgpu_vm_get(tsg->vm);
3114 err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg,
3115 class_num,
3116 flags);
3117 if (err != 0) {
3118 nvgpu_err(g,
3119 "fail to allocate TSG gr ctx buffer");
3120 nvgpu_vm_put(tsg->vm);
3121 tsg->vm = NULL;
3122 goto out;
3123 }
3124
3125 /* allocate patch buffer */
3126 if (!nvgpu_mem_is_valid(&gr_ctx->patch_ctx.mem)) {
3127 gr_ctx->patch_ctx.data_count = 0;
3128 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
3129 if (err != 0) {
3130 nvgpu_err(g,
3131 "fail to allocate patch buffer");
3132 goto out;
3133 }
3134 }
3135
3136 /* map global buffer to channel gpu_va and commit */
3137 err = g->ops.gr.map_global_ctx_buffers(g, c);
3138 if (err != 0) {
3139 nvgpu_err(g,
3140 "fail to map global ctx buffer");
3141 goto out;
3142 }
3143 g->ops.gr.commit_global_ctx_buffers(g, c, true);
3144
3145 /* commit gr ctx buffer */
3146 err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
3147 if (err != 0) {
3148 nvgpu_err(g,
3149 "fail to commit gr ctx buffer");
3150 goto out;
3151 }
3152
3153 /* init golden image */
3154 err = gr_gk20a_init_golden_ctx_image(g, c);
3155 if (err != 0) {
3156 nvgpu_err(g,
3157 "fail to init golden ctx image");
3158 goto out;
3159 }
3160
3161 /* Re-enable ELPG now that golden image has been initialized.
3162 * The PMU PG init code may already have tried to enable elpg, but
3163 * would not have been able to complete this action since the golden
3164 * image hadn't been initialized yet, so do this now.
3165 */
3166 err = nvgpu_pmu_reenable_elpg(g);
3167 if (err != 0) {
3168 nvgpu_err(g, "fail to re-enable elpg");
3169 goto out;
3170 }
3171
3172 /* load golden image */
3173		err = gr_gk20a_load_golden_ctx_image(g, c);
3174 if (err != 0) {
3175 nvgpu_err(g,
3176 "fail to load golden ctx image");
3177 goto out;
3178 }
3179#ifdef CONFIG_GK20A_CTXSW_TRACE
3180 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3181 err = g->ops.fecs_trace.bind_channel(g, c);
3182 if (err != 0) {
3183 nvgpu_warn(g,
3184 "fail to bind channel for ctxsw trace");
3185 }
3186 }
3187#endif
3188
3189 if (g->ops.gr.set_czf_bypass) {
3190 g->ops.gr.set_czf_bypass(g, c);
3191 }
3192
3193 /* PM ctxt switch is off by default */
3194 gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
3195 } else {
3196 /* commit gr ctx buffer */
3197 err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
3198 if (err != 0) {
3199 nvgpu_err(g,
3200 "fail to commit gr ctx buffer");
3201 goto out;
3202 }
3203#ifdef CONFIG_GK20A_CTXSW_TRACE
3204 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3205 err = g->ops.fecs_trace.bind_channel(g, c);
3206 if (err != 0) {
3207 nvgpu_warn(g,
3208 "fail to bind channel for ctxsw trace");
3209 }
3210 }
3211#endif
3212 }
3213
3214 nvgpu_log_fn(g, "done");
3215 return 0;
3216out:
3217	/* 1. The gr_ctx, patch_ctx and global ctx buffer mappings
3218	 *    can be reused, so there is no need to release them.
3219	 * 2. Golden image init and load are a one-time thing, so if
3220	 *    they passed there is no need to undo them. */
3221 nvgpu_err(g, "fail");
3222 return err;
3223}
3224
3225static void gk20a_remove_gr_support(struct gr_gk20a *gr)
3226{
3227 struct gk20a *g = gr->g;
3228
3229 nvgpu_log_fn(g, " ");
3230
3231 gr_gk20a_free_cyclestats_snapshot_data(g);
3232
3233 gr_gk20a_free_global_ctx_buffers(g);
3234
3235 nvgpu_dma_free(g, &gr->compbit_store.mem);
3236
3237 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
3238
3239 nvgpu_kfree(g, gr->gpc_tpc_count);
3240 nvgpu_kfree(g, gr->gpc_zcb_count);
3241 nvgpu_kfree(g, gr->gpc_ppc_count);
3242 nvgpu_kfree(g, gr->pes_tpc_count[0]);
3243 nvgpu_kfree(g, gr->pes_tpc_count[1]);
3244 nvgpu_kfree(g, gr->pes_tpc_mask[0]);
3245 nvgpu_kfree(g, gr->pes_tpc_mask[1]);
3246 nvgpu_kfree(g, gr->sm_to_cluster);
3247 nvgpu_kfree(g, gr->gpc_skip_mask);
3248 nvgpu_kfree(g, gr->map_tiles);
3249 nvgpu_kfree(g, gr->fbp_rop_l2_en_mask);
3250 gr->gpc_tpc_count = NULL;
3251 gr->gpc_zcb_count = NULL;
3252 gr->gpc_ppc_count = NULL;
3253 gr->pes_tpc_count[0] = NULL;
3254 gr->pes_tpc_count[1] = NULL;
3255 gr->pes_tpc_mask[0] = NULL;
3256 gr->pes_tpc_mask[1] = NULL;
3257 gr->gpc_skip_mask = NULL;
3258 gr->map_tiles = NULL;
3259 gr->fbp_rop_l2_en_mask = NULL;
3260
3261 gr->ctx_vars.valid = false;
3262 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.inst.l);
3263 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.data.l);
3264 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.inst.l);
3265 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.data.l);
3266 nvgpu_kfree(g, gr->ctx_vars.sw_bundle_init.l);
3267 nvgpu_kfree(g, gr->ctx_vars.sw_veid_bundle_init.l);
3268 nvgpu_kfree(g, gr->ctx_vars.sw_method_init.l);
3269 nvgpu_kfree(g, gr->ctx_vars.sw_ctx_load.l);
3270 nvgpu_kfree(g, gr->ctx_vars.sw_non_ctx_load.l);
3271 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.sys.l);
3272 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc.l);
3273 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.tpc.l);
3274 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
3275 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.ppc.l);
3276 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_sys.l);
3277 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_gpc.l);
3278 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_tpc.l);
3279 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ppc.l);
3280 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_sys.l);
3281 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp.l);
3282 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_gpc.l);
3283 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp_router.l);
3284 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc_router.l);
3285 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ltc.l);
3286 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_fbpa.l);
3287 nvgpu_kfree(g, gr->ctx_vars.sw_bundle64_init.l);
3288 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_cau.l);
3289
3290 nvgpu_vfree(g, gr->ctx_vars.local_golden_image);
3291 gr->ctx_vars.local_golden_image = NULL;
3292
3293 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) {
3294 nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map);
3295 }
3296 gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL;
3297
3298 gk20a_comptag_allocator_destroy(g, &gr->comp_tags);
3299
3300 nvgpu_ecc_remove_support(g);
3301}
3302
3303static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
3304{
3305 u32 gpc_index, pes_index;
3306 u32 pes_tpc_mask;
3307 u32 pes_tpc_count;
3308 u32 pes_heavy_index;
3309 u32 gpc_new_skip_mask;
3310 u32 tmp;
3311 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
3312 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
3313
3314 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
3315 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
3316
3317 tmp = gk20a_readl(g, top_num_gpcs_r());
3318 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
3319
3320 tmp = gk20a_readl(g, top_num_fbps_r());
3321 gr->max_fbps_count = top_num_fbps_value_v(tmp);
3322
3323 gr->fbp_en_mask = g->ops.gr.get_fbp_en_mask(g);
3324
3325 if (gr->fbp_rop_l2_en_mask == NULL) {
3326 gr->fbp_rop_l2_en_mask =
3327 nvgpu_kzalloc(g, gr->max_fbps_count * sizeof(u32));
3328 if (gr->fbp_rop_l2_en_mask == NULL) {
3329 goto clean_up;
3330 }
3331 } else {
3332 memset(gr->fbp_rop_l2_en_mask, 0, gr->max_fbps_count *
3333 sizeof(u32));
3334 }
3335
3336 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
3337 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
3338
3339 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
3340
3341 tmp = gk20a_readl(g, top_num_fbps_r());
3342 gr->sys_count = top_num_fbps_value_v(tmp);
3343
3344 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
3345 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
3346
3347 gr->pe_count_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
3348 if (WARN(gr->pe_count_per_gpc > GK20A_GR_MAX_PES_PER_GPC,
3349 "too many pes per gpc\n")) {
3350 goto clean_up;
3351 }
3352
3353 gr->max_zcull_per_gpc_count = nvgpu_get_litter_value(g, GPU_LIT_NUM_ZCULL_BANKS);
3354
3355 if (gr->gpc_count == 0U) {
3356 nvgpu_err(g, "gpc_count==0!");
3357 goto clean_up;
3358 }
3359
3360 if (gr->gpc_tpc_count == NULL) {
3361 gr->gpc_tpc_count = nvgpu_kzalloc(g, gr->gpc_count *
3362 sizeof(u32));
3363 } else {
3364 memset(gr->gpc_tpc_count, 0, gr->gpc_count *
3365 sizeof(u32));
3366 }
3367
3368 if (gr->gpc_tpc_mask == NULL) {
3369 gr->gpc_tpc_mask = nvgpu_kzalloc(g, gr->max_gpc_count *
3370 sizeof(u32));
3371 } else {
3372 memset(gr->gpc_tpc_mask, 0, gr->max_gpc_count *
3373 sizeof(u32));
3374 }
3375
3376 if (gr->gpc_zcb_count == NULL) {
3377 gr->gpc_zcb_count = nvgpu_kzalloc(g, gr->gpc_count *
3378 sizeof(u32));
3379 } else {
3380 memset(gr->gpc_zcb_count, 0, gr->gpc_count *
3381 sizeof(u32));
3382 }
3383
3384 if (gr->gpc_ppc_count == NULL) {
3385 gr->gpc_ppc_count = nvgpu_kzalloc(g, gr->gpc_count *
3386 sizeof(u32));
3387 } else {
3388 memset(gr->gpc_ppc_count, 0, gr->gpc_count *
3389 sizeof(u32));
3390 }
3391
3392 if (gr->gpc_skip_mask == NULL) {
3393 gr->gpc_skip_mask =
3394 nvgpu_kzalloc(g, gr_pd_dist_skip_table__size_1_v() *
3395 4 * sizeof(u32));
3396 } else {
3397 memset(gr->gpc_skip_mask, 0, gr_pd_dist_skip_table__size_1_v() *
3398 4 * sizeof(u32));
3399 }
3400
3401 if ((gr->gpc_tpc_count == NULL) || (gr->gpc_tpc_mask == NULL) ||
3402 (gr->gpc_zcb_count == NULL) || (gr->gpc_ppc_count == NULL) ||
3403 (gr->gpc_skip_mask == NULL)) {
3404 goto clean_up;
3405 }
3406
3407 for (gpc_index = 0; gpc_index < gr->max_gpc_count; gpc_index++) {
3408 if (g->ops.gr.get_gpc_tpc_mask) {
3409 gr->gpc_tpc_mask[gpc_index] =
3410 g->ops.gr.get_gpc_tpc_mask(g, gpc_index);
3411 }
3412 }
3413
3414 gr->ppc_count = 0;
3415 gr->tpc_count = 0;
3416 gr->zcb_count = 0;
3417 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3418 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r() +
3419 gpc_stride * gpc_index);
3420
3421 gr->gpc_tpc_count[gpc_index] =
3422 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3423 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3424
3425 gr->gpc_zcb_count[gpc_index] =
3426 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3427 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3428
3429 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3430 if (gr->pes_tpc_count[pes_index] == NULL) {
3431 gr->pes_tpc_count[pes_index] =
3432 nvgpu_kzalloc(g, gr->gpc_count *
3433 sizeof(u32));
3434 gr->pes_tpc_mask[pes_index] =
3435 nvgpu_kzalloc(g, gr->gpc_count *
3436 sizeof(u32));
3437 if ((gr->pes_tpc_count[pes_index] == NULL) ||
3438 (gr->pes_tpc_mask[pes_index] == NULL)) {
3439 goto clean_up;
3440 }
3441 }
3442
3443 tmp = gk20a_readl(g,
3444 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3445 gpc_index * gpc_stride);
3446
3447 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3448 pes_tpc_count = count_bits(pes_tpc_mask);
3449
3450 /* detect PES presence by seeing if there are
3451 * TPCs connected to it.
3452 */
3453 if (pes_tpc_count != 0) {
3454 gr->gpc_ppc_count[gpc_index]++;
3455 }
3456
3457 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3458 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3459 }
3460
3461 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3462
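		/*
		 * For a GPC whose 5 TPCs (or unevenly split 4 TPCs) are spread
		 * unevenly across its two PESs, set the skip mask to the
		 * lowest-numbered TPC of the heavier PES: mask ^ (mask &
		 * (mask - 1)) isolates its lowest set bit. This presumably
		 * keeps work distribution between the PES units balanced.
		 */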
3463 gpc_new_skip_mask = 0;
3464 if (gr->pe_count_per_gpc > 1 &&
3465 gr->pes_tpc_count[0][gpc_index] +
3466 gr->pes_tpc_count[1][gpc_index] == 5) {
3467 pes_heavy_index =
3468 gr->pes_tpc_count[0][gpc_index] >
3469 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3470
3471 gpc_new_skip_mask =
3472 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3473 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3474 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3475
3476 } else if (gr->pe_count_per_gpc > 1 &&
3477 (gr->pes_tpc_count[0][gpc_index] +
3478 gr->pes_tpc_count[1][gpc_index] == 4) &&
3479 (gr->pes_tpc_count[0][gpc_index] !=
3480 gr->pes_tpc_count[1][gpc_index])) {
3481 pes_heavy_index =
3482 gr->pes_tpc_count[0][gpc_index] >
3483 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3484
3485 gpc_new_skip_mask =
3486 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3487 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3488 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3489 }
3490 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3491 }
3492
3493 /* allocate for max tpc per gpc */
3494 if (gr->sm_to_cluster == NULL) {
3495 gr->sm_to_cluster = nvgpu_kzalloc(g, gr->gpc_count *
3496 gr->max_tpc_per_gpc_count *
3497 sm_per_tpc * sizeof(struct sm_info));
3498 if (!gr->sm_to_cluster)
3499 goto clean_up;
3500 } else {
3501 memset(gr->sm_to_cluster, 0, gr->gpc_count *
3502 gr->max_tpc_per_gpc_count *
3503 sm_per_tpc * sizeof(struct sm_info));
3504 }
3505 gr->no_of_sm = 0;
3506
3507 nvgpu_log_info(g, "fbps: %d", gr->num_fbps);
3508 nvgpu_log_info(g, "max_gpc_count: %d", gr->max_gpc_count);
3509 nvgpu_log_info(g, "max_fbps_count: %d", gr->max_fbps_count);
3510 nvgpu_log_info(g, "max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3511 nvgpu_log_info(g, "max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3512 nvgpu_log_info(g, "max_tpc_count: %d", gr->max_tpc_count);
3513 nvgpu_log_info(g, "sys_count: %d", gr->sys_count);
3514 nvgpu_log_info(g, "gpc_count: %d", gr->gpc_count);
3515 nvgpu_log_info(g, "pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3516 nvgpu_log_info(g, "tpc_count: %d", gr->tpc_count);
3517 nvgpu_log_info(g, "ppc_count: %d", gr->ppc_count);
3518
3519 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3520 nvgpu_log_info(g, "gpc_tpc_count[%d] : %d",
3521 gpc_index, gr->gpc_tpc_count[gpc_index]);
3522 }
3523 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3524 nvgpu_log_info(g, "gpc_zcb_count[%d] : %d",
3525 gpc_index, gr->gpc_zcb_count[gpc_index]);
3526 }
3527 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3528 nvgpu_log_info(g, "gpc_ppc_count[%d] : %d",
3529 gpc_index, gr->gpc_ppc_count[gpc_index]);
3530 }
3531 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3532 nvgpu_log_info(g, "gpc_skip_mask[%d] : %d",
3533 gpc_index, gr->gpc_skip_mask[gpc_index]);
3534 }
3535 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3536 for (pes_index = 0;
3537 pes_index < gr->pe_count_per_gpc;
3538 pes_index++) {
3539 nvgpu_log_info(g, "pes_tpc_count[%d][%d] : %d",
3540 pes_index, gpc_index,
3541 gr->pes_tpc_count[pes_index][gpc_index]);
3542 }
3543 }
3544
3545 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3546 for (pes_index = 0;
3547 pes_index < gr->pe_count_per_gpc;
3548 pes_index++) {
3549 nvgpu_log_info(g, "pes_tpc_mask[%d][%d] : %d",
3550 pes_index, gpc_index,
3551 gr->pes_tpc_mask[pes_index][gpc_index]);
3552 }
3553 }
3554
3555 g->ops.gr.bundle_cb_defaults(g);
3556 g->ops.gr.cb_size_default(g);
3557 g->ops.gr.calc_global_ctx_buffer_size(g);
3558 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3559
3560 nvgpu_log_info(g, "bundle_cb_default_size: %d",
3561 gr->bundle_cb_default_size);
3562 nvgpu_log_info(g, "min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3563 nvgpu_log_info(g, "bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3564 nvgpu_log_info(g, "attrib_cb_default_size: %d",
3565 gr->attrib_cb_default_size);
3566 nvgpu_log_info(g, "attrib_cb_size: %d", gr->attrib_cb_size);
3567 nvgpu_log_info(g, "alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3568 nvgpu_log_info(g, "alpha_cb_size: %d", gr->alpha_cb_size);
3569 nvgpu_log_info(g, "timeslice_mode: %d", gr->timeslice_mode);
3570
3571 return 0;
3572
3573clean_up:
3574 return -ENOMEM;
3575}
3576
3577static u32 prime_set[18] = {
3578 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3579
3580static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3581{
3582 s32 comm_denom;
3583 s32 mul_factor;
3584 s32 *init_frac = NULL;
3585 s32 *init_err = NULL;
3586 s32 *run_err = NULL;
3587 s32 *sorted_num_tpcs = NULL;
3588 s32 *sorted_to_unsorted_gpc_map = NULL;
3589 u32 gpc_index;
3590 u32 gpc_mark = 0;
3591 u32 num_tpc;
3592 u32 max_tpc_count = 0;
3593 u32 swap;
3594 u32 tile_count;
3595 u32 index;
3596 bool delete_map = false;
3597 bool gpc_sorted;
3598 int ret = 0;
3599 int num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
3600 int num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC);
3601 int map_tile_count = num_gpcs * num_tpc_per_gpc;
3602
3603 init_frac = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3604 init_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3605 run_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3606 sorted_num_tpcs =
3607 nvgpu_kzalloc(g, num_gpcs * num_tpc_per_gpc * sizeof(s32));
3608 sorted_to_unsorted_gpc_map =
3609 nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3610
3611 if (!((init_frac != NULL) &&
3612 (init_err != NULL) &&
3613 (run_err != NULL) &&
3614 (sorted_num_tpcs != NULL) &&
3615 (sorted_to_unsorted_gpc_map != NULL))) {
3616 ret = -ENOMEM;
3617 goto clean_up;
3618 }
3619
3620 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3621
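	/*
	 * Choose a screen-tile row offset. For larger TPC counts, take the
	 * smallest prime (from 3 up) that does not divide tpc_count,
	 * presumably so successive rows cycle through all tiles; the switch
	 * below then overrides the value for a few specific TPC counts.
	 */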
3622 if (gr->tpc_count == 3) {
3623 gr->map_row_offset = 2;
3624 } else if (gr->tpc_count < 3) {
3625 gr->map_row_offset = 1;
3626 } else {
3627 gr->map_row_offset = 3;
3628
3629 for (index = 1; index < 18; index++) {
3630 u32 prime = prime_set[index];
3631 if ((gr->tpc_count % prime) != 0) {
3632 gr->map_row_offset = prime;
3633 break;
3634 }
3635 }
3636 }
3637
3638 switch (gr->tpc_count) {
3639 case 15:
3640 gr->map_row_offset = 6;
3641 break;
3642 case 14:
3643 gr->map_row_offset = 5;
3644 break;
3645 case 13:
3646 gr->map_row_offset = 2;
3647 break;
3648 case 11:
3649 gr->map_row_offset = 7;
3650 break;
3651 case 10:
3652 gr->map_row_offset = 6;
3653 break;
3654 case 7:
3655 case 5:
3656 gr->map_row_offset = 1;
3657 break;
3658 default:
3659 break;
3660 }
3661
3662 if (gr->map_tiles) {
3663 if (gr->map_tile_count != gr->tpc_count) {
3664 delete_map = true;
3665 }
3666
3667 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3668 if (gr_gk20a_get_map_tile_count(gr, tile_count)
3669 >= gr->tpc_count) {
3670 delete_map = true;
3671 }
3672 }
3673
3674 if (delete_map) {
3675 nvgpu_kfree(g, gr->map_tiles);
3676 gr->map_tiles = NULL;
3677 gr->map_tile_count = 0;
3678 }
3679 }
3680
3681 if (gr->map_tiles == NULL) {
3682 gr->map_tiles = nvgpu_kzalloc(g, map_tile_count * sizeof(u8));
3683 if (gr->map_tiles == NULL) {
3684 ret = -ENOMEM;
3685 goto clean_up;
3686 }
3687 gr->map_tile_count = map_tile_count;
3688
3689 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3690 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3691 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3692 }
3693
3694 gpc_sorted = false;
3695 while (!gpc_sorted) {
3696 gpc_sorted = true;
3697 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3698 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3699 gpc_sorted = false;
3700 swap = sorted_num_tpcs[gpc_index];
3701 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3702 sorted_num_tpcs[gpc_index + 1] = swap;
3703 swap = sorted_to_unsorted_gpc_map[gpc_index];
3704 sorted_to_unsorted_gpc_map[gpc_index] =
3705 sorted_to_unsorted_gpc_map[gpc_index + 1];
3706 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3707 }
3708 }
3709 }
3710
3711 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3712 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count) {
3713 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3714 }
3715 }
3716
3717 mul_factor = gr->gpc_count * max_tpc_count;
3718 if (mul_factor & 0x1) {
3719 mul_factor = 2;
3720 } else {
3721 mul_factor = 1;
3722 }
3723
3724 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3725
3726 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3727 num_tpc = sorted_num_tpcs[gpc_index];
3728
3729 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3730
3731 if (num_tpc != 0) {
3732 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3733 } else {
3734 init_err[gpc_index] = 0;
3735 }
3736
3737 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3738 }
3739
3740 while (gpc_mark < gr->tpc_count) {
3741 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3742 if ((run_err[gpc_index] * 2) >= comm_denom) {
3743 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3744 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3745 } else {
3746 run_err[gpc_index] += init_frac[gpc_index];
3747 }
3748 }
3749 }
3750 }
3751
3752clean_up:
3753 nvgpu_kfree(g, init_frac);
3754 nvgpu_kfree(g, init_err);
3755 nvgpu_kfree(g, run_err);
3756 nvgpu_kfree(g, sorted_num_tpcs);
3757 nvgpu_kfree(g, sorted_to_unsorted_gpc_map);
3758
3759 if (ret) {
3760 nvgpu_err(g, "fail");
3761 } else {
3762 nvgpu_log_fn(g, "done");
3763 }
3764
3765 return ret;
3766}
3767
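/*
 * Derive the zcull aliquot geometry from the TPC count: each aliquot is a
 * 16-pixel-high strip whose width scales with the number of TPCs, and the
 * total number of aliquots is read back from GPC0's zcull RAM size register.
 * The pixel_squares_by_aliquots term assumes a uniform TPC distribution
 * across GPCs (see the floorsweeping note below).
 */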
3768static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3769{
3770 struct gr_zcull_gk20a *zcull = &gr->zcull;
3771
3772 zcull->aliquot_width = gr->tpc_count * 16;
3773 zcull->aliquot_height = 16;
3774
3775 zcull->width_align_pixels = gr->tpc_count * 16;
3776 zcull->height_align_pixels = 32;
3777
3778 zcull->aliquot_size =
3779 zcull->aliquot_width * zcull->aliquot_height;
3780
3781 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3782 zcull->pixel_squares_by_aliquots =
3783 gr->zcb_count * 16 * 16 * gr->tpc_count /
3784 (gr->gpc_count * gr->gpc_tpc_count[0]);
3785
3786 zcull->total_aliquots =
3787 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3788 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3789
3790 return 0;
3791}
3792
3793u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3794{
3795 /* assuming gr has already been initialized */
3796 return gr->ctx_vars.zcull_ctxsw_image_size;
3797}
3798
3799int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3800 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3801{
3802 struct tsg_gk20a *tsg;
3803 struct zcull_ctx_desc *zcull_ctx;
3804
3805 tsg = tsg_gk20a_from_ch(c);
3806 if (tsg == NULL) {
3807 return -EINVAL;
3808 }
3809
3810 zcull_ctx = &tsg->gr_ctx.zcull_ctx;
3811 zcull_ctx->ctx_sw_mode = mode;
3812 zcull_ctx->gpu_va = zcull_va;
3813
3814 /* TBD: don't disable channel in sw method processing */
3815 return gr_gk20a_ctx_zcull_setup(g, c);
3816}
3817
3818int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3819 struct gr_zcull_info *zcull_params)
3820{
3821 struct gr_zcull_gk20a *zcull = &gr->zcull;
3822
3823 zcull_params->width_align_pixels = zcull->width_align_pixels;
3824 zcull_params->height_align_pixels = zcull->height_align_pixels;
3825 zcull_params->pixel_squares_by_aliquots =
3826 zcull->pixel_squares_by_aliquots;
3827 zcull_params->aliquot_total = zcull->total_aliquots;
3828
3829 zcull_params->region_byte_multiplier =
3830 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3831 zcull_params->region_header_size =
3832 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3833 gr_zcull_save_restore_header_bytes_per_gpc_v();
3834
3835 zcull_params->subregion_header_size =
3836 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3837 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3838
3839 zcull_params->subregion_width_align_pixels =
3840 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3841 zcull_params->subregion_height_align_pixels =
3842 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3843 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3844
3845 return 0;
3846}
3847
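/*
 * A ZBC (zero-bandwidth clear) color entry has to be kept in sync in three
 * places: the L2 clear table (via the ltc HAL), the DS unit table in GR (the
 * gr_ds_zbc_* writes plus the table-load trigger below), and the driver's
 * shadow copy in gr->zbc_col_tbl, which backs queries and the table reload
 * done by gr_gk20a_load_zbc_table() after a GR reset.
 */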
3848int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3849 struct zbc_entry *color_val, u32 index)
3850{
3851 u32 i;
3852
3853 /* update l2 table */
3854 g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3855
3856 /* update ds table */
3857 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3858 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3859 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3860 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3861 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3862 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3863 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3864 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3865
3866 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3867 gr_ds_zbc_color_fmt_val_f(color_val->format));
3868
3869 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3870 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3871
3872 /* trigger the write */
3873 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3874 gr_ds_zbc_tbl_ld_select_c_f() |
3875 gr_ds_zbc_tbl_ld_action_write_f() |
3876 gr_ds_zbc_tbl_ld_trigger_active_f());
3877
3878 /* update local copy */
3879 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3880 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3881 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3882 }
3883 gr->zbc_col_tbl[index].format = color_val->format;
3884 gr->zbc_col_tbl[index].ref_cnt++;
3885
3886 return 0;
3887}
3888
3889int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3890 struct zbc_entry *depth_val, u32 index)
3891{
3892 /* update l2 table */
3893 g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3894
3895 /* update ds table */
3896 gk20a_writel(g, gr_ds_zbc_z_r(),
3897 gr_ds_zbc_z_val_f(depth_val->depth));
3898
3899 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3900 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3901
3902 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3903 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3904
3905 /* trigger the write */
3906 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3907 gr_ds_zbc_tbl_ld_select_z_f() |
3908 gr_ds_zbc_tbl_ld_action_write_f() |
3909 gr_ds_zbc_tbl_ld_trigger_active_f());
3910
3911 /* update local copy */
3912 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3913 gr->zbc_dep_tbl[index].format = depth_val->format;
3914 gr->zbc_dep_tbl[index].ref_cnt++;
3915
3916 return 0;
3917}
3918
3919void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3920{
3921 struct fifo_gk20a *f = &g->fifo;
3922 struct fifo_engine_info_gk20a *gr_info = NULL;
3923 u32 ret;
3924 u32 engine_id;
3925
3926 engine_id = gk20a_fifo_get_gr_engine_id(g);
3927 gr_info = (f->engine_info + engine_id);
3928
3929 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3930 if (ret) {
3931 nvgpu_err(g,
3932 "failed to disable gr engine activity");
3933 return;
3934 }
3935
3936 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
3937 GR_IDLE_CHECK_DEFAULT);
3938 if (ret) {
3939 nvgpu_err(g,
3940 "failed to idle graphics");
3941 goto clean_up;
3942 }
3943
3944 /* update zbc */
3945 g->ops.gr.pmu_save_zbc(g, entries);
3946
3947clean_up:
3948 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3949 if (ret) {
3950 nvgpu_err(g,
3951 "failed to enable gr engine activity");
3952 }
3953}
3954
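/*
 * Add a ZBC entry under gr->zbc_lock. Existing color/depth entries are
 * matched first and only have their ref_cnt bumped; otherwise the value is
 * programmed into the next free slot through the per-chip add_zbc_color /
 * add_zbc_depth hooks. Only when a genuinely new entry has been written is
 * the PMU copy refreshed (pmu_save_zbc), so the table is preserved across
 * ELPG cycles.
 */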
3955int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3956 struct zbc_entry *zbc_val)
3957{
3958 struct zbc_color_table *c_tbl;
3959 struct zbc_depth_table *d_tbl;
3960 u32 i;
3961 int ret = -ENOSPC;
3962 bool added = false;
3963 u32 entries;
3964
3965 /* no endian swap ? */
3966
3967 nvgpu_mutex_acquire(&gr->zbc_lock);
3968 nvgpu_speculation_barrier();
3969 switch (zbc_val->type) {
3970 case GK20A_ZBC_TYPE_COLOR:
3971 /* search existing tables */
3972 for (i = 0; i < gr->max_used_color_index; i++) {
3973
3974 c_tbl = &gr->zbc_col_tbl[i];
3975
3976 if ((c_tbl->ref_cnt != 0U) &&
3977 (c_tbl->format == zbc_val->format) &&
3978 (memcmp(c_tbl->color_ds, zbc_val->color_ds,
3979 sizeof(zbc_val->color_ds)) == 0) &&
3980 (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3981 sizeof(zbc_val->color_l2)) == 0)) {
3982
3983 added = true;
3984 c_tbl->ref_cnt++;
3985 ret = 0;
3986 break;
3987 }
3988 }
3989 /* add new table */
3990 if (!added &&
3991 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3992
3993 c_tbl =
3994 &gr->zbc_col_tbl[gr->max_used_color_index];
3995 WARN_ON(c_tbl->ref_cnt != 0);
3996
3997 ret = g->ops.gr.add_zbc_color(g, gr,
3998 zbc_val, gr->max_used_color_index);
3999
4000 if (ret == 0) {
4001 gr->max_used_color_index++;
4002 }
4003 }
4004 break;
4005 case GK20A_ZBC_TYPE_DEPTH:
4006 /* search existing tables */
4007 for (i = 0; i < gr->max_used_depth_index; i++) {
4008
4009 d_tbl = &gr->zbc_dep_tbl[i];
4010
4011 if ((d_tbl->ref_cnt != 0U) &&
4012 (d_tbl->depth == zbc_val->depth) &&
4013 (d_tbl->format == zbc_val->format)) {
4014 added = true;
4015 d_tbl->ref_cnt++;
4016 ret = 0;
4017 break;
4018 }
4019 }
4020 /* add new table */
4021 if (!added &&
4022 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
4023
4024 d_tbl =
4025 &gr->zbc_dep_tbl[gr->max_used_depth_index];
4026 WARN_ON(d_tbl->ref_cnt != 0);
4027
4028 ret = g->ops.gr.add_zbc_depth(g, gr,
4029 zbc_val, gr->max_used_depth_index);
4030
4031 if (ret == 0) {
4032 gr->max_used_depth_index++;
4033 }
4034 }
4035 break;
4036 case T19X_ZBC:
4037 if (g->ops.gr.add_zbc_type_s) {
4038 added = g->ops.gr.add_zbc_type_s(g, gr, zbc_val, &ret);
4039 } else {
4040 nvgpu_err(g,
4041 "invalid zbc table type %d", zbc_val->type);
4042 ret = -EINVAL;
4043 goto err_mutex;
4044 }
4045 break;
4046 default:
4047 nvgpu_err(g,
4048 "invalid zbc table type %d", zbc_val->type);
4049 ret = -EINVAL;
4050 goto err_mutex;
4051 }
4052
4053 if (!added && ret == 0) {
4054 /* update zbc for elpg only when new entry is added */
4055 entries = max(gr->max_used_color_index,
4056 gr->max_used_depth_index);
4057 g->ops.gr.pmu_save_zbc(g, entries);
4058 }
4059
4060err_mutex:
4061 nvgpu_mutex_release(&gr->zbc_lock);
4062 return ret;
4063}
4064
4065/* get a zbc table entry specified by index
4066 * return table size when type is invalid */
4067int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
4068 struct zbc_query_params *query_params)
4069{
4070 u32 index = query_params->index_size;
4071 u32 i;
4072
4073 nvgpu_speculation_barrier();
4074 switch (query_params->type) {
4075 case GK20A_ZBC_TYPE_INVALID:
4076 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
4077 break;
4078 case GK20A_ZBC_TYPE_COLOR:
4079 if (index >= GK20A_ZBC_TABLE_SIZE) {
4080 nvgpu_err(g,
4081 "invalid zbc color table index");
4082 return -EINVAL;
4083 }
4084
4085 nvgpu_speculation_barrier();
4086 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4087 query_params->color_l2[i] =
4088 gr->zbc_col_tbl[index].color_l2[i];
4089 query_params->color_ds[i] =
4090 gr->zbc_col_tbl[index].color_ds[i];
4091 }
4092 query_params->format = gr->zbc_col_tbl[index].format;
4093 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
4094 break;
4095 case GK20A_ZBC_TYPE_DEPTH:
4096 if (index >= GK20A_ZBC_TABLE_SIZE) {
4097 nvgpu_err(g,
4098 "invalid zbc depth table index");
4099 return -EINVAL;
4100 }
4101
4102 nvgpu_speculation_barrier();
4103 query_params->depth = gr->zbc_dep_tbl[index].depth;
4104 query_params->format = gr->zbc_dep_tbl[index].format;
4105 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
4106 break;
4107 case T19X_ZBC:
4108 if (g->ops.gr.zbc_s_query_table) {
4109 return g->ops.gr.zbc_s_query_table(g, gr,
4110 query_params);
4111 } else {
4112 nvgpu_err(g,
4113 "invalid zbc table type");
4114 return -EINVAL;
4115 }
4116 break;
4117 default:
4118 nvgpu_err(g,
4119 "invalid zbc table type");
4120 return -EINVAL;
4121 }
4122
4123 return 0;
4124}
4125
4126static int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
4127{
4128 unsigned int i;
4129 int ret;
4130
4131 for (i = 0; i < gr->max_used_color_index; i++) {
4132 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
4133 struct zbc_entry zbc_val;
4134
4135 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
4136 memcpy(zbc_val.color_ds,
4137 c_tbl->color_ds, sizeof(zbc_val.color_ds));
4138 memcpy(zbc_val.color_l2,
4139 c_tbl->color_l2, sizeof(zbc_val.color_l2));
4140 zbc_val.format = c_tbl->format;
4141
4142 ret = g->ops.gr.add_zbc_color(g, gr, &zbc_val, i);
4143
4144 if (ret) {
4145 return ret;
4146 }
4147 }
4148 for (i = 0; i < gr->max_used_depth_index; i++) {
4149 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
4150 struct zbc_entry zbc_val;
4151
4152 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4153 zbc_val.depth = d_tbl->depth;
4154 zbc_val.format = d_tbl->format;
4155
4156 ret = g->ops.gr.add_zbc_depth(g, gr, &zbc_val, i);
4157 if (ret) {
4158 return ret;
4159 }
4160 }
4161
4162 if (g->ops.gr.load_zbc_s_tbl) {
4163 ret = g->ops.gr.load_zbc_s_tbl(g, gr);
4164 if (ret) {
4165 return ret;
4166 }
4167 }
4168
4169 return 0;
4170}
4171
4172int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
4173{
4174 struct zbc_entry zbc_val;
4175 u32 i = 0;
4176 int err = 0;
4177
4178 err = nvgpu_mutex_init(&gr->zbc_lock);
4179 if (err != 0) {
4180 nvgpu_err(g, "Error in zbc_lock mutex initialization");
4181 return err;
4182 }
4183
4184 /* load default color table */
4185 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
4186
4187 /* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
4188 zbc_val.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v();
4189 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4190 zbc_val.color_ds[i] = 0;
4191 zbc_val.color_l2[i] = 0;
4192 }
4193 zbc_val.color_l2[0] = 0xff000000;
4194 zbc_val.color_ds[3] = 0x3f800000;
4195 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4196 if (err != 0) {
4197 goto color_fail;
4198 }
4199
4200 /* Transparent black = (fmt 1 = zero) */
4201 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
4202 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4203 zbc_val.color_ds[i] = 0;
4204 zbc_val.color_l2[i] = 0;
4205 }
4206 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4207 if (err != 0) {
4208 goto color_fail;
4209 }
4210
4211 /* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
4212 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
4213 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4214 zbc_val.color_ds[i] = 0x3f800000;
4215 zbc_val.color_l2[i] = 0xffffffff;
4216 }
4217 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4218 if (err != 0) {
4219 goto color_fail;
4220 }
4221
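	/*
	 * Three default color entries were loaded above (opaque black,
	 * transparent black, opaque white), so color indices 0..2 are
	 * reserved as defaults.
	 */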
4222 gr->max_default_color_index = 3;
4223
4224 /* load default depth table */
4225 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4226
4227 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4228 zbc_val.depth = 0x3f800000;
4229 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4230 if (err != 0) {
4231 goto depth_fail;
4232 }
4233
4234 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4235 zbc_val.depth = 0;
4236 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4237 if (err != 0) {
4238 goto depth_fail;
4239 }
4240
4241 gr->max_default_depth_index = 2;
4242
4243 if (g->ops.gr.load_zbc_s_default_tbl) {
4244 err = g->ops.gr.load_zbc_s_default_tbl(g, gr);
4245 if (err != 0) {
4246 return err;
4247 }
4248 }
4249
4250 return 0;
4251
4252color_fail:
4253 nvgpu_err(g, "fail to load default zbc color table");
4254 return err;
4255depth_fail:
4256 nvgpu_err(g, "fail to load default zbc depth table");
4257 return err;
4258}
4259
4260int _gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4261 struct zbc_entry *zbc_val)
4262{
4263 struct fifo_gk20a *f = &g->fifo;
4264 struct fifo_engine_info_gk20a *gr_info = NULL;
4265 int ret;
4266 u32 engine_id;
4267
4268 engine_id = gk20a_fifo_get_gr_engine_id(g);
4269 gr_info = (f->engine_info + engine_id);
4270
4271 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
4272 if (ret) {
4273 nvgpu_err(g,
4274 "failed to disable gr engine activity");
4275 return ret;
4276 }
4277
4278 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
4279 GR_IDLE_CHECK_DEFAULT);
4280 if (ret) {
4281 nvgpu_err(g,
4282 "failed to idle graphics");
4283 goto clean_up;
4284 }
4285
4286 ret = gr_gk20a_add_zbc(g, gr, zbc_val);
4287
4288clean_up:
4289 if (gk20a_fifo_enable_engine_activity(g, gr_info)) {
4290 nvgpu_err(g,
4291 "failed to enable gr engine activity");
4292 }
4293
4294 return ret;
4295}
4296
4297int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4298 struct zbc_entry *zbc_val)
4299{
4300 nvgpu_log_fn(g, " ");
4301
4302 return gr_gk20a_elpg_protected_call(g,
4303 gr_gk20a_add_zbc(g, gr, zbc_val));
4304}
4305
4306void gr_gk20a_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
4307 u32 *zcull_map_tiles)
4308{
4309 u32 val;
4310
4311 nvgpu_log_fn(g, " ");
4312
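	/*
	 * Each sm_in_gpc_number_mapN register packs eight consecutive tile
	 * entries (tile_0..tile_7, tile_8..tile_15, ...), which is why the
	 * table is written in groups of eight below, guarded by the
	 * >= 8/16/24/32 checks.
	 */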
4313 if (zcull_num_entries >= 8) {
4314 nvgpu_log_fn(g, "map0");
4315 val =
4316 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(
4317 zcull_map_tiles[0]) |
4318 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(
4319 zcull_map_tiles[1]) |
4320 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(
4321 zcull_map_tiles[2]) |
4322 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(
4323 zcull_map_tiles[3]) |
4324 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(
4325 zcull_map_tiles[4]) |
4326 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(
4327 zcull_map_tiles[5]) |
4328 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(
4329 zcull_map_tiles[6]) |
4330 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(
4331 zcull_map_tiles[7]);
4332
4333 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(), val);
4334 }
4335
4336 if (zcull_num_entries >= 16) {
4337 nvgpu_log_fn(g, "map1");
4338 val =
4339 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(
4340 zcull_map_tiles[8]) |
4341 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(
4342 zcull_map_tiles[9]) |
4343 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(
4344 zcull_map_tiles[10]) |
4345 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(
4346 zcull_map_tiles[11]) |
4347 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(
4348 zcull_map_tiles[12]) |
4349 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(
4350 zcull_map_tiles[13]) |
4351 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(
4352 zcull_map_tiles[14]) |
4353 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(
4354 zcull_map_tiles[15]);
4355
4356 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(), val);
4357 }
4358
4359 if (zcull_num_entries >= 24) {
4360 nvgpu_log_fn(g, "map2");
4361 val =
4362 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(
4363 zcull_map_tiles[16]) |
4364 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(
4365 zcull_map_tiles[17]) |
4366 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(
4367 zcull_map_tiles[18]) |
4368 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(
4369 zcull_map_tiles[19]) |
4370 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(
4371 zcull_map_tiles[20]) |
4372 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(
4373 zcull_map_tiles[21]) |
4374 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(
4375 zcull_map_tiles[22]) |
4376 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(
4377 zcull_map_tiles[23]);
4378
4379 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(), val);
4380 }
4381
4382 if (zcull_num_entries >= 32) {
4383 nvgpu_log_fn(g, "map3");
4384 val =
4385 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(
4386 zcull_map_tiles[24]) |
4387 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(
4388 zcull_map_tiles[25]) |
4389 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(
4390 zcull_map_tiles[26]) |
4391 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(
4392 zcull_map_tiles[27]) |
4393 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(
4394 zcull_map_tiles[28]) |
4395 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(
4396 zcull_map_tiles[29]) |
4397 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(
4398 zcull_map_tiles[30]) |
4399 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(
4400 zcull_map_tiles[31]);
4401
4402 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(), val);
4403 }
4404
4405}
4406
4407static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
4408{
4409 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
4410 u32 *zcull_map_tiles, *zcull_bank_counters;
4411 u32 map_counter;
4412 u32 rcp_conserv;
4413 u32 offset;
4414 bool floorsweep = false;
4415 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
4416 u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
4417 u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
4418 GPU_LIT_NUM_TPC_PER_GPC);
4419 u32 zcull_alloc_num = num_gpcs * num_tpc_per_gpc;
4420 u32 map_tile_count;
4421
4422 if (gr->map_tiles == NULL) {
4423 return -1;
4424 }
4425
4426 if (zcull_alloc_num % 8 != 0) {
4427		/* Total 8 fields per map reg i.e. tile_0 to tile_7 */
4428		zcull_alloc_num += 8U - (zcull_alloc_num % 8);
4429 }
4430 zcull_map_tiles = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4431
4432 if (zcull_map_tiles == NULL) {
4433 nvgpu_err(g,
4434			"failed to allocate zcull map tiles");
4435 return -ENOMEM;
4436 }
4437
4438 zcull_bank_counters = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4439
4440 if (zcull_bank_counters == NULL) {
4441 nvgpu_err(g,
4442 "failed to allocate zcull bank counters");
4443 nvgpu_kfree(g, zcull_map_tiles);
4444 return -ENOMEM;
4445 }
4446
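	/*
	 * For every screen tile, look up its owning GPC in gr->map_tiles and
	 * hand it the next unused zcull bank of that GPC: zcull_bank_counters[]
	 * keeps a running per-GPC count, so tiles owned by the same GPC get
	 * bank numbers 0, 1, 2, ... in order.
	 */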
4447 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
4448 map_tile_count = gr_gk20a_get_map_tile_count(gr, map_counter);
4449 zcull_map_tiles[map_counter] =
4450 zcull_bank_counters[map_tile_count];
4451 zcull_bank_counters[map_tile_count]++;
4452 }
4453
4454 if (g->ops.gr.program_zcull_mapping != NULL) {
4455 g->ops.gr.program_zcull_mapping(g, zcull_alloc_num,
4456 zcull_map_tiles);
4457 }
4458
4459 nvgpu_kfree(g, zcull_map_tiles);
4460 nvgpu_kfree(g, zcull_bank_counters);
4461
4462 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4463 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
4464 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
4465
4466 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4467 gpc_zcull_count < gpc_tpc_count) {
4468 nvgpu_err(g,
4469 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
4470 gpc_zcull_count, gpc_tpc_count, gpc_index);
4471 return -EINVAL;
4472 }
4473 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4474 gpc_zcull_count != 0) {
4475 floorsweep = true;
4476 }
4477 }
4478
4479 /* ceil(1.0f / SM_NUM * gr_gpc0_zcull_sm_num_rcp_conservative__max_v()) */
4480 rcp_conserv = DIV_ROUND_UP(gr_gpc0_zcull_sm_num_rcp_conservative__max_v(),
4481 gr->gpc_tpc_count[0]);
4482
4483 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4484 offset = gpc_index * gpc_stride;
4485
4486 if (floorsweep) {
4487 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4488 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4489 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4490 gr->max_zcull_per_gpc_count));
4491 } else {
4492 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4493 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4494 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4495 gr->gpc_tpc_count[gpc_index]));
4496 }
4497
4498 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4499 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4500 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4501
4502 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4503 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4504 }
4505
4506 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4507 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4508
4509 return 0;
4510}
4511
4512void gk20a_gr_enable_exceptions(struct gk20a *g)
4513{
4514 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4515 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4516 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4517 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4518 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4519 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4520}
4521
4522void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4523{
4524 struct gr_gk20a *gr = &g->gr;
4525 u32 tpc_mask;
4526
4527 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(),
4528 gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f() |
4529 gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f());
4530
4531 tpc_mask =
4532 gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->max_tpc_per_gpc_count) - 1);
4533
4534 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
4535}
4536
4537
4538void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4539{
4540 /* enable exceptions */
4541 gk20a_writel(g, gr_fe_hww_esr_r(),
4542 gr_fe_hww_esr_en_enable_f() |
4543 gr_fe_hww_esr_reset_active_f());
4544 gk20a_writel(g, gr_memfmt_hww_esr_r(),
4545 gr_memfmt_hww_esr_en_enable_f() |
4546 gr_memfmt_hww_esr_reset_active_f());
4547}
4548
4549void gr_gk20a_fecs_host_int_enable(struct gk20a *g)
4550{
4551 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4552 gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
4553 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4554 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4555 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4556 gr_fecs_host_int_enable_watchdog_enable_f());
4557}
4558
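/*
 * One-time GR HW programming, run once the ctxsw ucode is up: floorsweeping
 * related registers and zcull, interrupt and exception enables, ZBC table
 * reload, the sw_ctx_load register list, fs_state, and finally the MME shadow
 * method init list, with GR idle waits between the phases.
 */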
4559static int gk20a_init_gr_setup_hw(struct gk20a *g)
4560{
4561 struct gr_gk20a *gr = &g->gr;
4562 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4563 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4564 u32 data;
4565 u32 last_method_data = 0;
4566 u32 i, err;
4567
4568 nvgpu_log_fn(g, " ");
4569
4570 if (g->ops.gr.init_gpc_mmu) {
4571 g->ops.gr.init_gpc_mmu(g);
4572 }
4573
4574 /* load gr floorsweeping registers */
4575 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4576 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4577 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4578 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4579
4580 gr_gk20a_zcull_init_hw(g, gr);
4581
4582 if (g->ops.priv_ring.set_ppriv_timeout_settings != NULL) {
4583 g->ops.priv_ring.set_ppriv_timeout_settings(g);
4584 }
4585
4586 /* enable fifo access */
4587 gk20a_writel(g, gr_gpfifo_ctl_r(),
4588 gr_gpfifo_ctl_access_enabled_f() |
4589 gr_gpfifo_ctl_semaphore_access_enabled_f());
4590
4591 /* TBD: reload gr ucode when needed */
4592
4593 /* enable interrupts */
4594 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4595 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4596
4597 /* enable fecs error interrupts */
4598 g->ops.gr.fecs_host_int_enable(g);
4599
4600 g->ops.gr.enable_hww_exceptions(g);
4601 g->ops.gr.set_hww_esr_report_mask(g);
4602
4603 /* enable TPC exceptions per GPC */
4604 if (g->ops.gr.enable_gpc_exceptions) {
4605 g->ops.gr.enable_gpc_exceptions(g);
4606 }
4607
4608 /* enable ECC for L1/SM */
4609 if (g->ops.gr.ecc_init_scrub_reg) {
4610 g->ops.gr.ecc_init_scrub_reg(g);
4611 }
4612
4613 /* TBD: enable per BE exceptions */
4614
4615 /* reset and enable exceptions */
4616 g->ops.gr.enable_exceptions(g);
4617
4618 gr_gk20a_load_zbc_table(g, gr);
4619
4620 if (g->ops.ltc.init_cbc) {
4621 g->ops.ltc.init_cbc(g, gr);
4622 }
4623
4624 if (g->ops.fb.init_cbc) {
4625 g->ops.fb.init_cbc(g, gr);
4626 }
4627
4628 if (g->ops.gr.disable_rd_coalesce) {
4629 g->ops.gr.disable_rd_coalesce(g);
4630 }
4631
4632 /* load ctx init */
4633 for (i = 0; i < sw_ctx_load->count; i++) {
4634 gk20a_writel(g, sw_ctx_load->l[i].addr,
4635 sw_ctx_load->l[i].value);
4636 }
4637
4638 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4639 GR_IDLE_CHECK_DEFAULT);
4640 if (err != 0U) {
4641 goto out;
4642 }
4643
4644 if (g->ops.gr.init_preemption_state) {
4645 err = g->ops.gr.init_preemption_state(g);
4646 if (err != 0U) {
4647 goto out;
4648 }
4649 }
4650
4651 /* disable fe_go_idle */
4652 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4653 gr_fe_go_idle_timeout_count_disabled_f());
4654
4655 /* override a few ctx state registers */
4656 g->ops.gr.commit_global_timeslice(g, NULL);
4657
4658 /* floorsweep anything left */
4659 err = g->ops.gr.init_fs_state(g);
4660 if (err != 0U) {
4661 goto out;
4662 }
4663
4664 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4665 GR_IDLE_CHECK_DEFAULT);
4666 if (err != 0U) {
4667 goto restore_fe_go_idle;
4668 }
4669
4670restore_fe_go_idle:
4671 /* restore fe_go_idle */
4672 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4673 gr_fe_go_idle_timeout_count_prod_f());
4674
4675 if ((err != 0U) || (gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4676 GR_IDLE_CHECK_DEFAULT) != 0)) {
4677 goto out;
4678 }
4679
4680 /* load method init */
4681 if (sw_method_init->count) {
4682 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4683 sw_method_init->l[0].value);
4684 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4685 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4686 sw_method_init->l[0].addr);
4687 last_method_data = sw_method_init->l[0].value;
4688 }
4689 for (i = 1; i < sw_method_init->count; i++) {
4690 if (sw_method_init->l[i].value != last_method_data) {
4691 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4692 sw_method_init->l[i].value);
4693 last_method_data = sw_method_init->l[i].value;
4694 }
4695 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4696 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4697 sw_method_init->l[i].addr);
4698 }
4699
4700 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4701 GR_IDLE_CHECK_DEFAULT);
4702out:
4703 nvgpu_log_fn(g, "done");
4704 return err;
4705}
4706
4707static int gk20a_init_gr_prepare(struct gk20a *g)
4708{
4709 u32 err = 0;
4710
4711 /* reset gr engine */
4712 g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_GRAPH) |
4713 g->ops.mc.reset_mask(g, NVGPU_UNIT_BLG) |
4714 g->ops.mc.reset_mask(g, NVGPU_UNIT_PERFMON));
4715
4716 nvgpu_cg_init_gr_load_gating_prod(g);
4717
4718 /* Disable elcg until it gets enabled later in the init*/
4719 nvgpu_cg_elcg_disable_no_wait(g);
4720
4721 /* enable fifo access */
4722 gk20a_writel(g, gr_gpfifo_ctl_r(),
4723 gr_gpfifo_ctl_access_enabled_f() |
4724 gr_gpfifo_ctl_semaphore_access_enabled_f());
4725
4726 if (!g->gr.ctx_vars.valid) {
4727 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4728 if (err != 0U) {
4729 nvgpu_err(g,
4730 "fail to load gr init ctx");
4731 }
4732 }
4733 return err;
4734}
4735
4736static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4737{
4738 struct nvgpu_timeout timeout;
4739 bool fecs_scrubbing;
4740 bool gpccs_scrubbing;
4741
4742 nvgpu_log_fn(g, " ");
4743
4744 nvgpu_timeout_init(g, &timeout,
4745 CTXSW_MEM_SCRUBBING_TIMEOUT_MAX /
4746 CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT,
4747 NVGPU_TIMER_RETRY_TIMER);
4748 do {
4749 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4750 (gr_fecs_dmactl_imem_scrubbing_m() |
4751 gr_fecs_dmactl_dmem_scrubbing_m());
4752
4753 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4754 (gr_gpccs_dmactl_imem_scrubbing_m() |
4755 gr_gpccs_dmactl_imem_scrubbing_m());
4756
4757 if (!fecs_scrubbing && !gpccs_scrubbing) {
4758 nvgpu_log_fn(g, "done");
4759 return 0;
4760 }
4761
4762 nvgpu_udelay(CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT);
4763 } while (nvgpu_timeout_expired(&timeout) == 0);
4764
4765 nvgpu_err(g, "Falcon mem scrubbing timeout");
4766 return -ETIMEDOUT;
4767}
4768
4769static int gr_gk20a_init_ctxsw(struct gk20a *g)
4770{
4771 u32 err = 0;
4772
4773 err = g->ops.gr.load_ctxsw_ucode(g);
4774 if (err != 0U) {
4775 goto out;
4776 }
4777
4778 err = gr_gk20a_wait_ctxsw_ready(g);
4779 if (err != 0U) {
4780 goto out;
4781 }
4782
4783out:
4784 if (err != 0U) {
4785 nvgpu_err(g, "fail");
4786 } else {
4787 nvgpu_log_fn(g, "done");
4788 }
4789
4790 return err;
4791}
4792
4793static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4794{
4795 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4796 u32 i, err = 0;
4797
4798 nvgpu_log_fn(g, " ");
4799
4800 /* enable interrupts */
4801 gk20a_writel(g, gr_intr_r(), ~0);
4802 gk20a_writel(g, gr_intr_en_r(), ~0);
4803
4804 /* load non_ctx init */
4805 for (i = 0; i < sw_non_ctx_load->count; i++) {
4806 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4807 sw_non_ctx_load->l[i].value);
4808 }
4809
4810 err = gr_gk20a_wait_mem_scrubbing(g);
4811 if (err != 0U) {
4812 goto out;
4813 }
4814
4815 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4816 GR_IDLE_CHECK_DEFAULT);
4817 if (err != 0U) {
4818 goto out;
4819 }
4820
4821out:
4822 if (err != 0U) {
4823 nvgpu_err(g, "fail");
4824 } else {
4825 nvgpu_log_fn(g, "done");
4826 }
4827
4828	return err;
4829}
4830
4831static int gr_gk20a_init_access_map(struct gk20a *g)
4832{
4833 struct gr_gk20a *gr = &g->gr;
4834 struct nvgpu_mem *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
4835 u32 nr_pages =
4836 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4837 PAGE_SIZE);
4838 u32 *whitelist = NULL;
4839 int w, num_entries = 0;
4840
4841 nvgpu_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages);
4842
4843 g->ops.gr.get_access_map(g, &whitelist, &num_entries);
4844
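	/*
	 * The PRIV access map is a bitmap with one bit per 32-bit register:
	 * bit index = (register offset >> 2). map_byte/map_shift locate that
	 * bit and the read-modify-write below sets it one u32 word at a time.
	 * Worked example (hypothetical offset 0x100): map_bit = 0x40, so
	 * map_byte = 8, map_shift = 0, i.e. bit 0 of u32 word 2 is set.
	 */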
4845 for (w = 0; w < num_entries; w++) {
4846 u32 map_bit, map_byte, map_shift, x;
4847 map_bit = whitelist[w] >> 2;
4848 map_byte = map_bit >> 3;
4849 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4850 nvgpu_log_info(g, "access map addr:0x%x byte:0x%x bit:%d",
4851 whitelist[w], map_byte, map_shift);
4852 x = nvgpu_mem_rd32(g, mem, map_byte / sizeof(u32));
4853 x |= 1 << (
4854 (map_byte % sizeof(u32) * BITS_PER_BYTE)
4855 + map_shift);
4856 nvgpu_mem_wr32(g, mem, map_byte / sizeof(u32), x);
4857 }
4858
4859 return 0;
4860}
4861
4862static int gk20a_init_gr_setup_sw(struct gk20a *g)
4863{
4864 struct gr_gk20a *gr = &g->gr;
4865 int err = 0;
4866
4867 nvgpu_log_fn(g, " ");
4868
4869 if (gr->sw_ready) {
4870 nvgpu_log_fn(g, "skip init");
4871 return 0;
4872 }
4873
4874 gr->g = g;
4875
4876#if defined(CONFIG_GK20A_CYCLE_STATS)
4877 err = nvgpu_mutex_init(&g->gr.cs_lock);
4878 if (err != 0) {
4879 nvgpu_err(g, "Error in gr.cs_lock mutex initialization");
4880 return err;
4881 }
4882#endif
4883
4884 err = gr_gk20a_init_gr_config(g, gr);
4885 if (err != 0) {
4886 goto clean_up;
4887 }
4888
4889 err = gr_gk20a_init_map_tiles(g, gr);
4890 if (err != 0) {
4891 goto clean_up;
4892 }
4893
4894 if (g->ops.ltc.init_comptags) {
4895 err = g->ops.ltc.init_comptags(g, gr);
4896 if (err != 0) {
4897 goto clean_up;
4898 }
4899 }
4900
4901 err = gr_gk20a_init_zcull(g, gr);
4902 if (err != 0) {
4903 goto clean_up;
4904 }
4905
4906 err = g->ops.gr.alloc_global_ctx_buffers(g);
4907 if (err != 0) {
4908 goto clean_up;
4909 }
4910
4911 err = gr_gk20a_init_access_map(g);
4912 if (err != 0) {
4913 goto clean_up;
4914 }
4915
4916 gr_gk20a_load_zbc_default_table(g, gr);
4917
4918 if (g->ops.gr.init_czf_bypass) {
4919 g->ops.gr.init_czf_bypass(g);
4920 }
4921
4922 if (g->ops.gr.init_gfxp_wfi_timeout_count) {
4923 g->ops.gr.init_gfxp_wfi_timeout_count(g);
4924 }
4925
4926 err = nvgpu_mutex_init(&gr->ctx_mutex);
4927 if (err != 0) {
4928 nvgpu_err(g, "Error in gr.ctx_mutex initialization");
4929 goto clean_up;
4930 }
4931
4932 nvgpu_spinlock_init(&gr->ch_tlb_lock);
4933
4934 gr->remove_support = gk20a_remove_gr_support;
4935 gr->sw_ready = true;
4936
4937 err = nvgpu_ecc_init_support(g);
4938 if (err != 0) {
4939 goto clean_up;
4940 }
4941
4942 nvgpu_log_fn(g, "done");
4943 return 0;
4944
4945clean_up:
4946 nvgpu_err(g, "fail");
4947 gk20a_remove_gr_support(gr);
4948 return err;
4949}
4950
4951static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g)
4952{
4953 struct nvgpu_pmu *pmu = &g->pmu;
4954 struct mm_gk20a *mm = &g->mm;
4955 struct vm_gk20a *vm = mm->pmu.vm;
4956 int err = 0;
4957
4958 u32 size;
4959
4960 nvgpu_log_fn(g, " ");
4961
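	/*
	 * Ask FECS how large its power-gating register list image is,
	 * allocate a buffer of that size in the PMU's VM if one is not
	 * already mapped, then hand FECS the PMU instance block and the
	 * buffer's GPU VA so GR state can be saved/restored across ELPG.
	 */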
4962 size = 0;
4963
4964 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4965 if (err != 0) {
4966 nvgpu_err(g,
4967 "fail to query fecs pg buffer size");
4968 return err;
4969 }
4970
4971 if (pmu->pg_buf.cpu_va == NULL) {
4972 err = nvgpu_dma_alloc_map_sys(vm, size, &pmu->pg_buf);
4973 if (err != 0) {
4974 nvgpu_err(g, "failed to allocate memory");
4975 return -ENOMEM;
4976 }
4977 }
4978
4979
4980 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &mm->pmu.inst_block);
4981 if (err != 0) {
4982 nvgpu_err(g,
4983 "fail to bind pmu inst to gr");
4984 return err;
4985 }
4986
4987 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.gpu_va);
4988 if (err != 0) {
4989 nvgpu_err(g,
4990 "fail to set pg buffer pmu va");
4991 return err;
4992 }
4993
4994 return err;
4995}
4996
4997int gk20a_init_gr_support(struct gk20a *g)
4998{
4999 int err = 0;
5000
5001 nvgpu_log_fn(g, " ");
5002
5003 g->gr.initialized = false;
5004
5005 /* this is required before gr_gk20a_init_ctx_state */
5006 err = nvgpu_mutex_init(&g->gr.fecs_mutex);
5007 if (err != 0) {
5008 nvgpu_err(g, "Error in gr.fecs_mutex initialization");
5009 return err;
5010 }
5011
5012 err = gr_gk20a_init_ctxsw(g);
5013 if (err != 0) {
5014 return err;
5015 }
5016
5017	/* This appears to query SW state, but FECS actually initializes
5018	   the ramchain etc., so this is really HW init. */
5019 err = g->ops.gr.init_ctx_state(g);
5020 if (err != 0) {
5021 return err;
5022 }
5023
5024 err = gk20a_init_gr_setup_sw(g);
5025 if (err != 0) {
5026 return err;
5027 }
5028
5029 err = gk20a_init_gr_setup_hw(g);
5030 if (err != 0) {
5031 return err;
5032 }
5033
5034 if (g->can_elpg) {
5035 err = gk20a_init_gr_bind_fecs_elpg(g);
5036 if (err != 0) {
5037 return err;
5038 }
5039 }
5040
5041	/* GR is initialized, signal possible waiters */
5042 g->gr.initialized = true;
5043 nvgpu_cond_signal(&g->gr.init_wq);
5044
5045 return 0;
5046}
5047
5048/* Wait until GR is initialized */
5049void gk20a_gr_wait_initialized(struct gk20a *g)
5050{
5051 NVGPU_COND_WAIT(&g->gr.init_wq, g->gr.initialized, 0);
5052}
5053
5054#define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
5055#define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
5056#define NVA297_SET_SHADER_EXCEPTIONS 0x1528
5057#define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
5058
5059#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
5060
5061void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
5062{
5063 nvgpu_log_fn(g, " ");
5064
5065 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
5066 gk20a_writel(g,
5067 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
5068 gk20a_writel(g,
5069 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
5070 } else {
5071 /* setup sm warp esr report masks */
5072 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
5073 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
5074 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
5075 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
5076 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
5077 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
5078 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
5079 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
5080 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
5081 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
5082 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
5083 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
5084 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
5085 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
5086 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
5087 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
5088 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
5089 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
5090 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
5091 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
5092 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
5093
5094 /* setup sm global esr report mask */
5095 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
5096 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
5097 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
5098 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
5099 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
5100 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
5101 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
5102 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
5103 }
5104}
5105
5106int gk20a_enable_gr_hw(struct gk20a *g)
5107{
5108 int err;
5109
5110 nvgpu_log_fn(g, " ");
5111
5112 err = gk20a_init_gr_prepare(g);
5113 if (err != 0) {
5114 return err;
5115 }
5116
5117 err = gk20a_init_gr_reset_enable_hw(g);
5118 if (err != 0) {
5119 return err;
5120 }
5121
5122 nvgpu_log_fn(g, "done");
5123
5124 return 0;
5125}
5126
5127int gk20a_gr_reset(struct gk20a *g)
5128{
5129 int err;
5130 u32 size;
5131
5132 g->gr.initialized = false;
5133
5134 nvgpu_mutex_acquire(&g->gr.fecs_mutex);
5135
5136 err = gk20a_enable_gr_hw(g);
5137 if (err != 0) {
5138 nvgpu_mutex_release(&g->gr.fecs_mutex);
5139 return err;
5140 }
5141
5142 err = gk20a_init_gr_setup_hw(g);
5143 if (err != 0) {
5144 nvgpu_mutex_release(&g->gr.fecs_mutex);
5145 return err;
5146 }
5147
5148 err = gr_gk20a_init_ctxsw(g);
5149 if (err != 0) {
5150 nvgpu_mutex_release(&g->gr.fecs_mutex);
5151 return err;
5152 }
5153
5154 nvgpu_mutex_release(&g->gr.fecs_mutex);
5155
5156	/* This appears to query SW state, but FECS actually initializes
5157	   the ramchain etc., so this is really HW init. */
5158 err = g->ops.gr.init_ctx_state(g);
5159 if (err != 0) {
5160 return err;
5161 }
5162
5163 size = 0;
5164 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
5165 if (err != 0) {
5166 nvgpu_err(g,
5167 "fail to query fecs pg buffer size");
5168 return err;
5169 }
5170
5171 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &g->mm.pmu.inst_block);
5172 if (err != 0) {
5173 nvgpu_err(g,
5174 "fail to bind pmu inst to gr");
5175 return err;
5176 }
5177
5178 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.gpu_va);
5179 if (err != 0) {
5180 nvgpu_err(g,
5181 "fail to set pg buffer pmu va");
5182 return err;
5183 }
5184
5185 nvgpu_cg_init_gr_load_gating_prod(g);
5186 nvgpu_cg_elcg_enable_no_wait(g);
5187
5188	/* GR is initialized, signal possible waiters */
5189 g->gr.initialized = true;
5190 nvgpu_cond_signal(&g->gr.init_wq);
5191
5192 return err;
5193}
5194
5195static void gk20a_gr_set_error_notifier(struct gk20a *g,
5196 struct gr_gk20a_isr_data *isr_data, u32 error_notifier)
5197{
5198 struct channel_gk20a *ch;
5199 struct tsg_gk20a *tsg;
5200 struct channel_gk20a *ch_tsg;
5201
5202 ch = isr_data->ch;
5203
5204 if (ch == NULL) {
5205 return;
5206 }
5207
5208 tsg = tsg_gk20a_from_ch(ch);
5209 if (tsg != NULL) {
5210 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
5211 nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
5212 channel_gk20a, ch_entry) {
5213 if (gk20a_channel_get(ch_tsg)) {
5214 g->ops.fifo.set_error_notifier(ch_tsg,
5215 error_notifier);
5216 gk20a_channel_put(ch_tsg);
5217 }
5218
5219 }
5220 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
5221 } else {
5222 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
5223 }
5224}
5225
5226static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
5227 struct gr_gk20a_isr_data *isr_data)
5228{
5229 nvgpu_log_fn(g, " ");
5230 gk20a_gr_set_error_notifier(g, isr_data,
5231 NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT);
5232 nvgpu_err(g,
5233 "gr semaphore timeout");
5234 return -EINVAL;
5235}
5236
5237static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
5238 struct gr_gk20a_isr_data *isr_data)
5239{
5240 nvgpu_log_fn(g, " ");
5241 gk20a_gr_set_error_notifier(g, isr_data,
5242 NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
5243 /* This is an unrecoverable error, reset is needed */
5244 nvgpu_err(g,
5245		"gr illegal notify pending");
5246 return -EINVAL;
5247}
5248
5249static int gk20a_gr_handle_illegal_method(struct gk20a *g,
5250 struct gr_gk20a_isr_data *isr_data)
5251{
5252 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
5253 isr_data->class_num, isr_data->offset,
5254 isr_data->data_lo);
5255 if (ret) {
5256 gk20a_gr_set_error_notifier(g, isr_data,
5257 NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
5258 nvgpu_err(g, "invalid method class 0x%08x"
5259 ", offset 0x%08x address 0x%08x",
5260 isr_data->class_num, isr_data->offset, isr_data->addr);
5261 }
5262 return ret;
5263}
5264
5265static int gk20a_gr_handle_illegal_class(struct gk20a *g,
5266 struct gr_gk20a_isr_data *isr_data)
5267{
5268 nvgpu_log_fn(g, " ");
5269 gk20a_gr_set_error_notifier(g, isr_data,
5270 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5271 nvgpu_err(g,
5272 "invalid class 0x%08x, offset 0x%08x",
5273 isr_data->class_num, isr_data->offset);
5274 return -EINVAL;
5275}
5276
5277int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
5278 struct gr_gk20a_isr_data *isr_data)
5279{
5280 u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_host_int_status_r());
5281 int ret = 0;
5282 u32 chid = isr_data->ch != NULL ?
5283 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5284
5285 if (gr_fecs_intr == 0U) {
5286 return 0;
5287 }
5288
5289 if (gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) {
5290 gk20a_gr_set_error_notifier(g, isr_data,
5291 NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD);
5292 nvgpu_err(g,
5293 "firmware method error 0x%08x for offset 0x%04x",
5294 gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)),
5295 isr_data->data_lo);
5296 ret = -1;
5297 } else if ((gr_fecs_intr &
5298 gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
5299 /* currently, recovery is not initiated */
5300 nvgpu_err(g, "fecs watchdog triggered for channel %u, "
5301 "cannot ctxsw anymore !!", chid);
5302 gk20a_fecs_dump_falcon_stats(g);
5303 } else if ((gr_fecs_intr &
5304 gr_fecs_host_int_status_ctxsw_intr_f(CTXSW_INTR0)) != 0U) {
5305 u32 mailbox_value = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6));
5306
5307 if (mailbox_value == MAILBOX_VALUE_TIMESTAMP_BUFFER_FULL) {
5308 nvgpu_info(g, "ctxsw intr0 set by ucode, "
5309 "timestamp buffer full");
5310#ifdef CONFIG_GK20A_CTXSW_TRACE
5311 gk20a_fecs_trace_reset_buffer(g);
5312#else
5313 ret = -1;
5314#endif
5315 } else {
5316 nvgpu_err(g,
5317 "ctxsw intr0 set by ucode, error_code: 0x%08x",
5318 mailbox_value);
5319 ret = -1;
5320 }
5321 } else {
5322 nvgpu_err(g,
5323 "unhandled fecs error interrupt 0x%08x for channel %u",
5324			gr_fecs_intr, chid);
5325 gk20a_fecs_dump_falcon_stats(g);
5326 }
5327
5328 gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr);
5329 return ret;
5330}
5331
5332static int gk20a_gr_handle_class_error(struct gk20a *g,
5333 struct gr_gk20a_isr_data *isr_data)
5334{
5335 u32 gr_class_error;
5336 u32 chid = isr_data->ch != NULL ?
5337 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5338
5339 nvgpu_log_fn(g, " ");
5340
5341 gr_class_error =
5342 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
5343 gk20a_gr_set_error_notifier(g, isr_data,
5344 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5345 nvgpu_err(g, "class error 0x%08x, offset 0x%08x,"
5346		" sub channel 0x%08x mme generated %d,"
5347		" mme pc 0x%08x data high %d priv status %d"
5348 " unhandled intr 0x%08x for channel %u",
5349 isr_data->class_num, (isr_data->offset << 2),
5350 gr_trapped_addr_subch_v(isr_data->addr),
5351 gr_trapped_addr_mme_generated_v(isr_data->addr),
5352 gr_trapped_data_mme_pc_v(
5353 gk20a_readl(g, gr_trapped_data_mme_r())),
5354 gr_trapped_addr_datahigh_v(isr_data->addr),
5355 gr_trapped_addr_priv_v(isr_data->addr),
5356 gr_class_error, chid);
5357
5358 nvgpu_err(g, "trapped data low 0x%08x",
5359 gk20a_readl(g, gr_trapped_data_lo_r()));
5360 if (gr_trapped_addr_datahigh_v(isr_data->addr)) {
5361 nvgpu_err(g, "trapped data high 0x%08x",
5362 gk20a_readl(g, gr_trapped_data_hi_r()));
5363 }
5364
5365 return -EINVAL;
5366}
5367
5368static int gk20a_gr_handle_firmware_method(struct gk20a *g,
5369 struct gr_gk20a_isr_data *isr_data)
5370{
5371 u32 chid = isr_data->ch != NULL ?
5372 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5373
5374 nvgpu_log_fn(g, " ");
5375
5376 gk20a_gr_set_error_notifier(g, isr_data,
5377 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5378 nvgpu_err(g,
5379 "firmware method 0x%08x, offset 0x%08x for channel %u",
5380 isr_data->class_num, isr_data->offset,
5381 chid);
5382 return -EINVAL;
5383}
5384
5385int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
5386 struct gr_gk20a_isr_data *isr_data)
5387{
5388 struct channel_gk20a *ch = isr_data->ch;
5389 struct tsg_gk20a *tsg;
5390
5391 if (ch == NULL) {
5392 return 0;
5393 }
5394
5395 tsg = tsg_gk20a_from_ch(ch);
5396 if (tsg != NULL) {
5397 g->ops.fifo.post_event_id(tsg,
5398 NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN);
5399
5400 nvgpu_cond_broadcast(&ch->semaphore_wq);
5401 } else {
5402 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
5403 }
5404
5405 return 0;
5406}
5407
5408#if defined(CONFIG_GK20A_CYCLE_STATS)
5409static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
5410 u32 offset)
5411{
5412 /* support only 24-bit 4-byte aligned offsets */
5413 bool valid = !(offset & 0xFF000003);
5414
5415 if (g->allow_all)
5416 return true;
5417
5418 /* whitelist check */
5419 valid = valid &&
5420 is_bar0_global_offset_whitelisted_gk20a(g, offset);
5421 /* resource size check in case there was a problem
5422 * with allocating the assumed size of bar0 */
5423 valid = valid && gk20a_io_valid_reg(g, offset);
5424 return valid;
5425}
5426#endif
5427
5428int gk20a_gr_handle_notify_pending(struct gk20a *g,
5429 struct gr_gk20a_isr_data *isr_data)
5430{
5431 struct channel_gk20a *ch = isr_data->ch;
5432
5433#if defined(CONFIG_GK20A_CYCLE_STATS)
5434 void *virtual_address;
5435 u32 buffer_size;
5436 u32 offset;
5437 bool exit;
5438#endif
5439 if (ch == NULL || tsg_gk20a_from_ch(ch) == NULL) {
5440 return 0;
5441 }
5442
5443#if defined(CONFIG_GK20A_CYCLE_STATS)
5444 /* GL will never use payload 0 for cycle state */
5445 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
5446 return 0;
5447
5448 nvgpu_mutex_acquire(&ch->cyclestate.cyclestate_buffer_mutex);
5449
5450 virtual_address = ch->cyclestate.cyclestate_buffer;
5451 buffer_size = ch->cyclestate.cyclestate_buffer_size;
5452 offset = isr_data->data_lo;
5453 exit = false;
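	/*
	 * The cyclestats buffer is a sequence of variable-size records, each
	 * starting with a share_buffer_head. Walk them until an OP_END record
	 * (or a malformed header) is found, bounds-checking the header and the
	 * advertised record size before servicing BAR0_READ32/BAR0_WRITE32
	 * requests against the whitelisted register range.
	 */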
5454 while (!exit) {
5455 struct share_buffer_head *sh_hdr;
5456 u32 min_element_size;
5457
5458 /* validate offset */
5459 if (offset + sizeof(struct share_buffer_head) > buffer_size ||
5460 offset + sizeof(struct share_buffer_head) < offset) {
5461 nvgpu_err(g,
5462 "cyclestats buffer overrun at offset 0x%x",
5463 offset);
5464 break;
5465 }
5466
5467 sh_hdr = (struct share_buffer_head *)
5468 ((char *)virtual_address + offset);
5469
5470 min_element_size =
5471 (sh_hdr->operation == OP_END ?
5472 sizeof(struct share_buffer_head) :
5473 sizeof(struct gk20a_cyclestate_buffer_elem));
5474
5475 /* validate sh_hdr->size */
5476 if (sh_hdr->size < min_element_size ||
5477 offset + sh_hdr->size > buffer_size ||
5478 offset + sh_hdr->size < offset) {
5479 nvgpu_err(g,
5480 "bad cyclestate buffer header size at offset 0x%x",
5481 offset);
5482 sh_hdr->failed = true;
5483 break;
5484 }
5485
5486 switch (sh_hdr->operation) {
5487 case OP_END:
5488 exit = true;
5489 break;
5490
5491 case BAR0_READ32:
5492 case BAR0_WRITE32:
5493 {
5494 struct gk20a_cyclestate_buffer_elem *op_elem =
5495 (struct gk20a_cyclestate_buffer_elem *)sh_hdr;
5496 bool valid = is_valid_cyclestats_bar0_offset_gk20a(
5497 g, op_elem->offset_bar0);
5498 u32 raw_reg;
5499 u64 mask_orig;
5500 u64 v;
5501
5502 if (!valid) {
5503 nvgpu_err(g,
5504						"invalid cyclestats op offset: 0x%x",
5505 op_elem->offset_bar0);
5506
5507 sh_hdr->failed = exit = true;
5508 break;
5509 }
5510
5511
5512 mask_orig =
5513 ((1ULL <<
5514 (op_elem->last_bit + 1))
5515 -1)&~((1ULL <<
5516 op_elem->first_bit)-1);
5517
5518 raw_reg =
5519 gk20a_readl(g,
5520 op_elem->offset_bar0);
5521
5522 switch (sh_hdr->operation) {
5523 case BAR0_READ32:
5524 op_elem->data =
5525 (raw_reg & mask_orig)
5526 >> op_elem->first_bit;
5527 break;
5528
5529 case BAR0_WRITE32:
5530 v = 0;
5531 if ((unsigned int)mask_orig !=
5532 (unsigned int)~0) {
5533 v = (unsigned int)
5534 (raw_reg & ~mask_orig);
5535 }
5536
5537 v |= ((op_elem->data
5538 << op_elem->first_bit)
5539 & mask_orig);
5540
5541 gk20a_writel(g,
5542 op_elem->offset_bar0,
5543 (unsigned int)v);
5544 break;
5545 default:
5546 /* nop ok?*/
5547 break;
5548 }
5549 }
5550 break;
5551
5552 default:
5553 /* no operation content case */
5554 exit = true;
5555 break;
5556 }
5557 sh_hdr->completed = true;
5558 offset += sh_hdr->size;
5559 }
5560 nvgpu_mutex_release(&ch->cyclestate.cyclestate_buffer_mutex);
5561#endif
5562 nvgpu_log_fn(g, " ");
5563 nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
5564 return 0;
5565}
5566
5567/* Used by sw interrupt thread to translate current ctx to chid.
5568 * Also used by regops to translate current ctx to chid and tsgid.
5569 * For performance, we don't want to go through 128 channels every time.
5570 * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5571 * A small tlb is used here to cache translation.
5572 *
5573 * Returned channel must be freed with gk20a_channel_put() */
5574static struct channel_gk20a *gk20a_gr_get_channel_from_ctx(
5575 struct gk20a *g, u32 curr_ctx, u32 *curr_tsgid)
5576{
5577 struct fifo_gk20a *f = &g->fifo;
5578 struct gr_gk20a *gr = &g->gr;
5579 u32 chid = -1;
5580 u32 tsgid = NVGPU_INVALID_TSG_ID;
5581 u32 i;
5582 struct channel_gk20a *ret = NULL;
5583
5584 /* when contexts are unloaded from GR, the valid bit is reset
5585 * but the instance pointer information remains intact.
5586 * This might be called from gr_isr where contexts might be
5587 * unloaded. No need to check ctx_valid bit
5588 */
5589
5590 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
5591
5592 /* check cache first */
5593 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5594 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5595 chid = gr->chid_tlb[i].chid;
5596 tsgid = gr->chid_tlb[i].tsgid;
5597 ret = gk20a_channel_from_id(g, chid);
5598 goto unlock;
5599 }
5600 }
5601
5602 /* slow path */
5603 for (chid = 0; chid < f->num_channels; chid++) {
5604 struct channel_gk20a *ch = gk20a_channel_from_id(g, chid);
5605
5606 if (ch == NULL) {
5607 continue;
5608 }
5609
5610 if ((u32)(nvgpu_inst_block_addr(g, &ch->inst_block) >>
5611 ram_in_base_shift_v()) ==
5612 gr_fecs_current_ctx_ptr_v(curr_ctx)) {
5613 tsgid = ch->tsgid;
5614 /* found it */
5615 ret = ch;
5616 break;
5617 }
5618 gk20a_channel_put(ch);
5619 }
5620
5621 if (ret == NULL) {
5622 goto unlock;
5623 }
5624
5625 /* add to free tlb entry */
5626 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5627 if (gr->chid_tlb[i].curr_ctx == 0) {
5628 gr->chid_tlb[i].curr_ctx = curr_ctx;
5629 gr->chid_tlb[i].chid = chid;
5630 gr->chid_tlb[i].tsgid = tsgid;
5631 goto unlock;
5632 }
5633 }
5634
5635 /* no free entry, flush one */
5636 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5637 gr->chid_tlb[gr->channel_tlb_flush_index].chid = chid;
5638 gr->chid_tlb[gr->channel_tlb_flush_index].tsgid = tsgid;
5639
5640 gr->channel_tlb_flush_index =
5641 (gr->channel_tlb_flush_index + 1) &
5642 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5643
5644unlock:
5645 nvgpu_spinlock_release(&gr->ch_tlb_lock);
5646 if (curr_tsgid) {
5647 *curr_tsgid = tsgid;
5648 }
5649 return ret;
5650}
5651
5652int gk20a_gr_lock_down_sm(struct gk20a *g,
5653 u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask,
5654 bool check_errors)
5655{
5656 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5657 u32 dbgr_control0;
5658
5659 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5660 "GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm);
5661
5662 /* assert stop trigger */
5663 dbgr_control0 =
5664 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
5665 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5666 gk20a_writel(g,
5667 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
5668
5669 return g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask,
5670 check_errors);
5671}
5672
5673bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5674{
5675 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5676
5677 /* check if an sm debugger is attached.
5678 * assumption: all SMs will have debug mode enabled/disabled
5679 * uniformly. */
5680 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5681 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v()) {
5682 return true;
5683 }
5684
5685 return false;
5686}
5687
5688int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
5689 bool *post_event, struct channel_gk20a *fault_ch,
5690 u32 *hww_global_esr)
5691{
5692 int ret = 0;
5693 bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
5694 bool disable_sm_exceptions = true;
5695 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5696 bool sm_debugger_attached;
5697 u32 global_esr, warp_esr, global_mask;
5698
5699 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
5700
5701 sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
5702
5703 global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
5704 *hww_global_esr = global_esr;
5705 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
5706 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
5707
5708 if (!sm_debugger_attached) {
5709 nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
5710 global_esr, warp_esr);
5711 return -EFAULT;
5712 }
5713
5714 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5715 "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
5716
5717 gr_gk20a_elpg_protected_call(g,
5718 g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
5719
5720 if (g->ops.gr.pre_process_sm_exception) {
5721 ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
5722 global_esr, warp_esr,
5723 sm_debugger_attached,
5724 fault_ch,
5725 &early_exit,
5726 &ignore_debugger);
5727 if (ret) {
5728 nvgpu_err(g, "could not pre-process sm error!");
5729 return ret;
5730 }
5731 }
5732
5733 if (early_exit) {
5734 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5735 "returning early");
5736 return ret;
5737 }
5738
5739 /*
5740	 * Disable forwarding of TPC exceptions;
5741	 * the debugger will re-enable exceptions after servicing them.
5742 *
5743 * Do not disable exceptions if the only SM exception is BPT_INT
5744 */
5745 if ((global_esr == gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f())
5746 && (warp_esr == 0)) {
5747 disable_sm_exceptions = false;
5748 }
5749
5750 if (!ignore_debugger && disable_sm_exceptions) {
5751 u32 tpc_exception_en = gk20a_readl(g,
5752 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
5753 offset);
5754 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5755 gk20a_writel(g,
5756 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
5757 tpc_exception_en);
5758 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM Exceptions disabled");
5759 }
5760
5761 /* if a debugger is present and an error has occurred, do a warp sync */
5762 if (!ignore_debugger &&
5763 ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5764 nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
5765 do_warp_sync = true;
5766 }
5767
5768 if (do_warp_sync) {
5769 ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
5770 global_mask, true);
5771 if (ret) {
5772 nvgpu_err(g, "sm did not lock down!");
5773 return ret;
5774 }
5775 }
5776
5777 if (ignore_debugger) {
5778 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5779 "ignore_debugger set, skipping event posting");
5780 } else {
5781 *post_event = true;
5782 }
5783
5784 return ret;
5785}
5786
5787int gr_gk20a_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
5788 bool *post_event)
5789{
5790 int ret = 0;
5791 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
5792 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
5793 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
5794 u32 esr;
5795
5796 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
5797
5798 esr = gk20a_readl(g,
5799 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset);
5800 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr);
5801
5802 gk20a_writel(g,
5803 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
5804 esr);
5805
5806 return ret;
5807}
5808
5809void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
5810 u32 *esr_sm_sel)
5811{
5812 *esr_sm_sel = 1;
5813}
5814
5815static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5816 bool *post_event, struct channel_gk20a *fault_ch,
5817 u32 *hww_global_esr)
5818{
5819 int ret = 0;
5820 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5821 u32 tpc_exception = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_r()
5822 + offset);
5823 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
5824
5825 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5826 "GPC%d TPC%d: pending exception 0x%x",
5827 gpc, tpc, tpc_exception);
5828
5829	/* check if an SM exception is pending */
5830 if (gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(tpc_exception) ==
5831 gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v()) {
5832 u32 esr_sm_sel, sm;
5833
5834 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5835 "GPC%d TPC%d: SM exception pending", gpc, tpc);
5836
5837 if (g->ops.gr.handle_tpc_sm_ecc_exception) {
5838 g->ops.gr.handle_tpc_sm_ecc_exception(g, gpc, tpc,
5839 post_event, fault_ch, hww_global_esr);
5840 }
5841
5842 g->ops.gr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel);
5843
5844 for (sm = 0; sm < sm_per_tpc; sm++) {
5845
5846 if ((esr_sm_sel & BIT32(sm)) == 0U) {
5847 continue;
5848 }
5849
5850 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5851 "GPC%d TPC%d: SM%d exception pending",
5852 gpc, tpc, sm);
5853
5854 ret |= g->ops.gr.handle_sm_exception(g,
5855 gpc, tpc, sm, post_event, fault_ch,
5856 hww_global_esr);
5857			/* clear the HWWs; this also causes the TPC and GPC
5858			 * exceptions to be cleared. They should be cleared
5859			 * only if the SM is locked down or empty.
5860 */
5861 g->ops.gr.clear_sm_hww(g,
5862 gpc, tpc, sm, *hww_global_esr);
5863
5864 }
5865
5866 }
5867
5868	/* check if a TEX exception is pending */
5869 if (gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(tpc_exception) ==
5870 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) {
5871 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5872 "GPC%d TPC%d: TEX exception pending", gpc, tpc);
5873 ret |= g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
5874 }
5875
5876 if (g->ops.gr.handle_tpc_mpc_exception) {
5877 ret |= g->ops.gr.handle_tpc_mpc_exception(g,
5878 gpc, tpc, post_event);
5879 }
5880
5881 return ret;
5882}
5883
5884static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5885 struct channel_gk20a *fault_ch, u32 *hww_global_esr)
5886{
5887 int ret = 0;
5888 u32 gpc_offset, gpc, tpc;
5889 struct gr_gk20a *gr = &g->gr;
5890 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5891 u32 gpc_exception;
5892
5893 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, " ");
5894
5895 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
5896 if ((exception1 & (1 << gpc)) == 0) {
5897 continue;
5898 }
5899
5900 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5901 "GPC%d exception pending", gpc);
5902
5903 gpc_offset = gk20a_gr_gpc_offset(g, gpc);
5904
5905 gpc_exception = gk20a_readl(g, gr_gpc0_gpccs_gpc_exception_r()
5906 + gpc_offset);
5907
5908 /* check if any tpc has an exception */
5909 for (tpc = 0; tpc < gr->gpc_tpc_count[gpc]; tpc++) {
5910 if ((gr_gpc0_gpccs_gpc_exception_tpc_v(gpc_exception) &
5911 (1 << tpc)) == 0) {
5912 continue;
5913 }
5914
5915 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5916 "GPC%d: TPC%d exception pending", gpc, tpc);
5917
5918 ret |= gk20a_gr_handle_tpc_exception(g, gpc, tpc,
5919 post_event, fault_ch, hww_global_esr);
5920
5921 }
5922
5923 /* Handle GCC exception */
5924 if ((gr_gpc0_gpccs_gpc_exception_gcc_v(gpc_exception) != 0U) &&
5925 (g->ops.gr.handle_gcc_exception != NULL)) {
5926 int gcc_ret = 0;
5927 gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc,
5928 post_event, fault_ch, hww_global_esr);
5929 ret |= (ret != 0) ? ret : gcc_ret;
5930 }
5931
5932 /* Handle GPCCS exceptions */
5933 if (g->ops.gr.handle_gpc_gpccs_exception) {
5934 int ret_ecc = 0;
5935 ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
5936 gpc_exception);
5937 ret |= (ret != 0) ? ret : ret_ecc;
5938 }
5939
5940 /* Handle GPCMMU exceptions */
5941 if (g->ops.gr.handle_gpc_gpcmmu_exception) {
5942 int ret_mmu = 0;
5943
5944 ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc,
5945 gpc_exception);
5946 ret |= (ret != 0) ? ret : ret_mmu;
5947 }
5948
5949 }
5950
5951 return ret;
5952}
5953
5954static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg,
5955 u32 global_esr)
5956{
5957 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) {
5958 g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT);
5959 }
5960
5961 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) {
5962 g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE);
5963 }
5964
5965 return 0;
5966}
5967
5968int gk20a_gr_isr(struct gk20a *g)
5969{
5970 struct gr_gk20a_isr_data isr_data;
5971 u32 grfifo_ctl;
5972 u32 obj_table;
5973 bool need_reset = false;
5974 u32 gr_intr = gk20a_readl(g, gr_intr_r());
5975 struct channel_gk20a *ch = NULL;
5976 struct channel_gk20a *fault_ch = NULL;
5977 u32 tsgid = NVGPU_INVALID_TSG_ID;
5978 struct tsg_gk20a *tsg = NULL;
5979 u32 gr_engine_id;
5980 u32 global_esr = 0;
5981 u32 chid;
5982
5983 nvgpu_log_fn(g, " ");
5984 nvgpu_log(g, gpu_dbg_intr, "pgraph intr 0x%08x", gr_intr);
5985
5986 if (gr_intr == 0U) {
5987 return 0;
5988 }
5989
5990 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
5991 if (gr_engine_id != FIFO_INVAL_ENGINE_ID) {
5992 gr_engine_id = BIT(gr_engine_id);
5993 }
5994
5995 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5996 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5997 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5998
5999 gk20a_writel(g, gr_gpfifo_ctl_r(),
6000 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
6001 gr_gpfifo_ctl_semaphore_access_f(0));
6002
6003 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
6004 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
6005 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
6006 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
6007 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
6008 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
6009 obj_table = (isr_data.sub_chan < 4) ? gk20a_readl(g,
6010 gr_fe_object_table_r(isr_data.sub_chan)) : 0;
6011 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
6012
6013 ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, &tsgid);
6014 isr_data.ch = ch;
6015 chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
6016
6017 if (ch == NULL) {
6018 nvgpu_err(g, "pgraph intr: 0x%08x, chid: INVALID", gr_intr);
6019 } else {
6020 tsg = tsg_gk20a_from_ch(ch);
6021 if (tsg == NULL) {
6022 nvgpu_err(g, "pgraph intr: 0x%08x, chid: %d "
6023 "not bound to tsg", gr_intr, chid);
6024 }
6025 }
6026
6027 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6028 "channel %d: addr 0x%08x, "
6029 "data 0x%08x 0x%08x,"
6030 "ctx 0x%08x, offset 0x%08x, "
6031 "subchannel 0x%08x, class 0x%08x",
6032 chid, isr_data.addr,
6033 isr_data.data_hi, isr_data.data_lo,
6034 isr_data.curr_ctx, isr_data.offset,
6035 isr_data.sub_chan, isr_data.class_num);
6036
6037 if (gr_intr & gr_intr_notify_pending_f()) {
6038 g->ops.gr.handle_notify_pending(g, &isr_data);
6039 gk20a_writel(g, gr_intr_r(),
6040 gr_intr_notify_reset_f());
6041 gr_intr &= ~gr_intr_notify_pending_f();
6042 }
6043
6044 if (gr_intr & gr_intr_semaphore_pending_f()) {
6045 g->ops.gr.handle_semaphore_pending(g, &isr_data);
6046 gk20a_writel(g, gr_intr_r(),
6047 gr_intr_semaphore_reset_f());
6048 gr_intr &= ~gr_intr_semaphore_pending_f();
6049 }
6050
6051 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
6052 if (gk20a_gr_handle_semaphore_timeout_pending(g,
6053 &isr_data) != 0) {
6054 need_reset = true;
6055 }
6056 gk20a_writel(g, gr_intr_r(),
6057 gr_intr_semaphore_reset_f());
6058 gr_intr &= ~gr_intr_semaphore_pending_f();
6059 }
6060
6061 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
6062 if (gk20a_gr_intr_illegal_notify_pending(g,
6063 &isr_data) != 0) {
6064 need_reset = true;
6065 }
6066 gk20a_writel(g, gr_intr_r(),
6067 gr_intr_illegal_notify_reset_f());
6068 gr_intr &= ~gr_intr_illegal_notify_pending_f();
6069 }
6070
6071 if (gr_intr & gr_intr_illegal_method_pending_f()) {
6072 if (gk20a_gr_handle_illegal_method(g, &isr_data) != 0) {
6073 need_reset = true;
6074 }
6075 gk20a_writel(g, gr_intr_r(),
6076 gr_intr_illegal_method_reset_f());
6077 gr_intr &= ~gr_intr_illegal_method_pending_f();
6078 }
6079
6080 if (gr_intr & gr_intr_illegal_class_pending_f()) {
6081 if (gk20a_gr_handle_illegal_class(g, &isr_data) != 0) {
6082 need_reset = true;
6083 }
6084 gk20a_writel(g, gr_intr_r(),
6085 gr_intr_illegal_class_reset_f());
6086 gr_intr &= ~gr_intr_illegal_class_pending_f();
6087 }
6088
6089 if (gr_intr & gr_intr_fecs_error_pending_f()) {
6090 if (g->ops.gr.handle_fecs_error(g, ch, &isr_data) != 0) {
6091 need_reset = true;
6092 }
6093 gk20a_writel(g, gr_intr_r(),
6094 gr_intr_fecs_error_reset_f());
6095 gr_intr &= ~gr_intr_fecs_error_pending_f();
6096 }
6097
6098 if (gr_intr & gr_intr_class_error_pending_f()) {
6099 if (gk20a_gr_handle_class_error(g, &isr_data) != 0) {
6100 need_reset = true;
6101 }
6102 gk20a_writel(g, gr_intr_r(),
6103 gr_intr_class_error_reset_f());
6104 gr_intr &= ~gr_intr_class_error_pending_f();
6105 }
6106
6107 /* this one happens if someone tries to hit a non-whitelisted
6108 * register using set_falcon[4] */
6109 if (gr_intr & gr_intr_firmware_method_pending_f()) {
6110 if (gk20a_gr_handle_firmware_method(g, &isr_data) != 0) {
6111 need_reset = true;
6112 }
6113 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
6114 gk20a_writel(g, gr_intr_r(),
6115 gr_intr_firmware_method_reset_f());
6116 gr_intr &= ~gr_intr_firmware_method_pending_f();
6117 }
6118
6119 if (gr_intr & gr_intr_exception_pending_f()) {
6120 u32 exception = gk20a_readl(g, gr_exception_r());
6121
6122 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
6123
6124 if (exception & gr_exception_fe_m()) {
6125 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
6126 u32 info = gk20a_readl(g, gr_fe_hww_esr_info_r());
6127
6128 nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
6129 fe, info);
6130 gk20a_writel(g, gr_fe_hww_esr_r(),
6131 gr_fe_hww_esr_reset_active_f());
6132 need_reset = true;
6133 }
6134
6135 if (exception & gr_exception_memfmt_m()) {
6136 u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r());
6137
6138 nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
6139 gk20a_writel(g, gr_memfmt_hww_esr_r(),
6140 gr_memfmt_hww_esr_reset_active_f());
6141 need_reset = true;
6142 }
6143
6144 if (exception & gr_exception_pd_m()) {
6145 u32 pd = gk20a_readl(g, gr_pd_hww_esr_r());
6146
6147 nvgpu_err(g, "pd exception: esr 0x%08x", pd);
6148 gk20a_writel(g, gr_pd_hww_esr_r(),
6149 gr_pd_hww_esr_reset_active_f());
6150 need_reset = true;
6151 }
6152
6153 if (exception & gr_exception_scc_m()) {
6154 u32 scc = gk20a_readl(g, gr_scc_hww_esr_r());
6155
6156 nvgpu_err(g, "scc exception: esr 0x%08x", scc);
6157 gk20a_writel(g, gr_scc_hww_esr_r(),
6158 gr_scc_hww_esr_reset_active_f());
6159 need_reset = true;
6160 }
6161
6162 if (exception & gr_exception_ds_m()) {
6163 u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
6164
6165 nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
6166 gk20a_writel(g, gr_ds_hww_esr_r(),
6167 gr_ds_hww_esr_reset_task_f());
6168 need_reset = true;
6169 }
6170
6171 if (exception & gr_exception_ssync_m()) {
6172 if (g->ops.gr.handle_ssync_hww) {
6173 if (g->ops.gr.handle_ssync_hww(g) != 0) {
6174 need_reset = true;
6175 }
6176 } else {
6177 nvgpu_err(g, "unhandled ssync exception");
6178 }
6179 }
6180
6181 if (exception & gr_exception_mme_m()) {
6182 u32 mme = gk20a_readl(g, gr_mme_hww_esr_r());
6183 u32 info = gk20a_readl(g, gr_mme_hww_esr_info_r());
6184
6185 nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
6186 mme, info);
6187 gk20a_writel(g, gr_mme_hww_esr_r(),
6188 gr_mme_hww_esr_reset_active_f());
6189 need_reset = true;
6190 }
6191
6192 if (exception & gr_exception_sked_m()) {
6193 u32 sked = gk20a_readl(g, gr_sked_hww_esr_r());
6194
6195 nvgpu_err(g, "sked exception: esr 0x%08x", sked);
6196 gk20a_writel(g, gr_sked_hww_esr_r(),
6197 gr_sked_hww_esr_reset_active_f());
6198 need_reset = true;
6199 }
6200
6201 /* check if a gpc exception has occurred */
6202 if (((exception & gr_exception_gpc_m()) != 0U) &&
6203 !need_reset) {
6204 bool post_event = false;
6205
6206 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6207 "GPC exception pending");
6208
6209 if (tsg != NULL) {
6210 fault_ch = isr_data.ch;
6211 }
6212
6213 /* fault_ch can be NULL */
6214 /* check if any gpc has an exception */
6215 if (gk20a_gr_handle_gpc_exception(g, &post_event,
6216 fault_ch, &global_esr) != 0) {
6217 need_reset = true;
6218 }
6219
6220 /* signal clients waiting on an event */
6221 if (g->ops.gr.sm_debugger_attached(g) &&
6222 post_event && (fault_ch != NULL)) {
6223 g->ops.debugger.post_events(fault_ch);
6224 }
6225 }
6226
6227 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
6228 gr_intr &= ~gr_intr_exception_pending_f();
6229
6230 if (need_reset) {
6231 nvgpu_err(g, "set gr exception notifier");
6232 gk20a_gr_set_error_notifier(g, &isr_data,
6233 NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
6234 }
6235 }
6236
6237 if (need_reset) {
6238 if (tsg != NULL) {
6239 gk20a_fifo_recover(g, gr_engine_id,
6240 tsgid, true, true, true,
6241 RC_TYPE_GR_FAULT);
6242 } else {
6243 if (ch != NULL) {
6244 nvgpu_err(g, "chid: %d referenceable but not "
6245 "bound to tsg", chid);
6246 }
6247 gk20a_fifo_recover(g, gr_engine_id,
6248 0, false, false, true,
6249 RC_TYPE_GR_FAULT);
6250 }
6251 }
6252
6253 if (gr_intr != 0U) {
6254 /* clear unhandled interrupts */
6255 if (ch == NULL) {
6256 /*
6257 * This is probably an interrupt during
6258 * gk20a_free_channel()
6259 */
6260 nvgpu_err(g, "unhandled gr intr 0x%08x for "
6261 "unreferenceable channel, clearing",
6262 gr_intr);
6263 } else {
6264 nvgpu_err(g, "unhandled gr intr 0x%08x for chid: %d",
6265 gr_intr, chid);
6266 }
6267 gk20a_writel(g, gr_intr_r(), gr_intr);
6268 }
6269
6270 gk20a_writel(g, gr_gpfifo_ctl_r(),
6271 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
6272 gr_gpfifo_ctl_semaphore_access_f(1));
6273
6274
6275 /* Posting of BPT events should be the last thing in this function */
6276 if ((global_esr != 0U) && (tsg != NULL)) {
6277 gk20a_gr_post_bpt_events(g, tsg, global_esr);
6278 }
6279
6280 if (ch) {
6281 gk20a_channel_put(ch);
6282 }
6283
6284 return 0;
6285}
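
/*
 * Illustrative sketch, not driver code: every interrupt source handled
 * above follows the same acknowledge pattern -- service the condition,
 * write the matching *_reset_f() value to gr_intr_r() to ack it in
 * hardware, then clear the bit in the local copy so that anything
 * still set at the end is reported (and cleared) as unhandled:
 *
 *	if (gr_intr & some_pending_f()) {
 *		handle_it(g, &isr_data);
 *		gk20a_writel(g, gr_intr_r(), some_reset_f());
 *		gr_intr &= ~some_pending_f();
 *	}
 *
 * "some_pending_f", "some_reset_f" and "handle_it" are placeholders,
 * not real accessors.
 */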
6286
6287u32 gk20a_gr_nonstall_isr(struct gk20a *g)
6288{
6289 u32 ops = 0;
6290 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
6291
6292 nvgpu_log(g, gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
6293
6294 if ((gr_intr & gr_intr_nonstall_trap_pending_f()) != 0U) {
6295 /* Clear the interrupt */
6296 gk20a_writel(g, gr_intr_nonstall_r(),
6297 gr_intr_nonstall_trap_pending_f());
6298 ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE |
6299 GK20A_NONSTALL_OPS_POST_EVENTS);
6300 }
6301 return ops;
6302}
6303
6304int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
6305{
6306 BUG_ON(size == NULL);
6307 return gr_gk20a_submit_fecs_method_op(g,
6308 (struct fecs_method_op_gk20a) {
6309 .mailbox.id = 0,
6310 .mailbox.data = 0,
6311 .mailbox.clr = ~0,
6312 .method.data = 1,
6313 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
6314 .mailbox.ret = size,
6315 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
6316 .mailbox.ok = 0,
6317 .cond.fail = GR_IS_UCODE_OP_SKIP,
6318 .mailbox.fail = 0}, false);
6319}
6320
6321int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g,
6322 struct nvgpu_mem *inst_block)
6323{
6324 u32 data = fecs_current_ctx_data(g, inst_block);
6325
6326 return gr_gk20a_submit_fecs_method_op(g,
6327 (struct fecs_method_op_gk20a){
6328 .mailbox.id = 4,
6329 .mailbox.data = data,
6330 .mailbox.clr = ~0,
6331 .method.data = 1,
6332 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
6333 .mailbox.ret = NULL,
6334 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6335 .mailbox.ok = 1,
6336 .cond.fail = GR_IS_UCODE_OP_SKIP,
6337 .mailbox.fail = 0}, false);
6338}
6339
6340int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
6341{
6342 return gr_gk20a_submit_fecs_method_op(g,
6343 (struct fecs_method_op_gk20a) {
6344 .mailbox.id = 4,
6345 .mailbox.data = u64_lo32(pmu_va >> 8),
6346 .mailbox.clr = ~0,
6347 .method.data = 1,
6348 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
6349 .mailbox.ret = NULL,
6350 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6351 .mailbox.ok = 1,
6352 .cond.fail = GR_IS_UCODE_OP_SKIP,
6353 .mailbox.fail = 0}, false);
6354}
6355
6356int gk20a_gr_suspend(struct gk20a *g)
6357{
6358 u32 ret = 0;
6359
6360 nvgpu_log_fn(g, " ");
6361
6362 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
6363 GR_IDLE_CHECK_DEFAULT);
6364 if (ret) {
6365 return ret;
6366 }
6367
6368 gk20a_writel(g, gr_gpfifo_ctl_r(),
6369 gr_gpfifo_ctl_access_disabled_f());
6370
6371 /* disable gr intr */
6372 gk20a_writel(g, gr_intr_r(), 0);
6373 gk20a_writel(g, gr_intr_en_r(), 0);
6374
6375 /* disable all exceptions */
6376 gk20a_writel(g, gr_exception_r(), 0);
6377 gk20a_writel(g, gr_exception_en_r(), 0);
6378 gk20a_writel(g, gr_exception1_r(), 0);
6379 gk20a_writel(g, gr_exception1_en_r(), 0);
6380 gk20a_writel(g, gr_exception2_r(), 0);
6381 gk20a_writel(g, gr_exception2_en_r(), 0);
6382
6383 gk20a_gr_flush_channel_tlb(&g->gr);
6384
6385 g->gr.initialized = false;
6386
6387 nvgpu_log_fn(g, "done");
6388 return ret;
6389}
6390
6391static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6392 u32 addr,
6393 bool is_quad, u32 quad,
6394 u32 *context_buffer,
6395 u32 context_buffer_size,
6396 u32 *priv_offset);
6397
6398static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
6399 u32 addr,
6400 u32 *priv_offset);
6401
6402/* This function will decode a priv address and return the partition type and numbers. */
6403int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
6404 enum ctxsw_addr_type *addr_type,
6405 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
6406 u32 *broadcast_flags)
6407{
6408 u32 gpc_addr;
6409
6410 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6411
6412 /* setup defaults */
6413 *addr_type = CTXSW_ADDR_TYPE_SYS;
6414 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
6415 *gpc_num = 0;
6416 *tpc_num = 0;
6417 *ppc_num = 0;
6418 *be_num = 0;
6419
6420 if (pri_is_gpc_addr(g, addr)) {
6421 *addr_type = CTXSW_ADDR_TYPE_GPC;
6422 gpc_addr = pri_gpccs_addr_mask(addr);
6423 if (pri_is_gpc_addr_shared(g, addr)) {
6424 *addr_type = CTXSW_ADDR_TYPE_GPC;
6425 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
6426 } else {
6427 *gpc_num = pri_get_gpc_num(g, addr);
6428 }
6429
6430 if (pri_is_ppc_addr(g, gpc_addr)) {
6431 *addr_type = CTXSW_ADDR_TYPE_PPC;
6432 if (pri_is_ppc_addr_shared(g, gpc_addr)) {
6433 *broadcast_flags |= PRI_BROADCAST_FLAGS_PPC;
6434 return 0;
6435 }
6436 }
6437 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
6438 *addr_type = CTXSW_ADDR_TYPE_TPC;
6439 if (pri_is_tpc_addr_shared(g, gpc_addr)) {
6440 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
6441 return 0;
6442 }
6443 *tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6444 }
6445 return 0;
6446 } else if (pri_is_be_addr(g, addr)) {
6447 *addr_type = CTXSW_ADDR_TYPE_BE;
6448 if (pri_is_be_addr_shared(g, addr)) {
6449 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
6450 return 0;
6451 }
6452 *be_num = pri_get_be_num(g, addr);
6453 return 0;
6454 } else if (g->ops.ltc.pri_is_ltc_addr(g, addr)) {
6455 *addr_type = CTXSW_ADDR_TYPE_LTCS;
6456 if (g->ops.ltc.is_ltcs_ltss_addr(g, addr)) {
6457 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTCS;
6458 } else if (g->ops.ltc.is_ltcn_ltss_addr(g, addr)) {
6459 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTSS;
6460 }
6461 return 0;
6462 } else if (pri_is_fbpa_addr(g, addr)) {
6463 *addr_type = CTXSW_ADDR_TYPE_FBPA;
6464 if (pri_is_fbpa_addr_shared(g, addr)) {
6465 *broadcast_flags |= PRI_BROADCAST_FLAGS_FBPA;
6466 return 0;
6467 }
6468 return 0;
6469 } else if ((g->ops.gr.is_egpc_addr != NULL) &&
6470 g->ops.gr.is_egpc_addr(g, addr)) {
6471 return g->ops.gr.decode_egpc_addr(g,
6472 addr, addr_type, gpc_num,
6473 tpc_num, broadcast_flags);
6474 } else {
6475 *addr_type = CTXSW_ADDR_TYPE_SYS;
6476 return 0;
6477 }
6478 /* PPC!?!?!?! */
6479
6480 /*NOTREACHED*/
6481 return -EINVAL;
6482}
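
/*
 * Illustrative summary of the decode above, based only on the branches
 * in this function (no specific addresses are spelled out here):
 *
 *	- GPC-shared (broadcast) address:
 *		*addr_type = CTXSW_ADDR_TYPE_GPC;
 *		*broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
 *	- unicast TPC address:
 *		*addr_type = CTXSW_ADDR_TYPE_TPC;
 *		*gpc_num = pri_get_gpc_num(g, addr);
 *		*tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
 *	- anything that matches no known unit falls through to
 *		*addr_type = CTXSW_ADDR_TYPE_SYS;
 */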
6483
6484void gr_gk20a_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
6485 u32 num_fbpas,
6486 u32 *priv_addr_table, u32 *t)
6487{
6488 u32 fbpa_id;
6489
6490 for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) {
6491 priv_addr_table[(*t)++] = pri_fbpa_addr(g,
6492 pri_fbpa_addr_mask(g, addr), fbpa_id);
6493 }
6494}
6495
6496int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
6497 u32 gpc_num,
6498 u32 *priv_addr_table, u32 *t)
6499{
6500 u32 ppc_num;
6501
6502 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6503
6504 for (ppc_num = 0; ppc_num < g->gr.gpc_ppc_count[gpc_num]; ppc_num++) {
6505 priv_addr_table[(*t)++] = pri_ppc_addr(g, pri_ppccs_addr_mask(addr),
6506 gpc_num, ppc_num);
6507 }
6508
6509 return 0;
6510}
6511
6512/*
6513 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
6514 * unicast addresses. This function will convert a BE unicast address to a BE
6515 * broadcast address and split a GPC/TPC broadcast address into a table of
6516 * GPC/TPC addresses. The addresses generated by this function can be
6517 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
6518 */
6519int gr_gk20a_create_priv_addr_table(struct gk20a *g,
6520 u32 addr,
6521 u32 *priv_addr_table,
6522 u32 *num_registers)
6523{
6524 enum ctxsw_addr_type addr_type;
6525 u32 gpc_num, tpc_num, ppc_num, be_num;
6526 u32 priv_addr, gpc_addr;
6527 u32 broadcast_flags;
6528 u32 t;
6529 int err;
6530
6531 t = 0;
6532 *num_registers = 0;
6533
6534 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6535
6536 err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
6537 &gpc_num, &tpc_num, &ppc_num, &be_num,
6538 &broadcast_flags);
6539 nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
6540 if (err != 0) {
6541 return err;
6542 }
6543
6544 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6545 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6546 /* The BE broadcast registers are included in the compressed PRI
6547 * table. Convert a BE unicast address to a broadcast address
6548 * so that we can look up the offset. */
6549 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
6550 ((broadcast_flags & PRI_BROADCAST_FLAGS_BE) == 0U)) {
6551 priv_addr_table[t++] = pri_be_shared_addr(g, addr);
6552 } else {
6553 priv_addr_table[t++] = addr;
6554 }
6555
6556 *num_registers = t;
6557 return 0;
6558 }
6559
6560 /* The GPC/TPC unicast registers are included in the compressed PRI
6561 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
6562 * that we can look up the offsets. */
6563 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
6564 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
6565
6566 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) {
6567 for (tpc_num = 0;
6568 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6569 tpc_num++) {
6570 priv_addr_table[t++] =
6571 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6572 gpc_num, tpc_num);
6573 }
6574
6575 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6576 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
6577 priv_addr_table, &t);
6578 if (err != 0) {
6579 return err;
6580 }
6581 } else {
6582 priv_addr = pri_gpc_addr(g,
6583 pri_gpccs_addr_mask(addr),
6584 gpc_num);
6585
6586 gpc_addr = pri_gpccs_addr_mask(priv_addr);
6587 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6588 if (tpc_num >= g->gr.gpc_tpc_count[gpc_num]) {
6589 continue;
6590 }
6591
6592 priv_addr_table[t++] = priv_addr;
6593 }
6594 }
6595 } else if (((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
6596 (addr_type == CTXSW_ADDR_TYPE_ETPC)) &&
6597 (g->ops.gr.egpc_etpc_priv_addr_table != NULL)) {
6598 nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type : EGPC/ETPC");
6599 g->ops.gr.egpc_etpc_priv_addr_table(g, addr, gpc_num, tpc_num,
6600 broadcast_flags, priv_addr_table, &t);
6601 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTSS) {
6602 g->ops.ltc.split_lts_broadcast_addr(g, addr,
6603 priv_addr_table, &t);
6604 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTCS) {
6605 g->ops.ltc.split_ltc_broadcast_addr(g, addr,
6606 priv_addr_table, &t);
6607 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_FBPA) {
6608 g->ops.gr.split_fbpa_broadcast_addr(g, addr,
6609 nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS),
6610 priv_addr_table, &t);
6611 } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) == 0U) {
6612 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) {
6613 for (tpc_num = 0;
6614 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6615 tpc_num++) {
6616 priv_addr_table[t++] =
6617 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6618 gpc_num, tpc_num);
6619 }
6620 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6621 err = gr_gk20a_split_ppc_broadcast_addr(g,
6622 addr, gpc_num, priv_addr_table, &t);
6623 } else {
6624 priv_addr_table[t++] = addr;
6625 }
6626 }
6627
6628 *num_registers = t;
6629 return 0;
6630}
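
/*
 * Worked example with hypothetical counts, not driver code: for a
 * TPC-broadcast address on a GPU with 2 GPCs and 2 TPCs per GPC, the
 * loops above emit 4 unicast entries:
 *
 *	for (gpc_num = 0; gpc_num < 2; gpc_num++)
 *		for (tpc_num = 0; tpc_num < 2; tpc_num++)
 *			priv_addr_table[t++] = pri_tpc_addr(g,
 *					pri_tpccs_addr_mask(addr),
 *					gpc_num, tpc_num);
 *
 * i.e. the register offset within the unit is preserved and only the
 * per-GPC/TPC base is rewritten for each entry.
 */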
6631
6632int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
6633 u32 addr,
6634 u32 max_offsets,
6635 u32 *offsets, u32 *offset_addrs,
6636 u32 *num_offsets,
6637 bool is_quad, u32 quad)
6638{
6639 u32 i;
6640 u32 priv_offset = 0;
6641 u32 *priv_registers;
6642 u32 num_registers = 0;
6643 int err = 0;
6644 struct gr_gk20a *gr = &g->gr;
6645 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6646 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6647 sm_per_tpc;
6648
6649 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6650
6651	/* implementation is crossed-up if either of these happens */
6652 if (max_offsets > potential_offsets) {
6653 nvgpu_log_fn(g, "max_offsets > potential_offsets");
6654 return -EINVAL;
6655 }
6656
6657 if (!g->gr.ctx_vars.golden_image_initialized) {
6658 return -ENODEV;
6659 }
6660
6661 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6662 if (priv_registers == NULL) {
6663 nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
6664 err = PTR_ERR(priv_registers);
6665 goto cleanup;
6666 }
6667 memset(offsets, 0, sizeof(u32) * max_offsets);
6668 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6669 *num_offsets = 0;
6670
6671 g->ops.gr.create_priv_addr_table(g, addr, &priv_registers[0],
6672 &num_registers);
6673
6674 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6675 nvgpu_log_fn(g, "max_offsets = %d, num_registers = %d",
6676 max_offsets, num_registers);
6677 err = -EINVAL;
6678 goto cleanup;
6679 }
6680
6681 if ((max_offsets == 1) && (num_registers > 1)) {
6682 num_registers = 1;
6683 }
6684
6685 if (g->gr.ctx_vars.local_golden_image == NULL) {
6686 nvgpu_log_fn(g, "no context switch header info to work with");
6687 err = -EINVAL;
6688 goto cleanup;
6689 }
6690
6691 for (i = 0; i < num_registers; i++) {
6692 err = gr_gk20a_find_priv_offset_in_buffer(g,
6693 priv_registers[i],
6694 is_quad, quad,
6695 g->gr.ctx_vars.local_golden_image,
6696 g->gr.ctx_vars.golden_image_size,
6697 &priv_offset);
6698 if (err != 0) {
6699 nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
6700 addr); /*, grPriRegStr(addr)));*/
6701 goto cleanup;
6702 }
6703
6704 offsets[i] = priv_offset;
6705 offset_addrs[i] = priv_registers[i];
6706 }
6707
6708 *num_offsets = num_registers;
6709cleanup:
6710 if (!IS_ERR_OR_NULL(priv_registers)) {
6711 nvgpu_kfree(g, priv_registers);
6712 }
6713
6714 return err;
6715}
6716
6717int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g,
6718 u32 addr,
6719 u32 max_offsets,
6720 u32 *offsets, u32 *offset_addrs,
6721 u32 *num_offsets)
6722{
6723 u32 i;
6724 u32 priv_offset = 0;
6725 u32 *priv_registers;
6726 u32 num_registers = 0;
6727 int err = 0;
6728 struct gr_gk20a *gr = &g->gr;
6729 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6730 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6731 sm_per_tpc;
6732
6733 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6734
6735	/* implementation is crossed-up if either of these happens */
6736 if (max_offsets > potential_offsets) {
6737 return -EINVAL;
6738 }
6739
6740 if (!g->gr.ctx_vars.golden_image_initialized) {
6741 return -ENODEV;
6742 }
6743
6744 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6745 if (priv_registers == NULL) {
6746 nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
6747 return -ENOMEM;
6748 }
6749 memset(offsets, 0, sizeof(u32) * max_offsets);
6750 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6751 *num_offsets = 0;
6752
6753 g->ops.gr.create_priv_addr_table(g, addr, priv_registers,
6754 &num_registers);
6755
6756 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6757 err = -EINVAL;
6758 goto cleanup;
6759 }
6760
6761 if ((max_offsets == 1) && (num_registers > 1)) {
6762 num_registers = 1;
6763 }
6764
6765 if (g->gr.ctx_vars.local_golden_image == NULL) {
6766 nvgpu_log_fn(g, "no context switch header info to work with");
6767 err = -EINVAL;
6768 goto cleanup;
6769 }
6770
6771 for (i = 0; i < num_registers; i++) {
6772 err = gr_gk20a_find_priv_offset_in_pm_buffer(g,
6773 priv_registers[i],
6774 &priv_offset);
6775 if (err != 0) {
6776 nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
6777 addr); /*, grPriRegStr(addr)));*/
6778 goto cleanup;
6779 }
6780
6781 offsets[i] = priv_offset;
6782 offset_addrs[i] = priv_registers[i];
6783 }
6784
6785 *num_offsets = num_registers;
6786cleanup:
6787 nvgpu_kfree(g, priv_registers);
6788
6789 return err;
6790}
6791
6792/* Set up some register tables. This looks hacky; our
6793 * register/offset functions are just that, functions.
6794 * So they can't be used as initializers... TBD: fix to
6795 * generate consts at least on an as-needed basis.
6796 */
6797static const u32 _num_ovr_perf_regs = 17;
6798static u32 _ovr_perf_regs[17] = { 0, };
6799/* Following are the blocks of registers that the ucode
6800 stores in the extended region.*/
6801
6802void gk20a_gr_init_ovr_sm_dsm_perf(void)
6803{
6804 if (_ovr_perf_regs[0] != 0) {
6805 return;
6806 }
6807
6808 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
6809 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
6810 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
6811 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
6812 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
6813 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
6814 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
6815 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
6816 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
6817 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
6818 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
6819 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
6820 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
6821 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
6822 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
6823 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
6824 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
6825
6826}
6827
6828/* TBD: we would like to handle this elsewhere, at a higher level.
6829 * These are currently constructed in a "test-then-write" style,
6830 * which makes it impossible to know externally whether a ctx
6831 * write will actually occur. So later we should put a lazy,
6832 * map-and-hold system in the patch write state. */
6833static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6834 struct channel_gk20a *ch,
6835 u32 addr, u32 data,
6836 struct nvgpu_mem *mem)
6837{
6838 u32 num_gpc = g->gr.gpc_count;
6839 u32 num_tpc;
6840 u32 tpc, gpc, reg;
6841 u32 chk_addr;
6842 u32 vaddr_lo;
6843 u32 vaddr_hi;
6844 u32 tmp;
6845 u32 num_ovr_perf_regs = 0;
6846 u32 *ovr_perf_regs = NULL;
6847 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6848 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6849 struct tsg_gk20a *tsg;
6850 struct nvgpu_gr_ctx *gr_ctx;
6851 struct nvgpu_mem *ctxheader = &ch->ctx_header;
6852
6853 tsg = tsg_gk20a_from_ch(ch);
6854 if (tsg == NULL) {
6855 return -EINVAL;
6856 }
6857
6858 gr_ctx = &tsg->gr_ctx;
6859 g->ops.gr.init_ovr_sm_dsm_perf();
6860 g->ops.gr.init_sm_dsm_reg_info();
6861 g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs);
6862
6863 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6864
6865 for (reg = 0; reg < num_ovr_perf_regs; reg++) {
6866 for (gpc = 0; gpc < num_gpc; gpc++) {
6867 num_tpc = g->gr.gpc_tpc_count[gpc];
6868 for (tpc = 0; tpc < num_tpc; tpc++) {
6869 chk_addr = ((gpc_stride * gpc) +
6870 (tpc_in_gpc_stride * tpc) +
6871 ovr_perf_regs[reg]);
6872 if (chk_addr != addr) {
6873 continue;
6874 }
6875				/* reset the patch count from previous
6876				   runs, if the ucode has already
6877				   processed it */
6878 tmp = nvgpu_mem_rd(g, mem,
6879 ctxsw_prog_main_image_patch_count_o());
6880
6881 if (tmp == 0U) {
6882 gr_ctx->patch_ctx.data_count = 0;
6883 }
6884
6885 gr_gk20a_ctx_patch_write(g, gr_ctx,
6886 addr, data, true);
6887
6888 vaddr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
6889 vaddr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
6890
6891 nvgpu_mem_wr(g, mem,
6892 ctxsw_prog_main_image_patch_count_o(),
6893 gr_ctx->patch_ctx.data_count);
6894 if (ctxheader->gpu_va) {
6895 nvgpu_mem_wr(g, ctxheader,
6896 ctxsw_prog_main_image_patch_adr_lo_o(),
6897 vaddr_lo);
6898 nvgpu_mem_wr(g, ctxheader,
6899 ctxsw_prog_main_image_patch_adr_hi_o(),
6900 vaddr_hi);
6901 } else {
6902 nvgpu_mem_wr(g, mem,
6903 ctxsw_prog_main_image_patch_adr_lo_o(),
6904 vaddr_lo);
6905 nvgpu_mem_wr(g, mem,
6906 ctxsw_prog_main_image_patch_adr_hi_o(),
6907 vaddr_hi);
6908 }
6909
6910				/* we're not caching these on the CPU side;
6911				   watch for this later */
6912 return 0;
6913 }
6914 }
6915 }
6916
6917 return 0;
6918}
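
/*
 * Summary sketch of the flow above, no additional behaviour: when addr
 * matches one of the per-GPC/TPC override perf registers, the write is
 * routed through the patch context:
 *
 *	if (nvgpu_mem_rd(g, mem, patch_count_o()) == 0)
 *		gr_ctx->patch_ctx.data_count = 0;  // ucode consumed old list
 *	gr_gk20a_ctx_patch_write(g, gr_ctx, addr, data, true);
 *	// then publish the new count and the patch buffer GPU VA
 *	// into the context image (or the ctx header if it has a VA)
 *
 * "patch_count_o()" abbreviates ctxsw_prog_main_image_patch_count_o()
 * here for readability only.
 */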
6919
6920#define ILLEGAL_ID ((u32)~0)
6921
6922static inline bool check_main_image_header_magic(u8 *context)
6923{
6924 u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());
6925 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
6926}
6927static inline bool check_local_header_magic(u8 *context)
6928{
6929 u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());
6930 return magic == ctxsw_prog_local_magic_value_v_value_v();
6931
6932}
6933
6934/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
6935static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
6936{
6937 return 256;
6938}
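
/*
 * Layout sketch, inferred from how these helpers are used later in
 * this file (not from separate documentation): the context image
 * starts with the FECS main header, followed by one 256-byte local
 * header per falcon, each tagged with a magic value:
 *
 *	u8 *context = (u8 *)context_buffer;
 *	check_main_image_header_magic(context);     // main header
 *	context += ctxsw_prog_ucode_header_size_in_bytes();
 *	check_local_header_magic(context);          // FECS local header
 *	context += ctxsw_prog_ucode_header_size_in_bytes();
 *	check_local_header_magic(context);          // GPC0 local header, ...
 */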
6939
6940void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
6941 u32 **ovr_perf_regs)
6942{
6943 *num_ovr_perf_regs = _num_ovr_perf_regs;
6944 *ovr_perf_regs = _ovr_perf_regs;
6945}
6946
6947static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6948 u32 addr,
6949 bool is_quad, u32 quad,
6950 u32 *context_buffer,
6951 u32 context_buffer_size,
6952 u32 *priv_offset)
6953{
6954 u32 i, data32;
6955 u32 gpc_num, tpc_num;
6956 u32 num_gpcs, num_tpcs;
6957 u32 chk_addr;
6958 u32 ext_priv_offset, ext_priv_size;
6959 u8 *context;
6960 u32 offset_to_segment, offset_to_segment_end;
6961 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6962 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
6963 u32 num_ext_gpccs_ext_buffer_segments;
6964 u32 inter_seg_offset;
6965 u32 max_tpc_count;
6966 u32 *sm_dsm_perf_ctrl_regs = NULL;
6967 u32 num_sm_dsm_perf_ctrl_regs = 0;
6968 u32 *sm_dsm_perf_regs = NULL;
6969 u32 num_sm_dsm_perf_regs = 0;
6970 u32 buffer_segments_size = 0;
6971 u32 marker_size = 0;
6972 u32 control_register_stride = 0;
6973 u32 perf_register_stride = 0;
6974 struct gr_gk20a *gr = &g->gr;
6975 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
6976 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6977 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
6978 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6979 u32 tpc_gpc_mask = (tpc_in_gpc_stride - 1);
6980
6981	/* The extended region only holds TPC registers, so if this is not
6982	   a TPC reg, return an error so the caller can look elsewhere. */
6983 if (pri_is_gpc_addr(g, addr)) {
6984 u32 gpc_addr = 0;
6985 gpc_num = pri_get_gpc_num(g, addr);
6986 gpc_addr = pri_gpccs_addr_mask(addr);
6987 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
6988 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6989 } else {
6990 return -EINVAL;
6991 }
6992
6993 nvgpu_log_info(g, " gpc = %d tpc = %d",
6994 gpc_num, tpc_num);
6995 } else if ((g->ops.gr.is_etpc_addr != NULL) &&
6996 g->ops.gr.is_etpc_addr(g, addr)) {
6997 g->ops.gr.get_egpc_etpc_num(g, addr, &gpc_num, &tpc_num);
6998 gpc_base = g->ops.gr.get_egpc_base(g);
6999 } else {
7000 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7001 "does not exist in extended region");
7002 return -EINVAL;
7003 }
7004
7005 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
7006 /* note below is in words/num_registers */
7007 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
7008
7009 context = (u8 *)context_buffer;
7010 /* sanity check main header */
7011 if (!check_main_image_header_magic(context)) {
7012 nvgpu_err(g,
7013 "Invalid main header: magic value");
7014 return -EINVAL;
7015 }
7016 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7017 if (gpc_num >= num_gpcs) {
7018 nvgpu_err(g,
7019 "GPC 0x%08x is greater than total count 0x%08x!",
7020 gpc_num, num_gpcs);
7021 return -EINVAL;
7022 }
7023
7024 data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());
7025 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
7026 if (0 == ext_priv_size) {
7027 nvgpu_log_info(g, " No extended memory in context buffer");
7028 return -EINVAL;
7029 }
7030 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
7031
7032 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
7033 offset_to_segment_end = offset_to_segment +
7034 (ext_priv_size * buffer_segments_size);
7035
7036 /* check local header magic */
7037 context += ctxsw_prog_ucode_header_size_in_bytes();
7038 if (!check_local_header_magic(context)) {
7039 nvgpu_err(g,
7040 "Invalid local header: magic value");
7041 return -EINVAL;
7042 }
7043
7044 /*
7045 * See if the incoming register address is in the first table of
7046 * registers. We check this by decoding only the TPC addr portion.
7047 * If we get a hit on the TPC bit, we then double check the address
7048 * by computing it from the base gpc/tpc strides. Then make sure
7049 * it is a real match.
7050 */
7051 g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
7052 &sm_dsm_perf_regs,
7053 &perf_register_stride);
7054
7055 g->ops.gr.init_sm_dsm_reg_info();
7056
7057 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
7058 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
7059 sm_dsm_perf_reg_id = i;
7060
7061 nvgpu_log_info(g, "register match: 0x%08x",
7062 sm_dsm_perf_regs[i]);
7063
7064 chk_addr = (gpc_base + gpc_stride * gpc_num) +
7065 tpc_in_gpc_base +
7066 (tpc_in_gpc_stride * tpc_num) +
7067 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask);
7068
7069 if (chk_addr != addr) {
7070 nvgpu_err(g,
7071					   "address mismatch: 0x%08x != 0x%08x",
7072 addr, chk_addr);
7073 return -EINVAL;
7074 }
7075 break;
7076 }
7077 }
7078
7079	/* Didn't find the reg in supported group 1,
7080	 * so try the second group now. */
7081 g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
7082 &sm_dsm_perf_ctrl_regs,
7083 &control_register_stride);
7084
7085 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
7086 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
7087 if ((addr & tpc_gpc_mask) ==
7088 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
7089 sm_dsm_perf_ctrl_reg_id = i;
7090
7091 nvgpu_log_info(g, "register match: 0x%08x",
7092 sm_dsm_perf_ctrl_regs[i]);
7093
7094 chk_addr = (gpc_base + gpc_stride * gpc_num) +
7095 tpc_in_gpc_base +
7096 tpc_in_gpc_stride * tpc_num +
7097 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
7098 tpc_gpc_mask);
7099
7100 if (chk_addr != addr) {
7101 nvgpu_err(g,
7102						   "address mismatch: 0x%08x != 0x%08x",
7103 addr, chk_addr);
7104 return -EINVAL;
7105
7106 }
7107
7108 break;
7109 }
7110 }
7111 }
7112
7113 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
7114 (ILLEGAL_ID == sm_dsm_perf_reg_id)) {
7115 return -EINVAL;
7116 }
7117
7118 /* Skip the FECS extended header, nothing there for us now. */
7119 offset_to_segment += buffer_segments_size;
7120
7121 /* skip through the GPCCS extended headers until we get to the data for
7122	 * our GPC. Each GPC's extended segment is sized to hold the
7123	 * max TPC count for the GPCs, in 256B chunks.
7124 */
7125
7126 max_tpc_count = gr->max_tpc_per_gpc_count;
7127
7128 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
7129
7130 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
7131 buffer_segments_size * gpc_num);
7132
7133 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7134
7135 /* skip the head marker to start with */
7136 inter_seg_offset = marker_size;
7137
7138 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
7139		/* skip over the control regs of the TPCs before the one we want,
7140		 * then skip to the register in this TPC */
7141 inter_seg_offset = inter_seg_offset +
7142 (tpc_num * control_register_stride) +
7143 sm_dsm_perf_ctrl_reg_id;
7144 } else {
7145 /* skip all the control registers */
7146 inter_seg_offset = inter_seg_offset +
7147 (num_tpcs * control_register_stride);
7148
7149 /* skip the marker between control and counter segments */
7150 inter_seg_offset += marker_size;
7151
7152 /* skip over counter regs of TPCs before the one we want */
7153 inter_seg_offset = inter_seg_offset +
7154 (tpc_num * perf_register_stride) *
7155 ctxsw_prog_extended_num_smpc_quadrants_v();
7156
7157		/* skip over the registers for the quadrants we do not want,
7158		 * then skip to the register in this TPC */
7159 inter_seg_offset = inter_seg_offset +
7160 (perf_register_stride * quad) +
7161 sm_dsm_perf_reg_id;
7162 }
7163
7164 /* set the offset to the segment offset plus the inter segment offset to
7165 * our register */
7166 offset_to_segment += (inter_seg_offset * 4);
7167
7168 /* last sanity check: did we somehow compute an offset outside the
7169 * extended buffer? */
7170 if (offset_to_segment > offset_to_segment_end) {
7171 nvgpu_err(g,
7172 "Overflow ctxsw buffer! 0x%08x > 0x%08x",
7173 offset_to_segment, offset_to_segment_end);
7174 return -EINVAL;
7175 }
7176
7177 *priv_offset = offset_to_segment;
7178
7179 return 0;
7180}
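
/*
 * Worked example of the arithmetic above for a control-register hit
 * (sm_dsm_perf_ctrl_reg_id valid); the 256 below is
 * ctxsw_prog_ucode_header_size_in_bytes(), all other values are
 * symbolic, not from a real chip:
 *
 *	offset_to_segment = ext_priv_offset * 256          // start of ext region
 *		+ buffer_segments_size                      // skip FECS ext header
 *		+ num_ext_gpccs_ext_buffer_segments
 *			* buffer_segments_size * gpc_num;   // skip earlier GPCs
 *	inter_seg_offset = marker_size
 *		+ tpc_num * control_register_stride
 *		+ sm_dsm_perf_ctrl_reg_id;                  // in 32-bit words
 *	*priv_offset = offset_to_segment + 4 * inter_seg_offset;
 *
 * i.e. segment-level skips are in bytes while the intra-segment walk
 * is in words, converted by the final multiply by 4.
 */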
7181
7182
7183static int
7184gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
7185 enum ctxsw_addr_type addr_type,
7186 u32 pri_addr,
7187 u32 gpc_num, u32 num_tpcs,
7188 u32 num_ppcs, u32 ppc_mask,
7189 u32 *priv_offset)
7190{
7191 u32 i;
7192 u32 address, base_address;
7193 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
7194 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
7195 struct aiv_gk20a *reg;
7196 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7197 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7198 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7199 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7200 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7201 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7202
7203 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
7204
7205 if (!g->gr.ctx_vars.valid) {
7206 return -EINVAL;
7207 }
7208
7209 /* Process the SYS/BE segment. */
7210 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7211 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7212 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
7213 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
7214 address = reg->addr;
7215 sys_offset = reg->index;
7216
7217 if (pri_addr == address) {
7218 *priv_offset = sys_offset;
7219 return 0;
7220 }
7221 }
7222 }
7223
7224 /* Process the TPC segment. */
7225 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7226 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7227 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
7228 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
7229 address = reg->addr;
7230 tpc_addr = pri_tpccs_addr_mask(address);
7231 base_address = gpc_base +
7232 (gpc_num * gpc_stride) +
7233 tpc_in_gpc_base +
7234 (tpc_num * tpc_in_gpc_stride);
7235 address = base_address + tpc_addr;
7236 /*
7237 * The data for the TPCs is interleaved in the context buffer.
7238 * Example with num_tpcs = 2
7239 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7240 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7241 */
7242 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7243
7244 if (pri_addr == address) {
7245 *priv_offset = tpc_offset;
7246 return 0;
7247 }
7248 }
7249 }
7250 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7251 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7252 if (g->ops.gr.get_egpc_base == NULL) {
7253 return -EINVAL;
7254 }
7255
7256 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7257 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.etpc.count; i++) {
7258 reg = &g->gr.ctx_vars.ctxsw_regs.etpc.l[i];
7259 address = reg->addr;
7260 tpc_addr = pri_tpccs_addr_mask(address);
7261 base_address = g->ops.gr.get_egpc_base(g) +
7262 (gpc_num * gpc_stride) +
7263 tpc_in_gpc_base +
7264 (tpc_num * tpc_in_gpc_stride);
7265 address = base_address + tpc_addr;
7266 /*
7267 * The data for the TPCs is interleaved in the context buffer.
7268 * Example with num_tpcs = 2
7269 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7270 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7271 */
7272 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7273
7274 if (pri_addr == address) {
7275 *priv_offset = tpc_offset;
7276 nvgpu_log(g,
7277 gpu_dbg_fn | gpu_dbg_gpu_dbg,
7278						"egpc/etpc priv_offset=0x%08x",
7279 *priv_offset);
7280 return 0;
7281 }
7282 }
7283 }
7284 }
7285
7286
7287 /* Process the PPC segment. */
7288 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7289 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
7290 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
7291 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
7292 address = reg->addr;
7293 ppc_addr = pri_ppccs_addr_mask(address);
7294 base_address = gpc_base +
7295 (gpc_num * gpc_stride) +
7296 ppc_in_gpc_base +
7297 (ppc_num * ppc_in_gpc_stride);
7298 address = base_address + ppc_addr;
7299 /*
7300 * The data for the PPCs is interleaved in the context buffer.
7301 * Example with numPpcs = 2
7302 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7303 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7304 */
7305 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
7306
7307 if (pri_addr == address) {
7308 *priv_offset = ppc_offset;
7309 return 0;
7310 }
7311 }
7312 }
7313 }
7314
7315
7316 /* Process the GPC segment. */
7317 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7318 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
7319 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
7320
7321 address = reg->addr;
7322 gpc_addr = pri_gpccs_addr_mask(address);
7323 gpc_offset = reg->index;
7324
7325 base_address = gpc_base + (gpc_num * gpc_stride);
7326 address = base_address + gpc_addr;
7327
7328 if (pri_addr == address) {
7329 *priv_offset = gpc_offset;
7330 return 0;
7331 }
7332 }
7333 }
7334 return -EINVAL;
7335}
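
/*
 * Worked example of the interleaving used above, with hypothetical
 * numbers: num_tpcs = 2, a register whose per-TPC list offset is
 * reg->index = 8 (the third 32-bit word), looked up for tpc_num = 1:
 *
 *	tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4)
 *	           = (8 * 2) + (1 * 4) = 20
 *
 * i.e. word 5 of the segment, which is the "1-2" slot (TPC 1, word 2)
 * in the layout diagram shown in the comments above.
 */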
7336
7337static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7338 u8 *context,
7339 u32 *num_ppcs, u32 *ppc_mask,
7340 u32 *reg_ppc_count)
7341{
7342 u32 data32;
7343 u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
7344
7345 /*
7346 * if there is only 1 PES_PER_GPC, then we put the PES registers
7347 * in the GPC reglist, so we can't error out if ppc.count == 0
7348 */
7349 if ((!g->gr.ctx_vars.valid) ||
7350 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
7351 (num_pes_per_gpc > 1))) {
7352 return -EINVAL;
7353 }
7354
7355 data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());
7356
7357 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
7358 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
7359
7360 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
7361
7362 return 0;
7363}
7364
7365int gr_gk20a_get_offset_in_gpccs_segment(struct gk20a *g,
7366 enum ctxsw_addr_type addr_type,
7367 u32 num_tpcs,
7368 u32 num_ppcs,
7369 u32 reg_list_ppc_count,
7370 u32 *__offset_in_segment)
7371{
7372 u32 offset_in_segment = 0;
7373 struct gr_gk20a *gr = &g->gr;
7374
7375 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7376 /*
7377 * reg = gr->ctx_vars.ctxsw_regs.tpc.l;
7378 * offset_in_segment = 0;
7379 */
7380 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7381 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7382 offset_in_segment =
7383 ((gr->ctx_vars.ctxsw_regs.tpc.count *
7384 num_tpcs) << 2);
7385
7386 nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg,
7387			"egpc etpc offset_in_segment 0x%08x",
7388 offset_in_segment);
7389 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7390 /*
7391 * The ucode stores TPC data before PPC data.
7392 * Advance offset past TPC data to PPC data.
7393 */
7394 offset_in_segment =
7395 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7396 gr->ctx_vars.ctxsw_regs.etpc.count) *
7397 num_tpcs) << 2);
7398 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7399 /*
7400 * The ucode stores TPC/PPC data before GPC data.
7401 * Advance offset past TPC/PPC data to GPC data.
7402 *
7403 * Note 1 PES_PER_GPC case
7404 */
7405 u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
7406 GPU_LIT_NUM_PES_PER_GPC);
7407 if (num_pes_per_gpc > 1) {
7408 offset_in_segment =
7409 ((((gr->ctx_vars.ctxsw_regs.tpc.count +
7410 gr->ctx_vars.ctxsw_regs.etpc.count) *
7411 num_tpcs) << 2) +
7412 ((reg_list_ppc_count * num_ppcs) << 2));
7413 } else {
7414 offset_in_segment =
7415 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7416 gr->ctx_vars.ctxsw_regs.etpc.count) *
7417 num_tpcs) << 2);
7418 }
7419 } else {
7420 nvgpu_log_fn(g, "Unknown address type.");
7421 return -EINVAL;
7422 }
7423
7424 *__offset_in_segment = offset_in_segment;
7425 return 0;
7426}
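
/*
 * Worked example with hypothetical register counts: the GPCCS segment
 * is laid out as TPC regs, then ETPC regs, then PPC regs, then GPC
 * regs. For addr_type == CTXSW_ADDR_TYPE_PPC with num_tpcs = 2,
 * ctxsw_regs.tpc.count = 10 and ctxsw_regs.etpc.count = 0:
 *
 *	offset_in_segment = ((10 + 0) * 2) << 2 = 80 bytes
 *
 * i.e. the PPC data begins right after the interleaved TPC data for
 * both TPCs.
 */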
7427
7428/*
7429 * This function will return the 32 bit offset for a priv register if it is
7430 * present in the context buffer. The context buffer is in CPU memory.
7431 */
7432static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7433 u32 addr,
7434 bool is_quad, u32 quad,
7435 u32 *context_buffer,
7436 u32 context_buffer_size,
7437 u32 *priv_offset)
7438{
7439 u32 i, data32;
7440 int err;
7441 enum ctxsw_addr_type addr_type;
7442 u32 broadcast_flags;
7443 u32 gpc_num, tpc_num, ppc_num, be_num;
7444 u32 num_gpcs, num_tpcs, num_ppcs;
7445 u32 offset;
7446 u32 sys_priv_offset, gpc_priv_offset;
7447 u32 ppc_mask, reg_list_ppc_count;
7448 u8 *context;
7449 u32 offset_to_segment, offset_in_segment = 0;
7450
7451 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
7452
7453 err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
7454 &gpc_num, &tpc_num, &ppc_num, &be_num,
7455 &broadcast_flags);
7456 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7457 "addr_type = %d, broadcast_flags: %08x",
7458 addr_type, broadcast_flags);
7459 if (err != 0) {
7460 return err;
7461 }
7462
7463 context = (u8 *)context_buffer;
7464 if (!check_main_image_header_magic(context)) {
7465 nvgpu_err(g,
7466 "Invalid main header: magic value");
7467 return -EINVAL;
7468 }
7469 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7470
7471 /* Parse the FECS local header. */
7472 context += ctxsw_prog_ucode_header_size_in_bytes();
7473 if (!check_local_header_magic(context)) {
7474 nvgpu_err(g,
7475 "Invalid FECS local header: magic value");
7476 return -EINVAL;
7477 }
7478 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7479 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7480 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset);
7481
7482 /* If the offset is found in the extended buffer, we are done.
7483 * If the lookup failed but we expected to find it there (quad
7484 * offset), return the error. Otherwise continue on.
7485 */
7486 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
7487 addr, is_quad, quad, context_buffer,
7488 context_buffer_size, priv_offset);
7489 if ((err == 0) || ((err != 0) && is_quad)) {
7490 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7491 "err = %d, is_quad = %s",
7492 err, is_quad ? "true" : "false");
7493 return err;
7494 }
7495
7496 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7497 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7498 /* Find the offset in the FECS segment. */
7499 offset_to_segment = sys_priv_offset *
7500 ctxsw_prog_ucode_header_size_in_bytes();
7501
7502 err = gr_gk20a_process_context_buffer_priv_segment(g,
7503 addr_type, addr,
7504 0, 0, 0, 0,
7505 &offset);
7506 if (err != 0) {
7507 return err;
7508 }
7509
7510 *priv_offset = (offset_to_segment + offset);
7511 return 0;
7512 }
7513
7514 if ((gpc_num + 1) > num_gpcs) {
7515 nvgpu_err(g,
7516 "GPC %d not in this context buffer.",
7517 gpc_num);
7518 return -EINVAL;
7519 }
7520
7521 /* Parse the GPCCS local header(s).*/
7522 for (i = 0; i < num_gpcs; i++) {
7523 context += ctxsw_prog_ucode_header_size_in_bytes();
7524 if (!check_local_header_magic(context)) {
7525 nvgpu_err(g,
7526 "Invalid GPCCS local header: magic value");
7527 return -EINVAL;
7528
7529 }
7530 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7531 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7532
7533 err = gr_gk20a_determine_ppc_configuration(g, context,
7534 &num_ppcs, &ppc_mask,
7535 &reg_list_ppc_count);
7536 if (err != 0) {
7537 nvgpu_err(g, "determine ppc configuration failed");
7538 return err;
7539 }
7540
7541
7542 num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
7543
7544 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
7545 nvgpu_err(g,
7546 "GPC %d TPC %d not in this context buffer.",
7547 gpc_num, tpc_num);
7548 return -EINVAL;
7549 }
7550
7551 /* Find the offset in the GPCCS segment.*/
7552 if (i == gpc_num) {
7553 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7554 "gpc_priv_offset 0x%#08x",
7555 gpc_priv_offset);
7556 offset_to_segment = gpc_priv_offset *
7557 ctxsw_prog_ucode_header_size_in_bytes();
7558
7559 err = g->ops.gr.get_offset_in_gpccs_segment(g,
7560 addr_type,
7561 num_tpcs, num_ppcs, reg_list_ppc_count,
7562 &offset_in_segment);
7563 if (err != 0) {
7564 return -EINVAL;
7565 }
7566
7567 offset_to_segment += offset_in_segment;
7568 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7569 "offset_to_segment 0x%#08x",
7570 offset_to_segment);
7571
7572 err = gr_gk20a_process_context_buffer_priv_segment(g,
7573 addr_type, addr,
7574 i, num_tpcs,
7575 num_ppcs, ppc_mask,
7576 &offset);
7577 if (err != 0) {
7578 return -EINVAL;
7579 }
7580
7581 *priv_offset = offset_to_segment + offset;
7582 return 0;
7583 }
7584 }
7585
7586 return -EINVAL;
7587}
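
/*
 * Note on the segment arithmetic above: the priv_register_ctl field in
 * each local header gives the segment offset in units of the ucode
 * header size, so it is scaled by ctxsw_prog_ucode_header_size_in_bytes()
 * (256 bytes as defined in hw_ctxsw_prog_gk20a.h) to get a byte offset
 * into the context buffer. For example (illustrative only),
 * sys_priv_offset = 3 would place the FECS priv register segment
 * 3 * 256 = 768 bytes into the image.
 */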
7588
7589static int map_cmp(const void *a, const void *b)
7590{
7591 struct ctxsw_buf_offset_map_entry *e1 =
7592 (struct ctxsw_buf_offset_map_entry *)a;
7593 struct ctxsw_buf_offset_map_entry *e2 =
7594 (struct ctxsw_buf_offset_map_entry *)b;
7595
7596 if (e1->addr < e2->addr) {
7597 return -1;
7598 }
7599
7600 if (e1->addr > e2->addr) {
7601 return 1;
7602 }
7603 return 0;
7604}
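
/*
 * map_cmp() is deliberately the single comparator shared by the sort()
 * call that builds the HWPM offset map and the bsearch() lookup that
 * consumes it, so the ordering used to build the table always matches
 * the ordering assumed when searching it.
 */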
7605
7606static int add_ctxsw_buffer_map_entries_pmsys(struct ctxsw_buf_offset_map_entry *map,
7607 struct aiv_list_gk20a *regs,
7608 u32 *count, u32 *offset,
7609 u32 max_cnt, u32 base, u32 mask)
7610{
7611 u32 idx;
7612 u32 cnt = *count;
7613 u32 off = *offset;
7614
7615 if ((cnt + regs->count) > max_cnt) {
7616 return -EINVAL;
7617 }
7618
7619 for (idx = 0; idx < regs->count; idx++) {
7620 if ((base + (regs->l[idx].addr & mask)) < 0xFFF) {
7621 map[cnt].addr = base + (regs->l[idx].addr & mask)
7622 + NV_PCFG_BASE;
7623 } else {
7624 map[cnt].addr = base + (regs->l[idx].addr & mask);
7625 }
7626 map[cnt++].offset = off;
7627 off += 4;
7628 }
7629 *count = cnt;
7630 *offset = off;
7631 return 0;
7632}
7633
7634static int add_ctxsw_buffer_map_entries_pmgpc(struct gk20a *g,
7635 struct ctxsw_buf_offset_map_entry *map,
7636 struct aiv_list_gk20a *regs,
7637 u32 *count, u32 *offset,
7638 u32 max_cnt, u32 base, u32 mask)
7639{
7640 u32 idx;
7641 u32 cnt = *count;
7642 u32 off = *offset;
7643
7644 if ((cnt + regs->count) > max_cnt) {
7645 return -EINVAL;
7646 }
7647
7648 /* NOTE: The PPC offsets get added to the pm_gpc list if numPpc <= 1.
7649 * To handle PPC registers that are folded into the GPC list this way,
7650 * the code below checks for PPC addresses and adds them using the
7651 * proper PPC mask instead of the GPC mask that was passed in.
7652 */
7653 for (idx = 0; idx < regs->count; idx++) {
7654 /* Check if the address is PPC address */
7655 if (pri_is_ppc_addr_shared(g, regs->l[idx].addr & mask)) {
7656 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g,
7657 GPU_LIT_PPC_IN_GPC_BASE);
7658 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
7659 GPU_LIT_PPC_IN_GPC_STRIDE);
7660 /* Use PPC mask instead of the GPC mask provided */
7661 u32 ppcmask = ppc_in_gpc_stride - 1;
7662
7663 map[cnt].addr = base + ppc_in_gpc_base
7664 + (regs->l[idx].addr & ppcmask);
7665 } else {
7666 map[cnt].addr = base + (regs->l[idx].addr & mask);
7667 }
7668 map[cnt++].offset = off;
7669 off += 4;
7670 }
7671 *count = cnt;
7672 *offset = off;
7673 return 0;
7674}
7675
7676static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map,
7677 struct aiv_list_gk20a *regs,
7678 u32 *count, u32 *offset,
7679 u32 max_cnt, u32 base, u32 mask)
7680{
7681 u32 idx;
7682 u32 cnt = *count;
7683 u32 off = *offset;
7684
7685 if ((cnt + regs->count) > max_cnt) {
7686 return -EINVAL;
7687 }
7688
7689 for (idx = 0; idx < regs->count; idx++) {
7690 map[cnt].addr = base + (regs->l[idx].addr & mask);
7691 map[cnt++].offset = off;
7692 off += 4;
7693 }
7694 *count = cnt;
7695 *offset = off;
7696 return 0;
7697}
7698
7699/* Helper function to add register entries to the register map for all
7700 * subunits
7701 */
7702static int add_ctxsw_buffer_map_entries_subunits(
7703 struct ctxsw_buf_offset_map_entry *map,
7704 struct aiv_list_gk20a *regs,
7705 u32 *count, u32 *offset,
7706 u32 max_cnt, u32 base,
7707 u32 num_units, u32 stride, u32 mask)
7708{
7709 u32 unit;
7710 u32 idx;
7711 u32 cnt = *count;
7712 u32 off = *offset;
7713
7714 if ((cnt + (regs->count * num_units)) > max_cnt) {
7715 return -EINVAL;
7716 }
7717
7718 /* Data is interleaved for units in ctxsw buffer */
7719 for (idx = 0; idx < regs->count; idx++) {
7720 for (unit = 0; unit < num_units; unit++) {
7721 map[cnt].addr = base + (regs->l[idx].addr & mask) +
7722 (unit * stride);
7723 map[cnt++].offset = off;
7724 off += 4;
7725 }
7726 }
7727 *count = cnt;
7728 *offset = off;
7729 return 0;
7730}
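
/*
 * Illustrative example of the interleaving above (hypothetical sizes):
 * with regs->count = 2 and num_units = 3, entries are emitted in the
 * order reg0/unit0, reg0/unit1, reg0/unit2, reg1/unit0, reg1/unit1,
 * reg1/unit2, and the ctxsw buffer offset advances by 4 bytes per
 * entry, i.e. 24 bytes in total.
 */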
7731
7732int gr_gk20a_add_ctxsw_reg_pm_fbpa(struct gk20a *g,
7733 struct ctxsw_buf_offset_map_entry *map,
7734 struct aiv_list_gk20a *regs,
7735 u32 *count, u32 *offset,
7736 u32 max_cnt, u32 base,
7737 u32 num_fbpas, u32 stride, u32 mask)
7738{
7739 return add_ctxsw_buffer_map_entries_subunits(map, regs, count, offset,
7740 max_cnt, base, num_fbpas, stride, mask);
7741}
7742
7743static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g,
7744 struct ctxsw_buf_offset_map_entry *map,
7745 u32 *count, u32 *offset, u32 max_cnt)
7746{
7747 u32 num_gpcs = g->gr.gpc_count;
7748 u32 num_ppcs, num_tpcs, gpc_num, base;
7749 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7750 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7751 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7752 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7753 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7754 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7755
7756 for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) {
7757 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7758 base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base;
7759 if (add_ctxsw_buffer_map_entries_subunits(map,
7760 &g->gr.ctx_vars.ctxsw_regs.pm_tpc,
7761 count, offset, max_cnt, base, num_tpcs,
7762 tpc_in_gpc_stride,
7763 (tpc_in_gpc_stride - 1))) {
7764 return -EINVAL;
7765 }
7766
7767 num_ppcs = g->gr.gpc_ppc_count[gpc_num];
7768 base = gpc_base + (gpc_stride * gpc_num) + ppc_in_gpc_base;
7769 if (add_ctxsw_buffer_map_entries_subunits(map,
7770 &g->gr.ctx_vars.ctxsw_regs.pm_ppc,
7771 count, offset, max_cnt, base, num_ppcs,
7772 ppc_in_gpc_stride,
7773 (ppc_in_gpc_stride - 1))) {
7774 return -EINVAL;
7775 }
7776
7777 base = gpc_base + (gpc_stride * gpc_num);
7778 if (add_ctxsw_buffer_map_entries_pmgpc(g, map,
7779 &g->gr.ctx_vars.ctxsw_regs.pm_gpc,
7780 count, offset, max_cnt, base,
7781 (gpc_stride - 1))) {
7782 return -EINVAL;
7783 }
7784
7785 base = NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE * gpc_num;
7786 if (add_ctxsw_buffer_map_entries(map,
7787 &g->gr.ctx_vars.ctxsw_regs.pm_ucgpc,
7788 count, offset, max_cnt, base, ~0)) {
7789 return -EINVAL;
7790 }
7791
7792 base = (g->ops.gr.get_pmm_per_chiplet_offset() * gpc_num);
7793 if (add_ctxsw_buffer_map_entries(map,
7794 &g->gr.ctx_vars.ctxsw_regs.perf_gpc,
7795 count, offset, max_cnt, base, ~0)) {
7796 return -EINVAL;
7797 }
7798
7799 base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num);
7800 if (add_ctxsw_buffer_map_entries(map,
7801 &g->gr.ctx_vars.ctxsw_regs.gpc_router,
7802 count, offset, max_cnt, base, ~0)) {
7803 return -EINVAL;
7804 }
7805
7806 /* Counter Aggregation Unit, if available */
7807 if (g->gr.ctx_vars.ctxsw_regs.pm_cau.count) {
7808 base = gpc_base + (gpc_stride * gpc_num)
7809 + tpc_in_gpc_base;
7810 if (add_ctxsw_buffer_map_entries_subunits(map,
7811 &g->gr.ctx_vars.ctxsw_regs.pm_cau,
7812 count, offset, max_cnt, base, num_tpcs,
7813 tpc_in_gpc_stride,
7814 (tpc_in_gpc_stride - 1))) {
7815 return -EINVAL;
7816 }
7817 }
7818
7819 *offset = ALIGN(*offset, 256);
7820 }
7821 return 0;
7822}
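
/*
 * Note: the ALIGN(*offset, 256) at the end of each iteration above is
 * what gives every GPC its own 256-byte-aligned region, matching the
 * "Per GPC allocated space is always 256 byte aligned" note in the
 * PM CTXSW BUFFER LAYOUT diagram below.
 */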
7823
7824int gr_gk20a_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map,
7825 struct aiv_list_gk20a *regs,
7826 u32 *count, u32 *offset,
7827 u32 max_cnt, u32 base, u32 mask)
7828{
7829 return add_ctxsw_buffer_map_entries(map, regs,
7830 count, offset, max_cnt, base, mask);
7831}
7832
7833/*
7834 * PM CTXSW BUFFER LAYOUT :
7835 *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE
7836 *| |
7837 *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words
7838 *|---------------------------------------------|
7839 *| |
7840 *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words
7841 *|---------------------------------------------|
7842 *| |
7843 *| LIST_compressed_nv_perf_ctx_reg_sysrouter|Space allocated: numRegs words
7844 *|---------------------------------------------|
7845 *| |
7846 *| LIST_compressed_nv_perf_ctx_reg_PMA |Space allocated: numRegs words
7847 *|---------------------------------------------|
7848 *| PADDING for 256 byte alignment |
7849 *|---------------------------------------------|<----256 byte aligned
7850 *| LIST_compressed_nv_perf_fbp_ctx_regs |
7851 *| |Space allocated: numRegs * n words (for n FB units)
7852 *|---------------------------------------------|
7853 *| LIST_compressed_nv_perf_fbprouter_ctx_regs |
7854 *| |Space allocated: numRegs * n words (for n FB units)
7855 *|---------------------------------------------|
7856 *| LIST_compressed_pm_fbpa_ctx_regs |
7857 *| |Space allocated: numRegs * n words (for n FB units)
7858 *|---------------------------------------------|
7859 *| LIST_compressed_pm_rop_ctx_regs |
7860 *|---------------------------------------------|
7861 *| LIST_compressed_pm_ltc_ctx_regs |
7862 *| LTC0 LTS0 |
7863 *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units)
7864 *| LTCn LTS0 |
7865 *| LTC0 LTS1 |
7866 *| LTC1 LTS1 |
7867 *| LTCn LTS1 |
7868 *| LTC0 LTSn |
7869 *| LTC1 LTSn |
7870 *| LTCn LTSn |
7871 *|---------------------------------------------|
7872 *| PADDING for 256 byte alignment |
7873 *|---------------------------------------------|<----256 byte aligned
7874 *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate
7875 *| REG0 TPC1 | all the GPC/TPC register lists
7876 *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned
7877 *| LIST_pm_ctx_reg_TPC REG1 TPC0 |
7878 *| * numTpcs REG1 TPC1 |
7879 *| LIST_pm_ctx_reg_PPC REG1 TPCn |
7880 *| * numPpcs REGn TPC0 |
7881 *| LIST_pm_ctx_reg_GPC REGn TPC1 |
7882 *| List_pm_ctx_reg_uc_GPC REGn TPCn |
7883 *| LIST_nv_perf_ctx_reg_GPC |
7884 *| LIST_nv_perf_gpcrouter_ctx_reg |
7885 *| LIST_nv_perf_ctx_reg_CAU |
7886 *| ---- |--
7887 *| GPC1 . |
7888 *| . |<----
7889 *|---------------------------------------------|
7890 *= =
7891 *| GPCn |
7892 *= =
7893 *|---------------------------------------------|
7894 */
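
/*
 * Sizing note for the map built below (illustrative arithmetic): every
 * mapped register consumes one 32-bit word of the PM ctxsw buffer, so
 * the worst-case number of map entries is pm_ctxsw_image_size >> 2.
 * For a hypothetical 64 KiB buffer that is 16384 entries, each holding
 * a register address and its byte offset within the buffer.
 */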
7895
7896static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g)
7897{
7898 u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size;
7899 u32 hwpm_ctxsw_reg_count_max;
7900 u32 map_size;
7901 u32 i, count = 0;
7902 u32 offset = 0;
7903 struct ctxsw_buf_offset_map_entry *map;
7904 u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
7905 u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
7906 u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
7907 u32 num_ltc = g->ops.gr.get_max_ltc_per_fbp(g) * g->gr.num_fbps;
7908
7909 if (hwpm_ctxsw_buffer_size == 0) {
7910 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7911 "no PM Ctxsw buffer memory in context buffer");
7912 return -EINVAL;
7913 }
7914
7915 hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2;
7916 map_size = hwpm_ctxsw_reg_count_max * sizeof(*map);
7917
7918 map = nvgpu_big_zalloc(g, map_size);
7919 if (map == NULL) {
7920 return -ENOMEM;
7921 }
7922
7923 /* Add entries from _LIST_pm_ctx_reg_SYS */
7924 if (add_ctxsw_buffer_map_entries_pmsys(map, &g->gr.ctx_vars.ctxsw_regs.pm_sys,
7925 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7926 goto cleanup;
7927 }
7928
7929 /* Add entries from _LIST_nv_perf_ctx_reg_SYS */
7930 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys,
7931 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7932 goto cleanup;
7933 }
7934
7935 /* Add entries from _LIST_nv_perf_sysrouter_ctx_reg*/
7936 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys_router,
7937 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7938 goto cleanup;
7939 }
7940
7941 /* Add entries from _LIST_nv_perf_pma_ctx_reg*/
7942 if (g->ops.gr.add_ctxsw_reg_perf_pma(map, &g->gr.ctx_vars.ctxsw_regs.perf_pma,
7943 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7944 goto cleanup;
7945 }
7946
7947 offset = ALIGN(offset, 256);
7948
7949 /* Add entries from _LIST_nv_perf_fbp_ctx_regs */
7950 if (add_ctxsw_buffer_map_entries_subunits(map,
7951 &g->gr.ctx_vars.ctxsw_regs.fbp,
7952 &count, &offset,
7953 hwpm_ctxsw_reg_count_max, 0,
7954 g->gr.num_fbps,
7955 g->ops.gr.get_pmm_per_chiplet_offset(),
7956 ~0)) {
7957 goto cleanup;
7958 }
7959
7960 /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */
7961 if (add_ctxsw_buffer_map_entries_subunits(map,
7962 &g->gr.ctx_vars.ctxsw_regs.fbp_router,
7963 &count, &offset,
7964 hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps,
7965 NV_PERF_PMM_FBP_ROUTER_STRIDE, ~0)) {
7966 goto cleanup;
7967 }
7968
7969 /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */
7970 if (g->ops.gr.add_ctxsw_reg_pm_fbpa(g, map,
7971 &g->gr.ctx_vars.ctxsw_regs.pm_fbpa,
7972 &count, &offset,
7973 hwpm_ctxsw_reg_count_max, 0,
7974 num_fbpas, fbpa_stride, ~0)) {
7975 goto cleanup;
7976 }
7977
7978 /* Add entries from _LIST_nv_pm_rop_ctx_regs */
7979 if (add_ctxsw_buffer_map_entries(map,
7980 &g->gr.ctx_vars.ctxsw_regs.pm_rop,
7981 &count, &offset,
7982 hwpm_ctxsw_reg_count_max, 0, ~0)) {
7983 goto cleanup;
7984 }
7985
7986 /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */
7987 if (add_ctxsw_buffer_map_entries_subunits(map,
7988 &g->gr.ctx_vars.ctxsw_regs.pm_ltc,
7989 &count, &offset,
7990 hwpm_ctxsw_reg_count_max, 0,
7991 num_ltc, ltc_stride, ~0)) {
7992 goto cleanup;
7993 }
7994
7995 offset = ALIGN(offset, 256);
7996
7997 /* Add GPC entries */
7998 if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset,
7999 hwpm_ctxsw_reg_count_max)) {
8000 goto cleanup;
8001 }
8002
8003 if (offset > hwpm_ctxsw_buffer_size) {
8004 nvgpu_err(g, "offset > buffer size");
8005 goto cleanup;
8006 }
8007
8008 sort(map, count, sizeof(*map), map_cmp, NULL);
8009
8010 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map;
8011 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count;
8012
8013 nvgpu_log_info(g, "Reg Addr => HWPM Ctxt switch buffer offset");
8014
8015 for (i = 0; i < count; i++) {
8016 nvgpu_log_info(g, "%08x => %08x", map[i].addr, map[i].offset);
8017 }
8018
8019 return 0;
8020cleanup:
8021 nvgpu_err(g, "Failed to create HWPM buffer offset map");
8022 nvgpu_big_free(g, map);
8023 return -EINVAL;
8024}
8025
8026/*
8027 * This function will return the 32 bit offset for a priv register if it is
8028 * present in the PM context buffer.
8029 */
8030static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
8031 u32 addr,
8032 u32 *priv_offset)
8033{
8034 struct gr_gk20a *gr = &g->gr;
8035 int err = 0;
8036 u32 count;
8037 struct ctxsw_buf_offset_map_entry *map, *result, map_key;
8038
8039 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
8040
8041 /* Create map of pri address and pm offset if necessary */
8042 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) {
8043 err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g);
8044 if (err != 0) {
8045 return err;
8046 }
8047 }
8048
8049 *priv_offset = 0;
8050
8051 map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map;
8052 count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count;
8053
8054 map_key.addr = addr;
8055 result = bsearch(&map_key, map, count, sizeof(*map), map_cmp);
8056
8057 if (result) {
8058 *priv_offset = result->offset;
8059 } else {
8060 nvgpu_err(g, "Lookup failed for address 0x%x", addr);
8061 err = -EINVAL;
8062 }
8063 return err;
8064}
8065
8066bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch)
8067{
8068 int curr_gr_ctx;
8069 u32 curr_gr_tsgid;
8070 struct gk20a *g = ch->g;
8071 struct channel_gk20a *curr_ch;
8072 bool ret = false;
8073 struct tsg_gk20a *tsg;
8074
8075 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
8076
8077 /* when contexts are unloaded from GR, the valid bit is reset
8078 * but the instance pointer information remains intact. So the
8079 * valid bit must be checked to be absolutely certain that a
8080 * valid context is currently resident.
8081 */
8082 if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) {
8083 return false;
8084 }
8085
8086 curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx,
8087 &curr_gr_tsgid);
8088
8089 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8090 "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
8091 " ch->chid=%d",
8092 (curr_ch != NULL) ? curr_ch->chid : U32_MAX,
8093 curr_gr_tsgid,
8094 ch->tsgid,
8095 ch->chid);
8096
8097 if (curr_ch == NULL) {
8098 return false;
8099 }
8100
8101 if (ch->chid == curr_ch->chid) {
8102 ret = true;
8103 }
8104
8105 tsg = tsg_gk20a_from_ch(ch);
8106 if ((tsg != NULL) && (tsg->tsgid == curr_gr_tsgid)) {
8107 ret = true;
8108 }
8109
8110 gk20a_channel_put(curr_ch);
8111 return ret;
8112}
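
/*
 * Note: both checks above treat the channel as resident. A chid match
 * means this exact channel owns the current context; a tsgid match is
 * sufficient as well because all channels in a TSG share a single
 * graphics context image.
 */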
8113
8114int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
8115 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
8116 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
8117 bool ch_is_curr_ctx)
8118{
8119 struct gk20a *g = ch->g;
8120 struct tsg_gk20a *tsg;
8121 struct nvgpu_gr_ctx *gr_ctx;
8122 bool gr_ctx_ready = false;
8123 bool pm_ctx_ready = false;
8124 struct nvgpu_mem *current_mem = NULL;
8125 u32 i, j, offset, v;
8126 struct gr_gk20a *gr = &g->gr;
8127 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8128 u32 max_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
8129 sm_per_tpc;
8130 u32 *offsets = NULL;
8131 u32 *offset_addrs = NULL;
8132 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
8133 int err = 0, pass;
8134
8135 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
8136 num_ctx_wr_ops, num_ctx_rd_ops);
8137
8138 tsg = tsg_gk20a_from_ch(ch);
8139 if (tsg == NULL) {
8140 return -EINVAL;
8141 }
8142
8143 gr_ctx = &tsg->gr_ctx;
8144
8145 if (ch_is_curr_ctx) {
8146 for (pass = 0; pass < 2; pass++) {
8147 ctx_op_nr = 0;
8148 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
8149 /* only do ctx ops and only on the right pass */
8150 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
8151 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
8152 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
8153 continue;
8154 }
8155
8156 /* if this is a quad access, setup for special access*/
8157 if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD))
8158 && (g->ops.gr.access_smpc_reg != NULL)) {
8159 g->ops.gr.access_smpc_reg(g,
8160 ctx_ops[i].quad,
8161 ctx_ops[i].offset);
8162 }
8163 offset = ctx_ops[i].offset;
8164
8165 if (pass == 0) { /* write pass */
8166 v = gk20a_readl(g, offset);
8167 v &= ~ctx_ops[i].and_n_mask_lo;
8168 v |= ctx_ops[i].value_lo;
8169 gk20a_writel(g, offset, v);
8170
8171 nvgpu_log(g, gpu_dbg_gpu_dbg,
8172 "direct wr: offset=0x%x v=0x%x",
8173 offset, v);
8174
8175 if (ctx_ops[i].op == REGOP(WRITE_64)) {
8176 v = gk20a_readl(g, offset + 4);
8177 v &= ~ctx_ops[i].and_n_mask_hi;
8178 v |= ctx_ops[i].value_hi;
8179 gk20a_writel(g, offset + 4, v);
8180
8181 nvgpu_log(g, gpu_dbg_gpu_dbg,
8182 "direct wr: offset=0x%x v=0x%x",
8183 offset + 4, v);
8184 }
8185
8186 } else { /* read pass */
8187 ctx_ops[i].value_lo =
8188 gk20a_readl(g, offset);
8189
8190 nvgpu_log(g, gpu_dbg_gpu_dbg,
8191 "direct rd: offset=0x%x v=0x%x",
8192 offset, ctx_ops[i].value_lo);
8193
8194 if (ctx_ops[i].op == REGOP(READ_64)) {
8195 ctx_ops[i].value_hi =
8196 gk20a_readl(g, offset + 4);
8197
8198 nvgpu_log(g, gpu_dbg_gpu_dbg,
8199 "direct rd: offset=0x%x v=0x%x",
8200 offset + 4, ctx_ops[i].value_hi);
8201 } else {
8202 ctx_ops[i].value_hi = 0;
8203 }
8204 }
8205 ctx_op_nr++;
8206 }
8207 }
8208 goto cleanup;
8209 }
8210
8211 /* they're the same size, so just use one alloc for both */
8212 offsets = nvgpu_kzalloc(g, 2 * sizeof(u32) * max_offsets);
8213 if (offsets == NULL) {
8214 err = -ENOMEM;
8215 goto cleanup;
8216 }
8217 offset_addrs = offsets + max_offsets;
8218
8219 err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);
8220 if (err != 0) {
8221 goto cleanup;
8222 }
8223
8224 g->ops.mm.l2_flush(g, true);
8225
8226 /* write to the appropriate place in the context image;
8227 * first we have to figure out where that really is */
8228
8229 /* first pass is writes, second reads */
8230 for (pass = 0; pass < 2; pass++) {
8231 ctx_op_nr = 0;
8232 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
8233 u32 num_offsets;
8234
8235 /* only do ctx ops and only on the right pass */
8236 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
8237 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
8238 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
8239 continue;
8240 }
8241
8242 err = gr_gk20a_get_ctx_buffer_offsets(g,
8243 ctx_ops[i].offset,
8244 max_offsets,
8245 offsets, offset_addrs,
8246 &num_offsets,
8247 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
8248 ctx_ops[i].quad);
8249 if (err == 0) {
8250 if (!gr_ctx_ready) {
8251 gr_ctx_ready = true;
8252 }
8253 current_mem = &gr_ctx->mem;
8254 } else {
8255 err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
8256 ctx_ops[i].offset,
8257 max_offsets,
8258 offsets, offset_addrs,
8259 &num_offsets);
8260 if (err != 0) {
8261 nvgpu_log(g, gpu_dbg_gpu_dbg,
8262 "ctx op invalid offset: offset=0x%x",
8263 ctx_ops[i].offset);
8264 ctx_ops[i].status =
8265 REGOP(STATUS_INVALID_OFFSET);
8266 continue;
8267 }
8268 if (!pm_ctx_ready) {
8269 /* Make sure ctx buffer was initialized */
8270 if (!nvgpu_mem_is_valid(&gr_ctx->pm_ctx.mem)) {
8271 nvgpu_err(g,
8272 "Invalid ctx buffer");
8273 err = -EINVAL;
8274 goto cleanup;
8275 }
8276 pm_ctx_ready = true;
8277 }
8278 current_mem = &gr_ctx->pm_ctx.mem;
8279 }
8280
8281 /* if this is a quad access, setup for special access*/
8282 if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)) &&
8283 (g->ops.gr.access_smpc_reg != NULL)) {
8284 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
8285 ctx_ops[i].offset);
8286 }
8287
8288 for (j = 0; j < num_offsets; j++) {
8289 /* Sanity check gr ctx offsets so that, in the
8290 * worst case, we never write outside the golden
8291 * context image. */
8292 if ((current_mem == &gr_ctx->mem) &&
8293 (offsets[j] >= g->gr.ctx_vars.golden_image_size)) {
8294 continue;
8295 }
8296 if (pass == 0) { /* write pass */
8297 v = nvgpu_mem_rd(g, current_mem, offsets[j]);
8298 v &= ~ctx_ops[i].and_n_mask_lo;
8299 v |= ctx_ops[i].value_lo;
8300 nvgpu_mem_wr(g, current_mem, offsets[j], v);
8301
8302 nvgpu_log(g, gpu_dbg_gpu_dbg,
8303 "context wr: offset=0x%x v=0x%x",
8304 offsets[j], v);
8305
8306 if (ctx_ops[i].op == REGOP(WRITE_64)) {
8307 v = nvgpu_mem_rd(g, current_mem, offsets[j] + 4);
8308 v &= ~ctx_ops[i].and_n_mask_hi;
8309 v |= ctx_ops[i].value_hi;
8310 nvgpu_mem_wr(g, current_mem, offsets[j] + 4, v);
8311
8312 nvgpu_log(g, gpu_dbg_gpu_dbg,
8313 "context wr: offset=0x%x v=0x%x",
8314 offsets[j] + 4, v);
8315 }
8316
8317 /* check to see if we need to add a special WAR
8318 * for some of the SMPC perf regs */
8319 gr_gk20a_ctx_patch_smpc(g, ch, offset_addrs[j],
8320 v, current_mem);
8321
8322 } else { /* read pass */
8323 ctx_ops[i].value_lo =
8324 nvgpu_mem_rd(g, current_mem, offsets[0]);
8325
8326 nvgpu_log(g, gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
8327 offsets[0], ctx_ops[i].value_lo);
8328
8329 if (ctx_ops[i].op == REGOP(READ_64)) {
8330 ctx_ops[i].value_hi =
8331 nvgpu_mem_rd(g, current_mem, offsets[0] + 4);
8332
8333 nvgpu_log(g, gpu_dbg_gpu_dbg,
8334 "context rd: offset=0x%x v=0x%x",
8335 offsets[0] + 4, ctx_ops[i].value_hi);
8336 } else {
8337 ctx_ops[i].value_hi = 0;
8338 }
8339 }
8340 }
8341 ctx_op_nr++;
8342 }
8343 }
8344
8345 cleanup:
8346 if (offsets) {
8347 nvgpu_kfree(g, offsets);
8348 }
8349
8350 if (gr_ctx->patch_ctx.mem.cpu_va) {
8351 gr_gk20a_ctx_patch_write_end(g, gr_ctx, gr_ctx_ready);
8352 }
8353
8354 return err;
8355}
8356
8357int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
8358 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
8359 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
8360 bool *is_curr_ctx)
8361{
8362 struct gk20a *g = ch->g;
8363 int err, tmp_err;
8364 bool ch_is_curr_ctx;
8365
8366 /* disable channel switching.
8367 * at that point the hardware state can be inspected to
8368 * determine if the context we're interested in is current.
8369 */
8370 err = gr_gk20a_disable_ctxsw(g);
8371 if (err != 0) {
8372 nvgpu_err(g, "unable to stop gr ctxsw");
8373 /* this should probably be ctx-fatal... */
8374 return err;
8375 }
8376
8377 ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
8378 if (is_curr_ctx != NULL) {
8379 *is_curr_ctx = ch_is_curr_ctx;
8380 }
8381 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
8382 ch_is_curr_ctx);
8383
8384 err = __gr_gk20a_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
8385 num_ctx_rd_ops, ch_is_curr_ctx);
8386
8387 tmp_err = gr_gk20a_enable_ctxsw(g);
8388 if (tmp_err) {
8389 nvgpu_err(g, "unable to restart ctxsw!");
8390 err = tmp_err;
8391 }
8392
8393 return err;
8394}
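
/*
 * Illustrative usage sketch (not taken from this file): reading a single
 * context register through the regops path. The field and macro names
 * mirror those used by gr_gk20a_set_sm_debug_mode() below; the register
 * chosen here is only an example.
 *
 *	struct nvgpu_dbg_reg_op op = {
 *		.op = REGOP(READ_32),
 *		.type = REGOP(TYPE_GR_CTX),
 *		.offset = gr_gpc0_tpc0_sm_dbgr_control0_r(),
 *	};
 *	int err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1, NULL);
 *
 * On success, op.value_lo then holds the 32-bit register value.
 */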
8395
8396void gr_gk20a_commit_global_pagepool(struct gk20a *g,
8397 struct nvgpu_gr_ctx *gr_ctx,
8398 u64 addr, u32 size, bool patch)
8399{
8400 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_base_r(),
8401 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
8402
8403 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_r(),
8404 gr_scc_pagepool_total_pages_f(size) |
8405 gr_scc_pagepool_valid_true_f(), patch);
8406
8407 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_base_r(),
8408 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
8409
8410 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_r(),
8411 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
8412
8413 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_pagepool_r(),
8414 gr_pd_pagepool_total_pages_f(size) |
8415 gr_pd_pagepool_valid_true_f(), patch);
8416}
8417
8418void gk20a_init_gr(struct gk20a *g)
8419{
8420 nvgpu_cond_init(&g->gr.init_wq);
8421}
8422
8423int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
8424 u32 global_esr_mask, bool check_errors)
8425{
8426 bool locked_down;
8427 bool no_error_pending;
8428 u32 delay = GR_IDLE_CHECK_DEFAULT;
8429 bool mmu_debug_mode_enabled = g->ops.fb.is_debug_mode_enabled(g);
8430 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8431 u32 dbgr_status0 = 0, dbgr_control0 = 0;
8432 u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
8433 struct nvgpu_timeout timeout;
8434 u32 warp_esr;
8435
8436 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
8437 "GPC%d TPC%d SM%d: locking down SM", gpc, tpc, sm);
8438
8439 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
8440 NVGPU_TIMER_CPU_TIMER);
8441
8442 /* wait for the sm to lock down */
8443 do {
8444 u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8445 gpc, tpc, sm);
8446 dbgr_status0 = gk20a_readl(g,
8447 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
8448
8449 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
8450
8451 locked_down =
8452 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
8453 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
8454 no_error_pending =
8455 check_errors &&
8456 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
8457 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
8458 ((global_esr & ~global_esr_mask) == 0);
8459
8460 if (locked_down || no_error_pending) {
8461 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
8462 "GPC%d TPC%d SM%d: locked down SM",
8463 gpc, tpc, sm);
8464 return 0;
8465 }
8466
8467 /* if an mmu fault is pending and mmu debug mode is not
8468 * enabled, the sm will never lock down. */
8469 if (!mmu_debug_mode_enabled &&
8470 (g->ops.mm.mmu_fault_pending(g))) {
8471 nvgpu_err(g,
8472 "GPC%d TPC%d: mmu fault pending,"
8473 " SM%d will never lock down!", gpc, tpc, sm);
8474 return -EFAULT;
8475 }
8476
8477 nvgpu_usleep_range(delay, delay * 2);
8478 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
8479 } while (nvgpu_timeout_expired(&timeout) == 0);
8480
8481 dbgr_control0 = gk20a_readl(g,
8482 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8483
8484 /* 64 bit read */
8485 warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32;
8486 warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset);
8487
8488 /* 64 bit read */
8489 warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32;
8490 warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset);
8491
8492 /* 64 bit read */
8493 warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32;
8494 warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset);
8495
8496 nvgpu_err(g,
8497 "GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc);
8498 nvgpu_err(g,
8499 "STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx",
8500 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0,
8501 warps_valid, warps_paused, warps_trapped);
8502
8503 return -ETIMEDOUT;
8504}
8505
8506void gk20a_gr_suspend_single_sm(struct gk20a *g,
8507 u32 gpc, u32 tpc, u32 sm,
8508 u32 global_esr_mask, bool check_errors)
8509{
8510 int err;
8511 u32 dbgr_control0;
8512 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8513
8514 /* if an SM debugger isn't attached, skip suspend */
8515 if (!g->ops.gr.sm_debugger_attached(g)) {
8516 nvgpu_err(g,
8517 "SM debugger not attached, skipping suspend!");
8518 return;
8519 }
8520
8521 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8522 "suspending gpc:%d, tpc:%d, sm%d", gpc, tpc, sm);
8523
8524 /* assert stop trigger. */
8525 dbgr_control0 = gk20a_readl(g,
8526 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8527 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8528 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset,
8529 dbgr_control0);
8530
8531 err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm,
8532 global_esr_mask, check_errors);
8533 if (err != 0) {
8534 nvgpu_err(g,
8535 "SuspendSm failed");
8536 return;
8537 }
8538}
8539
8540void gk20a_gr_suspend_all_sms(struct gk20a *g,
8541 u32 global_esr_mask, bool check_errors)
8542{
8543 struct gr_gk20a *gr = &g->gr;
8544 u32 gpc, tpc, sm;
8545 int err;
8546 u32 dbgr_control0;
8547 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8548
8549 /* if an SM debugger isn't attached, skip suspend */
8550 if (!g->ops.gr.sm_debugger_attached(g)) {
8551 nvgpu_err(g,
8552 "SM debugger not attached, skipping suspend!");
8553 return;
8554 }
8555
8556 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending all sms");
8557 /* assert stop trigger. uniformity assumption: all SMs will have
8558 * the same state in dbg_control0.
8559 */
8560 dbgr_control0 =
8561 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8562 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8563
8564 /* broadcast write */
8565 gk20a_writel(g,
8566 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8567
8568 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8569 for (tpc = 0; tpc < gr_gk20a_get_tpc_count(gr, gpc); tpc++) {
8570 for (sm = 0; sm < sm_per_tpc; sm++) {
8571 err = g->ops.gr.wait_for_sm_lock_down(g,
8572 gpc, tpc, sm,
8573 global_esr_mask, check_errors);
8574 if (err != 0) {
8575 nvgpu_err(g, "SuspendAllSms failed");
8576 return;
8577 }
8578 }
8579 }
8580 }
8581}
8582
8583void gk20a_gr_resume_single_sm(struct gk20a *g,
8584 u32 gpc, u32 tpc, u32 sm)
8585{
8586 u32 dbgr_control0;
8587 u32 offset;
8588 /*
8589 * The following requires some clarification. Despite the fact that both
8590 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8591 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8592 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8593 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8594 * (_DISABLE) as well.
8595 *
8596 * Advice from the arch group: Disable the stop trigger first, as a
8597 * separate operation, in order to ensure that the trigger has taken
8598 * effect, before enabling the run trigger.
8599 */
8600
8601 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8602
8603 /*De-assert stop trigger */
8604 dbgr_control0 =
8605 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8606 dbgr_control0 = set_field(dbgr_control0,
8607 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(),
8608 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f());
8609 gk20a_writel(g,
8610 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8611
8612 /* Run trigger */
8613 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8614 gk20a_writel(g,
8615 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8616}
8617
8618void gk20a_gr_resume_all_sms(struct gk20a *g)
8619{
8620 u32 dbgr_control0;
8621 /*
8622 * The following requires some clarification. Despite the fact that both
8623 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8624 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8625 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8626 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8627 * (_DISABLE) as well.
8628 *
8629 * Advice from the arch group: Disable the stop trigger first, as a
8630 * separate operation, in order to ensure that the trigger has taken
8631 * effect, before enabling the run trigger.
8632 */
8633
8634 /*De-assert stop trigger */
8635 dbgr_control0 =
8636 gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r());
8637 dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8638 gk20a_writel(g,
8639 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8640
8641 /* Run trigger */
8642 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8643 gk20a_writel(g,
8644 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8645}
8646
8647int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
8648 struct channel_gk20a *ch, u64 sms, bool enable)
8649{
8650 struct nvgpu_dbg_reg_op *ops;
8651 unsigned int i = 0, sm_id;
8652 int err;
8653 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8654 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8655
8656 ops = nvgpu_kcalloc(g, g->gr.no_of_sm, sizeof(*ops));
8657 if (ops == NULL) {
8658 return -ENOMEM;
8659 }
8660 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
8661 int gpc, tpc;
8662 u32 tpc_offset, gpc_offset, reg_offset, reg_mask, reg_val;
8663
8664 if ((sms & BIT64(sm_id)) == 0ULL) {
8665 continue;
8666 }
8667
8668 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8669 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8670
8671 tpc_offset = tpc_in_gpc_stride * tpc;
8672 gpc_offset = gpc_stride * gpc;
8673 reg_offset = tpc_offset + gpc_offset;
8674
8675 ops[i].op = REGOP(WRITE_32);
8676 ops[i].type = REGOP(TYPE_GR_CTX);
8677 ops[i].offset = gr_gpc0_tpc0_sm_dbgr_control0_r() + reg_offset;
8678
8679 reg_mask = 0;
8680 reg_val = 0;
8681 if (enable) {
8682 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8683 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f();
8684 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m();
8685 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f();
8686 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m();
8687 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f();
8688 } else {
8689 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8690 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f();
8691 }
8692
8693 ops[i].and_n_mask_lo = reg_mask;
8694 ops[i].value_lo = reg_val;
8695 i++;
8696 }
8697
8698 err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
8699 if (err != 0) {
8700 nvgpu_err(g, "Failed to access register");
8701 }
8702 nvgpu_kfree(g, ops);
8703 return err;
8704}
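
/*
 * Example (illustrative only): a caller that wants debugger mode on SM0
 * alone would pass sms = BIT64(0); passing ~0ULL selects every SM that
 * exists, since SM ids beyond g->gr.no_of_sm are never visited by the
 * loop above.
 */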
8705
8706/*
8707 * gr_gk20a_suspend_context()
8708 * This API must be called with the dbg_session lock held
8709 * and ctxsw disabled.
8710 * Returns true if the channel's context was resident,
8711 * false otherwise.
8712 */
8713bool gr_gk20a_suspend_context(struct channel_gk20a *ch)
8714{
8715 struct gk20a *g = ch->g;
8716 bool ctx_resident = false;
8717
8718 if (gk20a_is_channel_ctx_resident(ch)) {
8719 g->ops.gr.suspend_all_sms(g, 0, false);
8720 ctx_resident = true;
8721 } else {
8722 gk20a_disable_channel_tsg(g, ch);
8723 }
8724
8725 return ctx_resident;
8726}
8727
8728bool gr_gk20a_resume_context(struct channel_gk20a *ch)
8729{
8730 struct gk20a *g = ch->g;
8731 bool ctx_resident = false;
8732
8733 if (gk20a_is_channel_ctx_resident(ch)) {
8734 g->ops.gr.resume_all_sms(g);
8735 ctx_resident = true;
8736 } else {
8737 gk20a_enable_channel_tsg(g, ch);
8738 }
8739
8740 return ctx_resident;
8741}
8742
8743int gr_gk20a_suspend_contexts(struct gk20a *g,
8744 struct dbg_session_gk20a *dbg_s,
8745 int *ctx_resident_ch_fd)
8746{
8747 int local_ctx_resident_ch_fd = -1;
8748 bool ctx_resident;
8749 struct channel_gk20a *ch;
8750 struct dbg_session_channel_data *ch_data;
8751 int err = 0;
8752
8753 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8754
8755 err = gr_gk20a_disable_ctxsw(g);
8756 if (err != 0) {
8757 nvgpu_err(g, "unable to stop gr ctxsw");
8758 goto clean_up;
8759 }
8760
8761 nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
8762
8763 nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
8764 dbg_session_channel_data, ch_entry) {
8765 ch = g->fifo.channel + ch_data->chid;
8766
8767 ctx_resident = gr_gk20a_suspend_context(ch);
8768 if (ctx_resident) {
8769 local_ctx_resident_ch_fd = ch_data->channel_fd;
8770 }
8771 }
8772
8773 nvgpu_mutex_release(&dbg_s->ch_list_lock);
8774
8775 err = gr_gk20a_enable_ctxsw(g);
8776 if (err != 0) {
8777 nvgpu_err(g, "unable to restart ctxsw!");
8778 }
8779
8780 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8781
8782clean_up:
8783 nvgpu_mutex_release(&g->dbg_sessions_lock);
8784
8785 return err;
8786}
8787
8788int gr_gk20a_resume_contexts(struct gk20a *g,
8789 struct dbg_session_gk20a *dbg_s,
8790 int *ctx_resident_ch_fd)
8791{
8792 int local_ctx_resident_ch_fd = -1;
8793 bool ctx_resident;
8794 struct channel_gk20a *ch;
8795 int err = 0;
8796 struct dbg_session_channel_data *ch_data;
8797
8798 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8799
8800 err = gr_gk20a_disable_ctxsw(g);
8801 if (err != 0) {
8802 nvgpu_err(g, "unable to stop gr ctxsw");
8803 goto clean_up;
8804 }
8805
8806 nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
8807 dbg_session_channel_data, ch_entry) {
8808 ch = g->fifo.channel + ch_data->chid;
8809
8810 ctx_resident = gr_gk20a_resume_context(ch);
8811 if (ctx_resident) {
8812 local_ctx_resident_ch_fd = ch_data->channel_fd;
8813 }
8814 }
8815
8816 err = gr_gk20a_enable_ctxsw(g);
8817 if (err != 0) {
8818 nvgpu_err(g, "unable to restart ctxsw!");
8819 }
8820
8821 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8822
8823clean_up:
8824 nvgpu_mutex_release(&g->dbg_sessions_lock);
8825
8826 return err;
8827}
8828
8829int gr_gk20a_trigger_suspend(struct gk20a *g)
8830{
8831 int err = 0;
8832 u32 dbgr_control0;
8833
8834 /* assert stop trigger. uniformity assumption: all SMs will have
8835 * the same state in dbg_control0. */
8836 dbgr_control0 =
8837 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8838 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8839
8840 /* broadcast write */
8841 gk20a_writel(g,
8842 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8843
8844 return err;
8845}
8846
8847int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state)
8848{
8849 int err = 0;
8850 struct gr_gk20a *gr = &g->gr;
8851 u32 gpc, tpc, sm, sm_id;
8852 u32 global_mask;
8853
8854 /* Wait for the SMs to reach full stop. This condition is:
8855 * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE)
8856 * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp
8857 * masks.
8858 */
8859 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
8860
8861 /* Lock down all SMs */
8862 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8863
8864 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8865 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8866 sm = g->gr.sm_to_cluster[sm_id].sm_index;
8867
8868 err = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
8869 global_mask, false);
8870 if (err != 0) {
8871 nvgpu_err(g, "sm did not lock down!");
8872 return err;
8873 }
8874 }
8875
8876 /* Read the warp status */
8877 g->ops.gr.bpt_reg_info(g, w_state);
8878
8879 return 0;
8880}
8881
8882int gr_gk20a_resume_from_pause(struct gk20a *g)
8883{
8884 int err = 0;
8885 u32 reg_val;
8886
8887 /* Clear the pause mask to tell the GPU we want to resume everyone */
8888 gk20a_writel(g,
8889 gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(), 0);
8890
8891 /* explicitly re-enable forwarding of SM interrupts upon any resume */
8892 reg_val = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
8893 reg_val |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
8894 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), reg_val);
8895
8896 /* Now resume all SMs: write a 0 to the stop trigger,
8897 * then a 1 to the run trigger */
8898 g->ops.gr.resume_all_sms(g);
8899
8900 return err;
8901}
8902
8903int gr_gk20a_clear_sm_errors(struct gk20a *g)
8904{
8905 int ret = 0;
8906 u32 gpc, tpc, sm;
8907 struct gr_gk20a *gr = &g->gr;
8908 u32 global_esr;
8909 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8910
8911 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8912
8913 /* check if any tpc has an exception */
8914 for (tpc = 0; tpc < gr->gpc_tpc_count[gpc]; tpc++) {
8915
8916 for (sm = 0; sm < sm_per_tpc; sm++) {
8917 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8918 gpc, tpc, sm);
8919
8920 /* clearing HWWs also causes the TPC and GPC
8921 * exceptions to be cleared
8922 */
8923 g->ops.gr.clear_sm_hww(g,
8924 gpc, tpc, sm, global_esr);
8925 }
8926 }
8927 }
8928
8929 return ret;
8930}
8931
8932u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g)
8933{
8934 struct gr_gk20a *gr = &g->gr;
8935 u32 sm_id, tpc_exception_en = 0;
8936 u32 offset, regval, tpc_offset, gpc_offset;
8937 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8938 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8939
8940 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8941
8942 tpc_offset = tpc_in_gpc_stride * g->gr.sm_to_cluster[sm_id].tpc_index;
8943 gpc_offset = gpc_stride * g->gr.sm_to_cluster[sm_id].gpc_index;
8944 offset = tpc_offset + gpc_offset;
8945
8946 regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
8947 offset);
8948 /* Each bit represents the corresponding SM's enable state; bit 0 corresponds to SM0 */
8949 tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id;
8950 }
8951
8952 return tpc_exception_en;
8953}
8954
8955u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
8956{
8957 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8958 u32 hww_warp_esr = gk20a_readl(g,
8959 gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
8960 return hww_warp_esr;
8961}
8962
8963u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
8964{
8965 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8966
8967 u32 hww_global_esr = gk20a_readl(g,
8968 gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
8969
8970 return hww_global_esr;
8971}
8972
8973u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g)
8974{
8975 /*
8976 * These three interrupts don't require locking down the SM. They can
8977 * be handled by usermode clients as they aren't fatal. Additionally,
8978 * usermode clients may wish to allow some warps to execute while others
8979 * are at breakpoints, as opposed to fatal errors where all warps should
8980 * halt.
8981 */
8982 u32 global_esr_mask =
8983 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
8984 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
8985 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
8986
8987 return global_esr_mask;
8988}
8989
8990/* invalidate channel lookup tlb */
8991void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr)
8992{
8993 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
8994 memset(gr->chid_tlb, 0,
8995 sizeof(struct gr_channel_map_tlb_entry) *
8996 GR_CHANNEL_MAP_TLB_SIZE);
8997 nvgpu_spinlock_release(&gr->ch_tlb_lock);
8998}