path: root/include/gk20a/gr_gk20a.c
author     Joshua Bakita <bakitajoshua@gmail.com>  2024-09-25 16:09:09 -0400
committer  Joshua Bakita <bakitajoshua@gmail.com>  2024-09-25 16:09:09 -0400
commit     f347fde22f1297e4f022600d201780d5ead78114 (patch)
tree       76be305d6187003a1e0486ff6e91efb1062ae118 /include/gk20a/gr_gk20a.c
parent     8340d234d78a7d0f46c11a584de538148b78b7cb (diff)
Delete no-longer-needed nvgpu headers (HEAD, master, jbakita-wip)
The dependency on these was removed in commit 8340d234.
Diffstat (limited to 'include/gk20a/gr_gk20a.c')
-rw-r--r--  include/gk20a/gr_gk20a.c  9090
1 file changed, 0 insertions, 9090 deletions
diff --git a/include/gk20a/gr_gk20a.c b/include/gk20a/gr_gk20a.c
deleted file mode 100644
index 1eda853..0000000
--- a/include/gk20a/gr_gk20a.c
+++ /dev/null
@@ -1,9090 +0,0 @@
1/*
2 * GK20A Graphics
3 *
4 * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/dma.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/gmmu.h>
28#include <nvgpu/timers.h>
29#include <nvgpu/nvgpu_common.h>
30#include <nvgpu/log.h>
31#include <nvgpu/bsearch.h>
32#include <nvgpu/sort.h>
33#include <nvgpu/bug.h>
34#include <nvgpu/firmware.h>
35#include <nvgpu/enabled.h>
36#include <nvgpu/debug.h>
37#include <nvgpu/barrier.h>
38#include <nvgpu/mm.h>
39#include <nvgpu/ctxsw_trace.h>
40#include <nvgpu/error_notifier.h>
41#include <nvgpu/ecc.h>
42#include <nvgpu/io.h>
43#include <nvgpu/utils.h>
44#include <nvgpu/channel.h>
45#include <nvgpu/unit.h>
46#include <nvgpu/power_features/pg.h>
47#include <nvgpu/power_features/cg.h>
48
49#include "gk20a.h"
50#include "gr_gk20a.h"
51#include "gk20a/fecs_trace_gk20a.h"
52#include "gr_ctx_gk20a.h"
53#include "gr_pri_gk20a.h"
54#include "regops_gk20a.h"
55#include "dbg_gpu_gk20a.h"
56
57#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
58#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
59#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
60#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
61#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
62#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
63#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
64#include <nvgpu/hw/gk20a/hw_pri_ringmaster_gk20a.h>
65#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
66#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
67
68#define BLK_SIZE (256)
69#define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200
70#define NV_PERF_PMMGPCROUTER_STRIDE 0x0200
71#define NV_PCFG_BASE 0x00088000
72#define NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE 0x0020
73#define FE_PWR_MODE_TIMEOUT_MAX 2000
74#define FE_PWR_MODE_TIMEOUT_DEFAULT 10
75#define CTXSW_MEM_SCRUBBING_TIMEOUT_MAX 1000
76#define CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT 10
77#define FECS_ARB_CMD_TIMEOUT_MAX 40
78#define FECS_ARB_CMD_TIMEOUT_DEFAULT 2
79
80static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
81
82static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g,
83 struct vm_gk20a *vm,
84 struct nvgpu_gr_ctx *gr_ctx);
85
86/* channel patch ctx buffer */
87static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
88 struct channel_gk20a *c);
89static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g,
90 struct vm_gk20a *vm,
91 struct nvgpu_gr_ctx *gr_ctx);
92
93/* golden ctx image */
94static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
95 struct channel_gk20a *c);
96
97int gr_gk20a_get_ctx_id(struct gk20a *g,
98 struct channel_gk20a *c,
99 u32 *ctx_id)
100{
101 struct tsg_gk20a *tsg;
102 struct nvgpu_gr_ctx *gr_ctx = NULL;
103 struct nvgpu_mem *mem = NULL;
104
105 tsg = tsg_gk20a_from_ch(c);
106 if (tsg == NULL) {
107 return -EINVAL;
108 }
109
110 gr_ctx = &tsg->gr_ctx;
111 mem = &gr_ctx->mem;
112
113 /* Channel gr_ctx buffer is gpu cacheable.
114 Flush and invalidate before cpu update. */
115 g->ops.mm.l2_flush(g, true);
116
117 *ctx_id = nvgpu_mem_rd(g, mem,
118 ctxsw_prog_main_image_context_id_o());
119 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "ctx_id: 0x%x", *ctx_id);
120
121 return 0;
122}
123
124void gk20a_gpccs_dump_falcon_stats(struct gk20a *g)
125{
126 unsigned int i;
127
128 nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqstat : %d",
129 gk20a_readl(g, gr_gpc0_gpccs_falcon_irqstat_r()));
130 nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqmode : %d",
131 gk20a_readl(g, gr_gpc0_gpccs_falcon_irqmode_r()));
132 nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqmask : %d",
133 gk20a_readl(g, gr_gpc0_gpccs_falcon_irqmask_r()));
134 nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqdest : %d",
135 gk20a_readl(g, gr_gpc0_gpccs_falcon_irqdest_r()));
136 nvgpu_err(g, "gr_gpc0_gpccs_falcon_debug1 : %d",
137 gk20a_readl(g, gr_gpc0_gpccs_falcon_debug1_r()));
138 nvgpu_err(g, "gr_gpc0_gpccs_falcon_debuginfo : %d",
139 gk20a_readl(g, gr_gpc0_gpccs_falcon_debuginfo_r()));
140 nvgpu_err(g, "gr_gpc0_gpccs_falcon_engctl : %d",
141 gk20a_readl(g, gr_gpc0_gpccs_falcon_engctl_r()));
142 nvgpu_err(g, "gr_gpc0_gpccs_falcon_curctx : %d",
143 gk20a_readl(g, gr_gpc0_gpccs_falcon_curctx_r()));
144 nvgpu_err(g, "gr_gpc0_gpccs_falcon_nxtctx : %d",
145 gk20a_readl(g, gr_gpc0_gpccs_falcon_nxtctx_r()));
146 nvgpu_err(g, "gr_gpc0_gpccs_ctxsw_status_1 : %d",
147 gk20a_readl(g, gr_gpc0_gpccs_ctxsw_status_1_r()));
148
149 for (i = 0; i < g->ops.gr.gpc0_gpccs_ctxsw_mailbox_size(); i++) {
150 nvgpu_err(g, "gr_gpc0_gpccs_ctxsw_mailbox_r(%d) : 0x%x",
151 i, gk20a_readl(g, gr_gpc0_gpccs_ctxsw_mailbox_r(i)));
152 }
153
154
155 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
156 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
157 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
158 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_IMB : 0x%x",
159 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
160
161 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
162 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
163 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
164 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_DMB : 0x%x",
165 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
166
167 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
168 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
169 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
170 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_CSW : 0x%x",
171 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
172
173 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
174 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
175 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
176 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_CTX : 0x%x",
177 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
178
179 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
180 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
181 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
182 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_EXCI : 0x%x",
183 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
184
185
186 for (i = 0; i < 4U; i++) {
187 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
188 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
189 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_PC));
190 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_PC : 0x%x",
191 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
192
193 gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
194 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
195 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_SP));
196 nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_SP : 0x%x",
197 gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
198 }
199}
200
201void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
202{
203 unsigned int i;
204
205 nvgpu_err(g, "gr_fecs_os_r : %d",
206 gk20a_readl(g, gr_fecs_os_r()));
207 nvgpu_err(g, "gr_fecs_cpuctl_r : 0x%x",
208 gk20a_readl(g, gr_fecs_cpuctl_r()));
209 nvgpu_err(g, "gr_fecs_idlestate_r : 0x%x",
210 gk20a_readl(g, gr_fecs_idlestate_r()));
211 nvgpu_err(g, "gr_fecs_mailbox0_r : 0x%x",
212 gk20a_readl(g, gr_fecs_mailbox0_r()));
213 nvgpu_err(g, "gr_fecs_mailbox1_r : 0x%x",
214 gk20a_readl(g, gr_fecs_mailbox1_r()));
215 nvgpu_err(g, "gr_fecs_irqstat_r : 0x%x",
216 gk20a_readl(g, gr_fecs_irqstat_r()));
217 nvgpu_err(g, "gr_fecs_irqmode_r : 0x%x",
218 gk20a_readl(g, gr_fecs_irqmode_r()));
219 nvgpu_err(g, "gr_fecs_irqmask_r : 0x%x",
220 gk20a_readl(g, gr_fecs_irqmask_r()));
221 nvgpu_err(g, "gr_fecs_irqdest_r : 0x%x",
222 gk20a_readl(g, gr_fecs_irqdest_r()));
223 nvgpu_err(g, "gr_fecs_debug1_r : 0x%x",
224 gk20a_readl(g, gr_fecs_debug1_r()));
225 nvgpu_err(g, "gr_fecs_debuginfo_r : 0x%x",
226 gk20a_readl(g, gr_fecs_debuginfo_r()));
227 nvgpu_err(g, "gr_fecs_ctxsw_status_1_r : 0x%x",
228 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
229
230 for (i = 0; i < g->ops.gr.fecs_ctxsw_mailbox_size(); i++) {
231 nvgpu_err(g, "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
232 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
233 }
234
235 nvgpu_err(g, "gr_fecs_engctl_r : 0x%x",
236 gk20a_readl(g, gr_fecs_engctl_r()));
237 nvgpu_err(g, "gr_fecs_curctx_r : 0x%x",
238 gk20a_readl(g, gr_fecs_curctx_r()));
239 nvgpu_err(g, "gr_fecs_nxtctx_r : 0x%x",
240 gk20a_readl(g, gr_fecs_nxtctx_r()));
241
242 gk20a_writel(g, gr_fecs_icd_cmd_r(),
243 gr_fecs_icd_cmd_opc_rreg_f() |
244 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
245 nvgpu_err(g, "FECS_FALCON_REG_IMB : 0x%x",
246 gk20a_readl(g, gr_fecs_icd_rdata_r()));
247
248 gk20a_writel(g, gr_fecs_icd_cmd_r(),
249 gr_fecs_icd_cmd_opc_rreg_f() |
250 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
251 nvgpu_err(g, "FECS_FALCON_REG_DMB : 0x%x",
252 gk20a_readl(g, gr_fecs_icd_rdata_r()));
253
254 gk20a_writel(g, gr_fecs_icd_cmd_r(),
255 gr_fecs_icd_cmd_opc_rreg_f() |
256 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
257 nvgpu_err(g, "FECS_FALCON_REG_CSW : 0x%x",
258 gk20a_readl(g, gr_fecs_icd_rdata_r()));
259
260 gk20a_writel(g, gr_fecs_icd_cmd_r(),
261 gr_fecs_icd_cmd_opc_rreg_f() |
262 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
263 nvgpu_err(g, "FECS_FALCON_REG_CTX : 0x%x",
264 gk20a_readl(g, gr_fecs_icd_rdata_r()));
265
266 gk20a_writel(g, gr_fecs_icd_cmd_r(),
267 gr_fecs_icd_cmd_opc_rreg_f() |
268 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
269 nvgpu_err(g, "FECS_FALCON_REG_EXCI : 0x%x",
270 gk20a_readl(g, gr_fecs_icd_rdata_r()));
271
272 for (i = 0; i < 4; i++) {
273 gk20a_writel(g, gr_fecs_icd_cmd_r(),
274 gr_fecs_icd_cmd_opc_rreg_f() |
275 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
276 nvgpu_err(g, "FECS_FALCON_REG_PC : 0x%x",
277 gk20a_readl(g, gr_fecs_icd_rdata_r()));
278
279 gk20a_writel(g, gr_fecs_icd_cmd_r(),
280 gr_fecs_icd_cmd_opc_rreg_f() |
281 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
282 nvgpu_err(g, "FECS_FALCON_REG_SP : 0x%x",
283 gk20a_readl(g, gr_fecs_icd_rdata_r()));
284 }
285}
286
287static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
288{
289 u32 i, ucode_u32_size;
290 const u32 *ucode_u32_data;
291 u32 checksum;
292
293 nvgpu_log_fn(g, " ");
294
295 gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
296 gr_gpccs_dmemc_blk_f(0) |
297 gr_gpccs_dmemc_aincw_f(1)));
298
299 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
300 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
301
302 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
303 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
304 checksum += ucode_u32_data[i];
305 }
306
307 gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
308 gr_fecs_dmemc_blk_f(0) |
309 gr_fecs_dmemc_aincw_f(1)));
310
311 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
312 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
313
314 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
315 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
316 checksum += ucode_u32_data[i];
317 }
318 nvgpu_log_fn(g, "done");
319}
320
321static void gr_gk20a_load_falcon_imem(struct gk20a *g)
322{
323 u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
324 const u32 *ucode_u32_data;
325 u32 tag, i, pad_start, pad_end;
326 u32 checksum;
327
328 nvgpu_log_fn(g, " ");
329
330 cfg = gk20a_readl(g, gr_fecs_cfg_r());
331 fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
332
333 cfg = gk20a_readl(g, gr_gpc0_cfg_r());
334 gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
335
336 /* Use the broadcast address to access all of the GPCCS units. */
337 gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
338 gr_gpccs_imemc_blk_f(0) |
339 gr_gpccs_imemc_aincw_f(1)));
340
341 /* Setup the tags for the instruction memory. */
342 tag = 0;
343 gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
344
345 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
346 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
347
348 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
349 if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
350 tag++;
351 gk20a_writel(g, gr_gpccs_imemt_r(0),
352 gr_gpccs_imemt_tag_f(tag));
353 }
354 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
355 checksum += ucode_u32_data[i];
356 }
357
358 pad_start = i * 4U;
359 pad_end = pad_start + (256U - pad_start % 256U) + 256U;
360 for (i = pad_start;
361 (i < gpccs_imem_size * 256U) && (i < pad_end);
362 i += 4U) {
363 if ((i != 0U) && ((i % 256U) == 0U)) {
364 tag++;
365 gk20a_writel(g, gr_gpccs_imemt_r(0),
366 gr_gpccs_imemt_tag_f(tag));
367 }
368 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
369 }
370
371 gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
372 gr_fecs_imemc_blk_f(0) |
373 gr_fecs_imemc_aincw_f(1)));
374
375 /* Setup the tags for the instruction memory. */
376 tag = 0;
377 gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
378
379 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
380 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
381
382 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
383 if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
384 tag++;
385 gk20a_writel(g, gr_fecs_imemt_r(0),
386 gr_fecs_imemt_tag_f(tag));
387 }
388 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
389 checksum += ucode_u32_data[i];
390 }
391
392 pad_start = i * 4U;
393 pad_end = pad_start + (256U - pad_start % 256U) + 256U;
394 for (i = pad_start;
395 (i < fecs_imem_size * 256U) && i < pad_end;
396 i += 4U) {
397 if ((i != 0U) && ((i % 256U) == 0U)) {
398 tag++;
399 gk20a_writel(g, gr_fecs_imemt_r(0),
400 gr_fecs_imemt_tag_f(tag));
401 }
402 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
403 }
404}
405
406int gr_gk20a_wait_idle(struct gk20a *g, unsigned long duration_ms,
407 u32 expect_delay)
408{
409 u32 delay = expect_delay;
410 bool ctxsw_active;
411 bool gr_busy;
412 u32 gr_engine_id;
413 u32 engine_status;
414 bool ctx_status_invalid;
415 struct nvgpu_timeout timeout;
416
417 nvgpu_log_fn(g, " ");
418
419 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
420
421 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
422
423 do {
424 /* fmodel: host gets fifo_engine_status(gr) from gr
425 only when gr_status is read */
426 (void) gk20a_readl(g, gr_status_r());
427
428 engine_status = gk20a_readl(g,
429 fifo_engine_status_r(gr_engine_id));
430
431 ctxsw_active = engine_status &
432 fifo_engine_status_ctxsw_in_progress_f();
433
434 ctx_status_invalid =
435 (fifo_engine_status_ctx_status_v(engine_status) ==
436 fifo_engine_status_ctx_status_invalid_v());
437
438 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
439 gr_engine_status_value_busy_f();
440
441 if (ctx_status_invalid || (!gr_busy && !ctxsw_active)) {
442 nvgpu_log_fn(g, "done");
443 return 0;
444 }
445
446 nvgpu_usleep_range(delay, delay * 2);
447 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
448
449 } while (nvgpu_timeout_expired(&timeout) == 0);
450
451 nvgpu_err(g,
452 "timeout, ctxsw busy : %d, gr busy : %d",
453 ctxsw_active, gr_busy);
454
455 return -EAGAIN;
456}
457
458int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long duration_ms,
459 u32 expect_delay)
460{
461 u32 val;
462 u32 delay = expect_delay;
463 struct nvgpu_timeout timeout;
464
465 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
466 return 0;
467 }
468
469 nvgpu_log_fn(g, " ");
470
471 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
472
473 do {
474 val = gk20a_readl(g, gr_status_r());
475
476 if (gr_status_fe_method_lower_v(val) == 0U) {
477 nvgpu_log_fn(g, "done");
478 return 0;
479 }
480
481 nvgpu_usleep_range(delay, delay * 2);
482 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
483 } while (nvgpu_timeout_expired(&timeout) == 0);
484
485 nvgpu_err(g,
486 "timeout, fe busy : %x", val);
487
488 return -EAGAIN;
489}
490
491int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
492 u32 *mailbox_ret, u32 opc_success,
493 u32 mailbox_ok, u32 opc_fail,
494 u32 mailbox_fail, bool sleepduringwait)
495{
496 struct nvgpu_timeout timeout;
497 u32 delay = GR_FECS_POLL_INTERVAL;
498 u32 check = WAIT_UCODE_LOOP;
499 u32 reg;
500
501 nvgpu_log_fn(g, " ");
502
503 if (sleepduringwait) {
504 delay = GR_IDLE_CHECK_DEFAULT;
505 }
506
507 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
508 NVGPU_TIMER_CPU_TIMER);
509
510 while (check == WAIT_UCODE_LOOP) {
511 if (nvgpu_timeout_expired(&timeout)) {
512 check = WAIT_UCODE_TIMEOUT;
513 }
514
515 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
516
517 if (mailbox_ret) {
518 *mailbox_ret = reg;
519 }
520
521 switch (opc_success) {
522 case GR_IS_UCODE_OP_EQUAL:
523 if (reg == mailbox_ok) {
524 check = WAIT_UCODE_OK;
525 }
526 break;
527 case GR_IS_UCODE_OP_NOT_EQUAL:
528 if (reg != mailbox_ok) {
529 check = WAIT_UCODE_OK;
530 }
531 break;
532 case GR_IS_UCODE_OP_AND:
533 if (reg & mailbox_ok) {
534 check = WAIT_UCODE_OK;
535 }
536 break;
537 case GR_IS_UCODE_OP_LESSER:
538 if (reg < mailbox_ok) {
539 check = WAIT_UCODE_OK;
540 }
541 break;
542 case GR_IS_UCODE_OP_LESSER_EQUAL:
543 if (reg <= mailbox_ok) {
544 check = WAIT_UCODE_OK;
545 }
546 break;
547 case GR_IS_UCODE_OP_SKIP:
548 /* do no success check */
549 break;
550 default:
551 nvgpu_err(g,
552 "invalid success opcode 0x%x", opc_success);
553
554 check = WAIT_UCODE_ERROR;
555 break;
556 }
557
558 switch (opc_fail) {
559 case GR_IS_UCODE_OP_EQUAL:
560 if (reg == mailbox_fail) {
561 check = WAIT_UCODE_ERROR;
562 }
563 break;
564 case GR_IS_UCODE_OP_NOT_EQUAL:
565 if (reg != mailbox_fail) {
566 check = WAIT_UCODE_ERROR;
567 }
568 break;
569 case GR_IS_UCODE_OP_AND:
570 if (reg & mailbox_fail) {
571 check = WAIT_UCODE_ERROR;
572 }
573 break;
574 case GR_IS_UCODE_OP_LESSER:
575 if (reg < mailbox_fail) {
576 check = WAIT_UCODE_ERROR;
577 }
578 break;
579 case GR_IS_UCODE_OP_LESSER_EQUAL:
580 if (reg <= mailbox_fail) {
581 check = WAIT_UCODE_ERROR;
582 }
583 break;
584 case GR_IS_UCODE_OP_SKIP:
585 /* do no check on fail*/
586 break;
587 default:
588 nvgpu_err(g,
589 "invalid fail opcode 0x%x", opc_fail);
590 check = WAIT_UCODE_ERROR;
591 break;
592 }
593
594 if (sleepduringwait) {
595 nvgpu_usleep_range(delay, delay * 2);
596 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
597 } else {
598 nvgpu_udelay(delay);
599 }
600 }
601
602 if (check == WAIT_UCODE_TIMEOUT) {
603 nvgpu_err(g,
604 "timeout waiting on mailbox=%d value=0x%08x",
605 mailbox_id, reg);
606 gk20a_fecs_dump_falcon_stats(g);
607 gk20a_gpccs_dump_falcon_stats(g);
608 gk20a_gr_debug_dump(g);
609 return -1;
610 } else if (check == WAIT_UCODE_ERROR) {
611 nvgpu_err(g,
612 "ucode method failed on mailbox=%d value=0x%08x",
613 mailbox_id, reg);
614 gk20a_fecs_dump_falcon_stats(g);
615 gk20a_gpccs_dump_falcon_stats(g);
616 return -1;
617 }
618
619 nvgpu_log_fn(g, "done");
620 return 0;
621}
622
623int gr_gk20a_submit_fecs_method_op_locked(struct gk20a *g,
624 struct fecs_method_op_gk20a op,
625 bool sleepduringwait)
626{
627 int ret;
628
629 if (op.mailbox.id != 0) {
630 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
631 op.mailbox.data);
632 }
633
634 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
635 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
636
637 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
638 gk20a_writel(g, gr_fecs_method_push_r(),
639 gr_fecs_method_push_adr_f(op.method.addr));
640
641	/* Cases with op.mailbox.id == 4 still report completion on mailbox 0,
642	 * so redirect the wait to op.mailbox.id == 0. */
643 if (op.mailbox.id == 4) {
644 op.mailbox.id = 0;
645 }
646
647 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
648 op.cond.ok, op.mailbox.ok,
649 op.cond.fail, op.mailbox.fail,
650 sleepduringwait);
651 if (ret) {
652 nvgpu_err(g,"fecs method: data=0x%08x push adr=0x%08x",
653 op.method.data, op.method.addr);
654 }
655
656 return ret;
657}
658
659/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
660 * We should replace most, if not all, fecs method calls to this instead. */
661int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
662 struct fecs_method_op_gk20a op,
663 bool sleepduringwait)
664{
665 struct gr_gk20a *gr = &g->gr;
666 int ret;
667
668 nvgpu_mutex_acquire(&gr->fecs_mutex);
669
670 ret = gr_gk20a_submit_fecs_method_op_locked(g, op, sleepduringwait);
671
672 nvgpu_mutex_release(&gr->fecs_mutex);
673
674 return ret;
675}
676
677/* Sideband mailbox writes are done a bit differently */
678int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g,
679 struct fecs_method_op_gk20a op)
680{
681 struct gr_gk20a *gr = &g->gr;
682 int ret;
683
684 nvgpu_mutex_acquire(&gr->fecs_mutex);
685
686 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(op.mailbox.id),
687 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
688
689 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
690 gk20a_writel(g, gr_fecs_method_push_r(),
691 gr_fecs_method_push_adr_f(op.method.addr));
692
693 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
694 op.cond.ok, op.mailbox.ok,
695 op.cond.fail, op.mailbox.fail,
696 false);
697 if (ret) {
698 nvgpu_err(g,"fecs method: data=0x%08x push adr=0x%08x",
699 op.method.data, op.method.addr);
700 }
701
702 nvgpu_mutex_release(&gr->fecs_mutex);
703
704 return ret;
705}
706
707static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
708{
709 return gr_gk20a_submit_fecs_method_op(g,
710 (struct fecs_method_op_gk20a) {
711 .method.addr = fecs_method,
712 .method.data = ~0,
713 .mailbox = { .id = 1, /*sideband?*/
714 .data = ~0, .clr = ~0, .ret = ret,
715 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
716 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
717 .cond.ok = GR_IS_UCODE_OP_EQUAL,
718 .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
719}
720
721/**
722 * Stop processing (stall) context switches at FECS:-
723 * If fecs is sent stop_ctxsw method, elpg entry/exit cannot happen
724 * and may timeout. It could manifest as different error signatures
725 * depending on when stop_ctxsw fecs method gets sent with respect
726 * to pmu elpg sequence. It could come as pmu halt or abort or
727 * maybe ext error too.
728*/
729int gr_gk20a_disable_ctxsw(struct gk20a *g)
730{
731 int err = 0;
732
733 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
734
735 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
736 g->ctxsw_disable_count++;
737 if (g->ctxsw_disable_count == 1) {
738 err = nvgpu_pg_elpg_disable(g);
739 if (err != 0) {
740 nvgpu_err(g, "failed to disable elpg. not safe to "
741 "stop_ctxsw");
742 /* stop ctxsw command is not sent */
743 g->ctxsw_disable_count--;
744 } else {
745 err = gr_gk20a_ctrl_ctxsw(g,
746 gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
747 if (err != 0) {
748 nvgpu_err(g, "failed to stop fecs ctxsw");
749 /* stop ctxsw failed */
750 g->ctxsw_disable_count--;
751 }
752 }
753 } else {
754 nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
755 g->ctxsw_disable_count);
756 }
757 nvgpu_mutex_release(&g->ctxsw_disable_lock);
758
759 return err;
760}
761
762/* Start processing (continue) context switches at FECS */
763int gr_gk20a_enable_ctxsw(struct gk20a *g)
764{
765 int err = 0;
766
767 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
768
769 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
770
771 if (g->ctxsw_disable_count == 0) {
772 goto ctxsw_already_enabled;
773 }
774 g->ctxsw_disable_count--;
775 WARN_ON(g->ctxsw_disable_count < 0);
776 if (g->ctxsw_disable_count == 0) {
777 err = gr_gk20a_ctrl_ctxsw(g,
778 gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
779 if (err != 0) {
780 nvgpu_err(g, "failed to start fecs ctxsw");
781 } else {
782 if (nvgpu_pg_elpg_enable(g) != 0) {
783 nvgpu_err(g, "failed to enable elpg "
784 "after start_ctxsw");
785 }
786 }
787 } else {
788 nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
789 g->ctxsw_disable_count);
790 }
791ctxsw_already_enabled:
792 nvgpu_mutex_release(&g->ctxsw_disable_lock);
793
794 return err;
795}
796
797int gr_gk20a_halt_pipe(struct gk20a *g)
798{
799 return gr_gk20a_submit_fecs_method_op(g,
800 (struct fecs_method_op_gk20a) {
801 .method.addr =
802 gr_fecs_method_push_adr_halt_pipeline_v(),
803 .method.data = ~0,
804 .mailbox = { .id = 1, /*sideband?*/
805 .data = ~0, .clr = ~0, .ret = NULL,
806 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
807 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
808 .cond.ok = GR_IS_UCODE_OP_EQUAL,
809 .cond.fail = GR_IS_UCODE_OP_EQUAL }, false);
810}
811
812
813int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
814{
815 u32 addr_lo;
816 u32 addr_hi;
817
818 nvgpu_log_fn(c->g, " ");
819
820 addr_lo = u64_lo32(gpu_va) >> 12;
821 addr_hi = u64_hi32(gpu_va);
822
823 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(),
824 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
825 ram_in_gr_wfi_ptr_lo_f(addr_lo));
826
827 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(),
828 ram_in_gr_wfi_ptr_hi_f(addr_hi));
829
830 return 0;
831}
832
833/*
834 * Context state can be written directly, or "patched" at times. So that code
835 * can be used in either situation it is written using a series of
836 * _ctx_patch_write(..., patch) statements. However any necessary map overhead
837 * should be minimized; thus, bundle the sequence of these writes together, and
838 * set them up and close with _ctx_patch_write_begin/_ctx_patch_write_end.
839 */
840
841int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
842 struct nvgpu_gr_ctx *gr_ctx,
843 bool update_patch_count)
844{
845 if (update_patch_count) {
846 /* reset patch count if ucode has already processed it */
847 gr_ctx->patch_ctx.data_count = nvgpu_mem_rd(g,
848 &gr_ctx->mem,
849 ctxsw_prog_main_image_patch_count_o());
850 nvgpu_log(g, gpu_dbg_info, "patch count reset to %d",
851 gr_ctx->patch_ctx.data_count);
852 }
853 return 0;
854}
855
856void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
857 struct nvgpu_gr_ctx *gr_ctx,
858 bool update_patch_count)
859{
860 /* Write context count to context image if it is mapped */
861 if (update_patch_count) {
862 nvgpu_mem_wr(g, &gr_ctx->mem,
863 ctxsw_prog_main_image_patch_count_o(),
864 gr_ctx->patch_ctx.data_count);
865 nvgpu_log(g, gpu_dbg_info, "write patch count %d",
866 gr_ctx->patch_ctx.data_count);
867 }
868}
869
870void gr_gk20a_ctx_patch_write(struct gk20a *g,
871 struct nvgpu_gr_ctx *gr_ctx,
872 u32 addr, u32 data, bool patch)
873{
874 if (patch) {
875 u32 patch_slot = gr_ctx->patch_ctx.data_count *
876 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
877 if (patch_slot > (PATCH_CTX_ENTRIES_FROM_SIZE(
878 gr_ctx->patch_ctx.mem.size) -
879 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY)) {
880 nvgpu_err(g, "failed to access patch_slot %d",
881 patch_slot);
882 return;
883 }
884 nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot, addr);
885 nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot + 1, data);
886 gr_ctx->patch_ctx.data_count++;
887 nvgpu_log(g, gpu_dbg_info,
888 "patch addr = 0x%x data = 0x%x data_count %d",
889 addr, data, gr_ctx->patch_ctx.data_count);
890 } else {
891 gk20a_writel(g, addr, data);
892 }
893}
894
895static u32 fecs_current_ctx_data(struct gk20a *g, struct nvgpu_mem *inst_block)
896{
897 u64 ptr = nvgpu_inst_block_addr(g, inst_block) >>
898 ram_in_base_shift_v();
899 u32 aperture = nvgpu_aperture_mask(g, inst_block,
900 gr_fecs_current_ctx_target_sys_mem_ncoh_f(),
901 gr_fecs_current_ctx_target_sys_mem_coh_f(),
902 gr_fecs_current_ctx_target_vid_mem_f());
903
904 return gr_fecs_current_ctx_ptr_f(u64_lo32(ptr)) | aperture |
905 gr_fecs_current_ctx_valid_f(1);
906}
907
908int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
909 struct channel_gk20a *c)
910{
911 u32 inst_base_ptr = u64_lo32(nvgpu_inst_block_addr(g, &c->inst_block)
912 >> ram_in_base_shift_v());
913 u32 data = fecs_current_ctx_data(g, &c->inst_block);
914 u32 ret;
915
916 nvgpu_log_info(g, "bind channel %d inst ptr 0x%08x",
917 c->chid, inst_base_ptr);
918
919 ret = gr_gk20a_submit_fecs_method_op(g,
920 (struct fecs_method_op_gk20a) {
921 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
922 .method.data = data,
923 .mailbox = { .id = 0, .data = 0,
924 .clr = 0x30,
925 .ret = NULL,
926 .ok = 0x10,
927 .fail = 0x20, },
928 .cond.ok = GR_IS_UCODE_OP_AND,
929 .cond.fail = GR_IS_UCODE_OP_AND}, true);
930 if (ret) {
931 nvgpu_err(g,
932 "bind channel instance failed");
933 }
934
935 return ret;
936}
937
938void gr_gk20a_write_zcull_ptr(struct gk20a *g,
939 struct nvgpu_mem *mem, u64 gpu_va)
940{
941 u32 va = u64_lo32(gpu_va >> 8);
942
943 nvgpu_mem_wr(g, mem,
944 ctxsw_prog_main_image_zcull_ptr_o(), va);
945}
946
947void gr_gk20a_write_pm_ptr(struct gk20a *g,
948 struct nvgpu_mem *mem, u64 gpu_va)
949{
950 u32 va = u64_lo32(gpu_va >> 8);
951
952 nvgpu_mem_wr(g, mem,
953 ctxsw_prog_main_image_pm_ptr_o(), va);
954}
955
956static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c)
957{
958 struct tsg_gk20a *tsg;
959 struct nvgpu_gr_ctx *gr_ctx = NULL;
960 struct nvgpu_mem *mem = NULL;
961 struct nvgpu_mem *ctxheader = &c->ctx_header;
962 int ret = 0;
963
964 nvgpu_log_fn(g, " ");
965
966 tsg = tsg_gk20a_from_ch(c);
967 if (tsg == NULL) {
968 return -EINVAL;
969 }
970
971 gr_ctx = &tsg->gr_ctx;
972 mem = &gr_ctx->mem;
973
974 if (gr_ctx->zcull_ctx.gpu_va == 0 &&
975 gr_ctx->zcull_ctx.ctx_sw_mode ==
976 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
977 return -EINVAL;
978 }
979
980 ret = gk20a_disable_channel_tsg(g, c);
981 if (ret) {
982 nvgpu_err(g, "failed to disable channel/TSG");
983 return ret;
984 }
985 ret = gk20a_fifo_preempt(g, c);
986 if (ret) {
987 gk20a_enable_channel_tsg(g, c);
988 nvgpu_err(g, "failed to preempt channel/TSG");
989 return ret;
990 }
991
992 nvgpu_mem_wr(g, mem,
993 ctxsw_prog_main_image_zcull_o(),
994 gr_ctx->zcull_ctx.ctx_sw_mode);
995
996 if (ctxheader->gpu_va) {
997 g->ops.gr.write_zcull_ptr(g, ctxheader,
998 gr_ctx->zcull_ctx.gpu_va);
999 } else {
1000 g->ops.gr.write_zcull_ptr(g, mem, gr_ctx->zcull_ctx.gpu_va);
1001 }
1002
1003 gk20a_enable_channel_tsg(g, c);
1004
1005 return ret;
1006}
1007
1008u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc)
1009{
1010 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
1011 u32 gpc_offset = gpc_stride * gpc;
1012
1013 return gpc_offset;
1014}
1015
1016u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc)
1017{
1018 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
1019 GPU_LIT_TPC_IN_GPC_STRIDE);
1020 u32 tpc_offset = tpc_in_gpc_stride * tpc;
1021
1022 return tpc_offset;
1023}
1024
1025int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
1026 struct channel_gk20a *c, bool patch)
1027{
1028 struct gr_gk20a *gr = &g->gr;
1029 struct tsg_gk20a *tsg;
1030 struct nvgpu_gr_ctx *gr_ctx = NULL;
1031 u64 addr;
1032 u32 size;
1033
1034 nvgpu_log_fn(g, " ");
1035
1036 tsg = tsg_gk20a_from_ch(c);
1037 if (tsg == NULL) {
1038 return -EINVAL;
1039 }
1040
1041 gr_ctx = &tsg->gr_ctx;
1042 if (patch) {
1043 int err;
1044 err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);
1045 if (err != 0) {
1046 return err;
1047 }
1048 }
1049
1050 /* global pagepool buffer */
1051 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
1052 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
1053 (u64_hi32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
1054 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
1055
1056 size = gr->global_ctx_buffer[PAGEPOOL].mem.size /
1057 gr_scc_pagepool_total_pages_byte_granularity_v();
1058
1059 if (size == g->ops.gr.pagepool_default_size(g)) {
1060 size = gr_scc_pagepool_total_pages_hwmax_v();
1061 }
1062
1063 nvgpu_log_info(g, "pagepool buffer addr : 0x%016llx, size : %d",
1064 addr, size);
1065
1066 g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, patch);
1067
1068 /* global bundle cb */
1069 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
1070 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
1071 (u64_hi32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
1072 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
1073
1074 size = gr->bundle_cb_default_size;
1075
1076 nvgpu_log_info(g, "bundle cb addr : 0x%016llx, size : %d",
1077 addr, size);
1078
1079 g->ops.gr.commit_global_bundle_cb(g, gr_ctx, addr, size, patch);
1080
1081 /* global attrib cb */
1082 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
1083 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
1084 (u64_hi32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
1085 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
1086
1087 nvgpu_log_info(g, "attrib cb addr : 0x%016llx", addr);
1088 g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, patch);
1089 g->ops.gr.commit_global_cb_manager(g, c, patch);
1090
1091 if (patch) {
1092 gr_gk20a_ctx_patch_write_end(g, gr_ctx, false);
1093 }
1094
1095 return 0;
1096}
1097
1098int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c)
1099{
1100 struct gr_gk20a *gr = &g->gr;
1101 struct nvgpu_gr_ctx *gr_ctx = NULL;
1102 u32 gpm_pd_cfg;
1103 u32 pd_ab_dist_cfg0;
1104 u32 ds_debug;
1105 u32 mpc_vtg_debug;
1106 u32 pe_vaf;
1107 u32 pe_vsc_vpc;
1108
1109 nvgpu_log_fn(g, " ");
1110
1111 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1112 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1113 ds_debug = gk20a_readl(g, gr_ds_debug_r());
1114 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1115
1116 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1117 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1118 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1119
1120 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1121 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1122 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1123 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1124 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1125 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1126
1127 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
1128 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false);
1129 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false);
1130 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
1131 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false);
1132 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
1133 } else {
1134 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1135 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1136 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1137 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1138
1139 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
1140 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
1141 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false);
1142 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
1143 }
1144
1145 return 0;
1146}
1147
1148/*
1149 * Return map tiles count for given index
1150 * Return 0 if index is out-of-bounds
1151 */
1152static u32 gr_gk20a_get_map_tile_count(struct gr_gk20a *gr, u32 index)
1153{
1154 if (index >= gr->map_tile_count) {
1155 return 0;
1156 }
1157
1158 return gr->map_tiles[index];
1159}
1160
1161int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1162{
1163 u32 norm_entries, norm_shift;
1164 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1165 u32 map0, map1, map2, map3, map4, map5;
1166
1167 if (gr->map_tiles == NULL) {
1168 return -1;
1169 }
1170
1171 nvgpu_log_fn(g, " ");
1172
1173 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1174 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1175 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1176
1177 map0 = gr_crstr_gpc_map0_tile0_f(gr_gk20a_get_map_tile_count(gr, 0)) |
1178 gr_crstr_gpc_map0_tile1_f(gr_gk20a_get_map_tile_count(gr, 1)) |
1179 gr_crstr_gpc_map0_tile2_f(gr_gk20a_get_map_tile_count(gr, 2)) |
1180 gr_crstr_gpc_map0_tile3_f(gr_gk20a_get_map_tile_count(gr, 3)) |
1181 gr_crstr_gpc_map0_tile4_f(gr_gk20a_get_map_tile_count(gr, 4)) |
1182 gr_crstr_gpc_map0_tile5_f(gr_gk20a_get_map_tile_count(gr, 5));
1183
1184 map1 = gr_crstr_gpc_map1_tile6_f(gr_gk20a_get_map_tile_count(gr, 6)) |
1185 gr_crstr_gpc_map1_tile7_f(gr_gk20a_get_map_tile_count(gr, 7)) |
1186 gr_crstr_gpc_map1_tile8_f(gr_gk20a_get_map_tile_count(gr, 8)) |
1187 gr_crstr_gpc_map1_tile9_f(gr_gk20a_get_map_tile_count(gr, 9)) |
1188 gr_crstr_gpc_map1_tile10_f(gr_gk20a_get_map_tile_count(gr, 10)) |
1189 gr_crstr_gpc_map1_tile11_f(gr_gk20a_get_map_tile_count(gr, 11));
1190
1191 map2 = gr_crstr_gpc_map2_tile12_f(gr_gk20a_get_map_tile_count(gr, 12)) |
1192 gr_crstr_gpc_map2_tile13_f(gr_gk20a_get_map_tile_count(gr, 13)) |
1193 gr_crstr_gpc_map2_tile14_f(gr_gk20a_get_map_tile_count(gr, 14)) |
1194 gr_crstr_gpc_map2_tile15_f(gr_gk20a_get_map_tile_count(gr, 15)) |
1195 gr_crstr_gpc_map2_tile16_f(gr_gk20a_get_map_tile_count(gr, 16)) |
1196 gr_crstr_gpc_map2_tile17_f(gr_gk20a_get_map_tile_count(gr, 17));
1197
1198 map3 = gr_crstr_gpc_map3_tile18_f(gr_gk20a_get_map_tile_count(gr, 18)) |
1199 gr_crstr_gpc_map3_tile19_f(gr_gk20a_get_map_tile_count(gr, 19)) |
1200 gr_crstr_gpc_map3_tile20_f(gr_gk20a_get_map_tile_count(gr, 20)) |
1201 gr_crstr_gpc_map3_tile21_f(gr_gk20a_get_map_tile_count(gr, 21)) |
1202 gr_crstr_gpc_map3_tile22_f(gr_gk20a_get_map_tile_count(gr, 22)) |
1203 gr_crstr_gpc_map3_tile23_f(gr_gk20a_get_map_tile_count(gr, 23));
1204
1205 map4 = gr_crstr_gpc_map4_tile24_f(gr_gk20a_get_map_tile_count(gr, 24)) |
1206 gr_crstr_gpc_map4_tile25_f(gr_gk20a_get_map_tile_count(gr, 25)) |
1207 gr_crstr_gpc_map4_tile26_f(gr_gk20a_get_map_tile_count(gr, 26)) |
1208 gr_crstr_gpc_map4_tile27_f(gr_gk20a_get_map_tile_count(gr, 27)) |
1209 gr_crstr_gpc_map4_tile28_f(gr_gk20a_get_map_tile_count(gr, 28)) |
1210 gr_crstr_gpc_map4_tile29_f(gr_gk20a_get_map_tile_count(gr, 29));
1211
1212 map5 = gr_crstr_gpc_map5_tile30_f(gr_gk20a_get_map_tile_count(gr, 30)) |
1213 gr_crstr_gpc_map5_tile31_f(gr_gk20a_get_map_tile_count(gr, 31)) |
1214 gr_crstr_gpc_map5_tile32_f(0) |
1215 gr_crstr_gpc_map5_tile33_f(0) |
1216 gr_crstr_gpc_map5_tile34_f(0) |
1217 gr_crstr_gpc_map5_tile35_f(0);
1218
1219 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1220 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1221 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1222 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1223 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1224 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1225
1226 switch (gr->tpc_count) {
1227 case 1:
1228 norm_shift = 4;
1229 break;
1230 case 2:
1231 case 3:
1232 norm_shift = 3;
1233 break;
1234 case 4:
1235 case 5:
1236 case 6:
1237 case 7:
1238 norm_shift = 2;
1239 break;
1240 case 8:
1241 case 9:
1242 case 10:
1243 case 11:
1244 case 12:
1245 case 13:
1246 case 14:
1247 case 15:
1248 norm_shift = 1;
1249 break;
1250 default:
1251 norm_shift = 0;
1252 break;
1253 }
1254
1255 norm_entries = gr->tpc_count << norm_shift;
1256 coeff5_mod = (1 << 5) % norm_entries;
1257 coeff6_mod = (1 << 6) % norm_entries;
1258 coeff7_mod = (1 << 7) % norm_entries;
1259 coeff8_mod = (1 << 8) % norm_entries;
1260 coeff9_mod = (1 << 9) % norm_entries;
1261 coeff10_mod = (1 << 10) % norm_entries;
1262 coeff11_mod = (1 << 11) % norm_entries;
1263
1264 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1265 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1266 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1267 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1268 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1269 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1270
1271 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1272 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1273 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1274 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1275 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1276 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1277 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1278
1279 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1280 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1281 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1282 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1283 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1284 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1285
1286 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1287 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1288 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1289
1290 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1291 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1292 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1293 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1294 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1295 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1296
1297 return 0;
1298}
1299
1300static inline u32 count_bits(u32 mask)
1301{
1302 u32 temp = mask;
1303 u32 count;
1304 for (count = 0; temp != 0; count++) {
1305 temp &= temp - 1;
1306 }
1307
1308 return count;
1309}
1310
1311int gr_gk20a_init_sm_id_table(struct gk20a *g)
1312{
1313 u32 gpc, tpc;
1314 u32 sm_id = 0;
1315
1316 for (tpc = 0; tpc < g->gr.max_tpc_per_gpc_count; tpc++) {
1317 for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
1318
1319 if (tpc < g->gr.gpc_tpc_count[gpc]) {
1320 g->gr.sm_to_cluster[sm_id].tpc_index = tpc;
1321 g->gr.sm_to_cluster[sm_id].gpc_index = gpc;
1322 g->gr.sm_to_cluster[sm_id].sm_index = 0;
1323 g->gr.sm_to_cluster[sm_id].global_tpc_index =
1324 sm_id;
1325 sm_id++;
1326 }
1327 }
1328 }
1329 g->gr.no_of_sm = sm_id;
1330 return 0;
1331}
1332
1333/*
1334 * Return number of TPCs in a GPC
1335 * Return 0 if GPC index is invalid i.e. GPC is disabled
1336 */
1337u32 gr_gk20a_get_tpc_count(struct gr_gk20a *gr, u32 gpc_index)
1338{
1339 if (gpc_index >= gr->gpc_count) {
1340 return 0;
1341 }
1342
1343 return gr->gpc_tpc_count[gpc_index];
1344}
1345
1346int gr_gk20a_init_fs_state(struct gk20a *g)
1347{
1348 struct gr_gk20a *gr = &g->gr;
1349 u32 tpc_index, gpc_index;
1350 u32 sm_id = 0, gpc_id = 0;
1351 u32 tpc_per_gpc;
1352 u32 fuse_tpc_mask;
1353 u32 reg_index;
1354 int err;
1355
1356 nvgpu_log_fn(g, " ");
1357
1358 if (g->ops.gr.init_sm_id_table) {
1359 err = g->ops.gr.init_sm_id_table(g);
1360 if (err != 0) {
1361 return err;
1362 }
1363
1364 /* Is table empty ? */
1365 if (g->gr.no_of_sm == 0) {
1366 return -EINVAL;
1367 }
1368 }
1369
1370 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
1371 tpc_index = g->gr.sm_to_cluster[sm_id].tpc_index;
1372 gpc_index = g->gr.sm_to_cluster[sm_id].gpc_index;
1373
1374 g->ops.gr.program_sm_id_numbering(g, gpc_index, tpc_index, sm_id);
1375
1376 if (g->ops.gr.program_active_tpc_counts) {
1377 g->ops.gr.program_active_tpc_counts(g, gpc_index);
1378 }
1379 }
1380
1381 for (reg_index = 0, gpc_id = 0;
1382 reg_index < gr_pd_num_tpc_per_gpc__size_1_v();
1383 reg_index++, gpc_id += 8) {
1384
1385 tpc_per_gpc =
1386 gr_pd_num_tpc_per_gpc_count0_f(gr_gk20a_get_tpc_count(gr, gpc_id + 0)) |
1387 gr_pd_num_tpc_per_gpc_count1_f(gr_gk20a_get_tpc_count(gr, gpc_id + 1)) |
1388 gr_pd_num_tpc_per_gpc_count2_f(gr_gk20a_get_tpc_count(gr, gpc_id + 2)) |
1389 gr_pd_num_tpc_per_gpc_count3_f(gr_gk20a_get_tpc_count(gr, gpc_id + 3)) |
1390 gr_pd_num_tpc_per_gpc_count4_f(gr_gk20a_get_tpc_count(gr, gpc_id + 4)) |
1391 gr_pd_num_tpc_per_gpc_count5_f(gr_gk20a_get_tpc_count(gr, gpc_id + 5)) |
1392 gr_pd_num_tpc_per_gpc_count6_f(gr_gk20a_get_tpc_count(gr, gpc_id + 6)) |
1393 gr_pd_num_tpc_per_gpc_count7_f(gr_gk20a_get_tpc_count(gr, gpc_id + 7));
1394
1395 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1396 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1397 }
1398
1399 /* gr__setup_pd_mapping stubbed for gk20a */
1400 g->ops.gr.setup_rop_mapping(g, gr);
1401 if (g->ops.gr.setup_alpha_beta_tables) {
1402 g->ops.gr.setup_alpha_beta_tables(g, gr);
1403 }
1404
1405 for (gpc_index = 0;
1406 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1407 gpc_index += 4) {
1408
1409 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1410 (gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) != 0U) ||
1411 (gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) != 0U) ||
1412 (gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) != 0U) ||
1413 (gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]) != 0U));
1414 }
1415
1416 fuse_tpc_mask = g->ops.gr.get_gpc_tpc_mask(g, 0);
1417 if ((g->tpc_fs_mask_user != 0U) &&
1418 (fuse_tpc_mask == BIT32(gr->max_tpc_count) - 1U)) {
1419 u32 val = g->tpc_fs_mask_user;
1420 val &= (0x1U << gr->max_tpc_count) - 1U;
1421 gk20a_writel(g, gr_cwd_fs_r(),
1422 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1423 gr_cwd_fs_num_tpcs_f(hweight32(val)));
1424 } else {
1425 gk20a_writel(g, gr_cwd_fs_r(),
1426 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1427 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1428 }
1429
1430 gk20a_writel(g, gr_bes_zrop_settings_r(),
1431 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1432 gk20a_writel(g, gr_bes_crop_settings_r(),
1433 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1434
1435 return 0;
1436}
1437
1438int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1439{
1440 struct gk20a *g = c->g;
1441 int ret;
1442
1443 nvgpu_log_fn(g, " ");
1444
1445 ret = gr_gk20a_submit_fecs_method_op(g,
1446 (struct fecs_method_op_gk20a) {
1447 .method.addr = save_type,
1448 .method.data = fecs_current_ctx_data(g, &c->inst_block),
1449 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1450 .ok = 1, .fail = 2,
1451 },
1452 .cond.ok = GR_IS_UCODE_OP_AND,
1453 .cond.fail = GR_IS_UCODE_OP_AND,
1454 }, true);
1455
1456 if (ret) {
1457 nvgpu_err(g, "save context image failed");
1458 }
1459
1460 return ret;
1461}
1462
1463u32 gk20a_init_sw_bundle(struct gk20a *g)
1464{
1465 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1466 u32 last_bundle_data = 0;
1467 u32 err = 0;
1468 unsigned int i;
1469
1470 /* disable fe_go_idle */
1471 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1472 gr_fe_go_idle_timeout_count_disabled_f());
1473 /* enable pipe mode override */
1474 gk20a_writel(g, gr_pipe_bundle_config_r(),
1475 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1476
1477 /* load bundle init */
1478 for (i = 0; i < sw_bundle_init->count; i++) {
1479 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1480 gk20a_writel(g, gr_pipe_bundle_data_r(),
1481 sw_bundle_init->l[i].value);
1482 last_bundle_data = sw_bundle_init->l[i].value;
1483 }
1484
1485 gk20a_writel(g, gr_pipe_bundle_address_r(),
1486 sw_bundle_init->l[i].addr);
1487
1488 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1489 GR_GO_IDLE_BUNDLE) {
1490 err = gr_gk20a_wait_idle(g,
1491 gk20a_get_gr_idle_timeout(g),
1492 GR_IDLE_CHECK_DEFAULT);
1493 if (err != 0U) {
1494 goto error;
1495 }
1496 }
1497
1498 err = gr_gk20a_wait_fe_idle(g, gk20a_get_gr_idle_timeout(g),
1499 GR_IDLE_CHECK_DEFAULT);
1500 if (err != 0U) {
1501 goto error;
1502 }
1503 }
1504
1505 if ((err == 0U) && (g->ops.gr.init_sw_veid_bundle != NULL)) {
1506 err = g->ops.gr.init_sw_veid_bundle(g);
1507 if (err != 0U) {
1508 goto error;
1509 }
1510 }
1511
1512 if (g->ops.gr.init_sw_bundle64) {
1513 err = g->ops.gr.init_sw_bundle64(g);
1514 if (err != 0U) {
1515 goto error;
1516 }
1517 }
1518
1519 /* disable pipe mode override */
1520 gk20a_writel(g, gr_pipe_bundle_config_r(),
1521 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1522
1523 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1524 GR_IDLE_CHECK_DEFAULT);
1525
1526 /* restore fe_go_idle */
1527 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1528 gr_fe_go_idle_timeout_count_prod_f());
1529
1530 return err;
1531
1532error:
1533 /* in case of error skip waiting for GR idle - just restore state */
1534 gk20a_writel(g, gr_pipe_bundle_config_r(),
1535 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1536
1537 /* restore fe_go_idle */
1538 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1539 gr_fe_go_idle_timeout_count_prod_f());
1540
1541 return err;
1542}
1543
1544/* init global golden image from a fresh gr_ctx in channel ctx.
1545 save a copy in local_golden_image in ctx_vars */
1546static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1547 struct channel_gk20a *c)
1548{
1549 struct gr_gk20a *gr = &g->gr;
1550 struct tsg_gk20a *tsg;
1551 struct nvgpu_gr_ctx *gr_ctx = NULL;
1552 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1553 u32 ctx_header_words;
1554 u32 i;
1555 u32 data;
1556 struct nvgpu_mem *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
1557 struct nvgpu_mem *gr_mem;
1558 u32 err = 0;
1559 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
1560 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
1561 u32 last_method_data = 0;
1562
1563 nvgpu_log_fn(g, " ");
1564
1565 tsg = tsg_gk20a_from_ch(c);
1566 if (tsg == NULL) {
1567 return -EINVAL;
1568 }
1569
1570 gr_ctx = &tsg->gr_ctx;
1571 gr_mem = &gr_ctx->mem;
1572
1573 /* golden ctx is global to all channels. Although only the first
1574 channel initializes golden image, driver needs to prevent multiple
1575 channels from initializing golden ctx at the same time */
1576 nvgpu_mutex_acquire(&gr->ctx_mutex);
1577
1578 if (gr->ctx_vars.golden_image_initialized) {
1579 goto clean_up;
1580 }
1581 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1582 struct nvgpu_timeout timeout;
1583
1584 nvgpu_timeout_init(g, &timeout,
1585 FE_PWR_MODE_TIMEOUT_MAX /
1586 FE_PWR_MODE_TIMEOUT_DEFAULT,
1587 NVGPU_TIMER_RETRY_TIMER);
1588 gk20a_writel(g, gr_fe_pwr_mode_r(),
1589 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_force_on_f());
1590 do {
1591 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1592 if (req == gr_fe_pwr_mode_req_done_v()) {
1593 break;
1594 }
1595 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1596 } while (nvgpu_timeout_expired_msg(&timeout,
1597 "timeout forcing FE on") == 0);
1598 }
1599
1600
1601 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1602 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1603 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1604 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1605 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1606 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1607 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1608 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
1609 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
1610 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
1611 (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1612 nvgpu_udelay(10);
1613
1614 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1615 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1616 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1617 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1618 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1619 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1620 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1621 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
1622 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
1623 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
1624 (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1625 nvgpu_udelay(10);
1626
1627 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1628 struct nvgpu_timeout timeout;
1629
1630 nvgpu_timeout_init(g, &timeout,
1631 FE_PWR_MODE_TIMEOUT_MAX /
1632 FE_PWR_MODE_TIMEOUT_DEFAULT,
1633 NVGPU_TIMER_RETRY_TIMER);
1634 gk20a_writel(g, gr_fe_pwr_mode_r(),
1635 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_auto_f());
1636
1637 do {
1638 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1639 if (req == gr_fe_pwr_mode_req_done_v()) {
1640 break;
1641 }
1642 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1643 } while (nvgpu_timeout_expired_msg(&timeout,
1644 "timeout setting FE power to auto") == 0);
1645 }
1646
1647 /* clear scc ram */
1648 gk20a_writel(g, gr_scc_init_r(),
1649 gr_scc_init_ram_trigger_f());
1650
1651 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1652 if (err != 0U) {
1653 goto clean_up;
1654 }
1655
1656 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1657 GR_IDLE_CHECK_DEFAULT);
1658
1659 /* load ctx init */
1660 for (i = 0; i < sw_ctx_load->count; i++) {
1661 gk20a_writel(g, sw_ctx_load->l[i].addr,
1662 sw_ctx_load->l[i].value);
1663 }
1664
1665 if (g->ops.gr.disable_rd_coalesce) {
1666 g->ops.gr.disable_rd_coalesce(g);
1667 }
1668
1669 if (g->ops.gr.init_preemption_state) {
1670 g->ops.gr.init_preemption_state(g);
1671 }
1672
1673 if (g->ops.clock_gating.blcg_gr_load_gating_prod) {
1674 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
1675 }
1676
1677 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1678 GR_IDLE_CHECK_DEFAULT);
1679 if (err != 0U) {
1680 goto clean_up;
1681 }
1682
1683 /* disable fe_go_idle */
1684 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1685 gr_fe_go_idle_timeout_count_disabled_f());
1686
1687 err = g->ops.gr.commit_global_ctx_buffers(g, c, false);
1688 if (err != 0U) {
1689 goto clean_up;
1690 }
1691
1692 /* override a few ctx state registers */
1693 g->ops.gr.commit_global_timeslice(g, c);
1694
1695 /* floorsweep anything left */
1696 err = g->ops.gr.init_fs_state(g);
1697 if (err != 0U) {
1698 goto clean_up;
1699 }
1700
1701 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1702 GR_IDLE_CHECK_DEFAULT);
1703 if (err != 0U) {
1704 goto restore_fe_go_idle;
1705 }
1706
1707 err = gk20a_init_sw_bundle(g);
1708 if (err != 0U) {
1709 goto clean_up;
1710 }
1711
1712restore_fe_go_idle:
1713 /* restore fe_go_idle */
1714 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1715 gr_fe_go_idle_timeout_count_prod_f());
1716
1717 if ((err != 0U) || (gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1718 GR_IDLE_CHECK_DEFAULT) != 0)) {
1719 goto clean_up;
1720 }
1721
1722 /* load method init */
1723 if (sw_method_init->count) {
1724 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1725 sw_method_init->l[0].value);
1726 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1727 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1728 sw_method_init->l[0].addr);
1729 last_method_data = sw_method_init->l[0].value;
1730 }
1731 for (i = 1; i < sw_method_init->count; i++) {
1732 if (sw_method_init->l[i].value != last_method_data) {
1733 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1734 sw_method_init->l[i].value);
1735 last_method_data = sw_method_init->l[i].value;
1736 }
1737 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1738 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1739 sw_method_init->l[i].addr);
1740 }
1741
1742 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1743 GR_IDLE_CHECK_DEFAULT);
1744 if (err != 0U) {
1745 goto clean_up;
1746 }
1747
1748 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1749 ctx_header_words >>= 2;
1750
1751 g->ops.mm.l2_flush(g, true);
1752
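	/* Copy the context header of the freshly initialized channel context
	 * into the golden context buffer, one 32-bit word at a time. */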
1753 for (i = 0; i < ctx_header_words; i++) {
1754 data = nvgpu_mem_rd32(g, gr_mem, i);
1755 nvgpu_mem_wr32(g, gold_mem, i, data);
1756 }
1757 nvgpu_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(),
1758 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1759
1760 g->ops.gr.write_zcull_ptr(g, gold_mem, 0);
1761
1762 err = g->ops.gr.commit_inst(c, gr_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1763 if (err != 0U) {
1764 goto clean_up;
1765 }
1766
1767 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1768
1769
1770
1771 if (gr->ctx_vars.local_golden_image == NULL) {
1772
1773 gr->ctx_vars.local_golden_image =
1774 nvgpu_vzalloc(g, gr->ctx_vars.golden_image_size);
1775
1776 if (gr->ctx_vars.local_golden_image == NULL) {
1777 err = -ENOMEM;
1778 goto clean_up;
1779 }
1780 nvgpu_mem_rd_n(g, gold_mem, 0,
1781 gr->ctx_vars.local_golden_image,
1782 gr->ctx_vars.golden_image_size);
1783
1784 }
1785
1786 err = g->ops.gr.commit_inst(c, gr_mem->gpu_va);
1787 if (err != 0U) {
1788 goto clean_up;
1789 }
1790
1791 gr->ctx_vars.golden_image_initialized = true;
1792
1793 gk20a_writel(g, gr_fecs_current_ctx_r(),
1794 gr_fecs_current_ctx_valid_false_f());
1795
1796clean_up:
1797 if (err != 0U) {
1798 nvgpu_err(g, "fail");
1799 } else {
1800 nvgpu_log_fn(g, "done");
1801 }
1802
1803 nvgpu_mutex_release(&gr->ctx_mutex);
1804 return err;
1805}
1806
1807int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1808 struct channel_gk20a *c,
1809 bool enable_smpc_ctxsw)
1810{
1811 struct tsg_gk20a *tsg;
1812 struct nvgpu_gr_ctx *gr_ctx = NULL;
1813 struct nvgpu_mem *mem = NULL;
1814 u32 data;
1815 int ret;
1816
1817 nvgpu_log_fn(g, " ");
1818
1819 tsg = tsg_gk20a_from_ch(c);
1820 if (tsg == NULL) {
1821 return -EINVAL;
1822 }
1823
1824 gr_ctx = &tsg->gr_ctx;
1825 mem = &gr_ctx->mem;
1826 if (!nvgpu_mem_is_valid(mem)) {
1827 nvgpu_err(g, "no graphics context allocated");
1828 return -EFAULT;
1829 }
1830
1831 ret = gk20a_disable_channel_tsg(g, c);
1832 if (ret) {
1833 nvgpu_err(g, "failed to disable channel/TSG");
1834 goto out;
1835 }
1836 ret = gk20a_fifo_preempt(g, c);
1837 if (ret) {
1838 gk20a_enable_channel_tsg(g, c);
1839 nvgpu_err(g, "failed to preempt channel/TSG");
1840 goto out;
1841 }
1842
1843	/* Channel gr_ctx buffer is GPU cacheable.
1844	 * Flush and invalidate before CPU update. */
1845 g->ops.mm.l2_flush(g, true);
1846
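	/* Read-modify-write only the SMPC mode field of the PM word in the
	 * context image, leaving the rest of the PM state untouched. */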
1847 data = nvgpu_mem_rd(g, mem,
1848 ctxsw_prog_main_image_pm_o());
1849
1850 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1851 data |= enable_smpc_ctxsw ?
1852 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1853 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1854
1855 nvgpu_mem_wr(g, mem,
1856 ctxsw_prog_main_image_pm_o(), data);
1857
1858out:
1859 gk20a_enable_channel_tsg(g, c);
1860 return ret;
1861}
1862
1863int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1864 struct channel_gk20a *c,
1865 u64 gpu_va,
1866 u32 mode)
1867{
1868 struct tsg_gk20a *tsg;
1869 struct nvgpu_mem *gr_mem = NULL;
1870 struct nvgpu_gr_ctx *gr_ctx;
1871 struct pm_ctx_desc *pm_ctx;
1872 u32 data;
1873 u64 virt_addr = 0;
1874 struct nvgpu_mem *ctxheader = &c->ctx_header;
1875 int ret;
1876
1877 nvgpu_log_fn(g, " ");
1878
1879 tsg = tsg_gk20a_from_ch(c);
1880 if (tsg == NULL) {
1881 return -EINVAL;
1882 }
1883
1884 gr_ctx = &tsg->gr_ctx;
1885 pm_ctx = &gr_ctx->pm_ctx;
1886 gr_mem = &gr_ctx->mem;
1887 if (!nvgpu_mem_is_valid(gr_mem)) {
1888 nvgpu_err(g, "no graphics context allocated");
1889 return -EFAULT;
1890 }
1891
1892 if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
1893 (g->ops.gr.get_hw_accessor_stream_out_mode == NULL)) {
1894 nvgpu_err(g, "Mode-E hwpm context switch mode is not supported");
1895 return -EINVAL;
1896 }
1897
1898 switch (mode) {
1899 case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
1900 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) {
1901 return 0;
1902 }
1903 break;
1904 case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
1905 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
1906 return 0;
1907 }
1908 break;
1909 case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
1910 if (pm_ctx->pm_mode == g->ops.gr.get_hw_accessor_stream_out_mode()) {
1911 return 0;
1912 }
1913 break;
1914 default:
1915 nvgpu_err(g, "invalid hwpm context switch mode");
1916 return -EINVAL;
1917 }
1918
1919 ret = gk20a_disable_channel_tsg(g, c);
1920 if (ret) {
1921 nvgpu_err(g, "failed to disable channel/TSG");
1922 return ret;
1923 }
1924
1925 ret = gk20a_fifo_preempt(g, c);
1926 if (ret) {
1927 gk20a_enable_channel_tsg(g, c);
1928 nvgpu_err(g, "failed to preempt channel/TSG");
1929 return ret;
1930 }
1931
1932	/* Channel gr_ctx buffer is GPU cacheable.
1933	 * Flush and invalidate before CPU update. */
1934 g->ops.mm.l2_flush(g, true);
1935
1936 if (mode != NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW) {
1937 /* Allocate buffer if necessary */
1938 if (pm_ctx->mem.gpu_va == 0) {
1939 ret = nvgpu_dma_alloc_sys(g,
1940 g->gr.ctx_vars.pm_ctxsw_image_size,
1941 &pm_ctx->mem);
1942 if (ret) {
1943 c->g->ops.fifo.enable_channel(c);
1944 nvgpu_err(g,
1945 "failed to allocate pm ctxt buffer");
1946 return ret;
1947 }
1948
1949 pm_ctx->mem.gpu_va = nvgpu_gmmu_map_fixed(c->vm,
1950 &pm_ctx->mem,
1951 gpu_va,
1952 pm_ctx->mem.size,
1953 NVGPU_VM_MAP_CACHEABLE,
1954 gk20a_mem_flag_none, true,
1955 pm_ctx->mem.aperture);
1956 if (pm_ctx->mem.gpu_va == 0ULL) {
1957 nvgpu_err(g,
1958 "failed to map pm ctxt buffer");
1959 nvgpu_dma_free(g, &pm_ctx->mem);
1960 c->g->ops.fifo.enable_channel(c);
1961 return -ENOMEM;
1962 }
1963 }
1964
1965 if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
1966 (g->ops.gr.init_hwpm_pmm_register != NULL)) {
1967 g->ops.gr.init_hwpm_pmm_register(g);
1968 }
1969 }
1970
1971 data = nvgpu_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o());
1972 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1973
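	/* Record the new PM mode in the context image and select the PM
	 * buffer GPU VA to program; the no-ctxsw mode uses a VA of 0. */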
1974 switch (mode) {
1975 case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
1976 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();
1977 virt_addr = pm_ctx->mem.gpu_va;
1978 break;
1979 case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
1980 pm_ctx->pm_mode = g->ops.gr.get_hw_accessor_stream_out_mode();
1981 virt_addr = pm_ctx->mem.gpu_va;
1982 break;
1983 case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
1984 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1985 virt_addr = 0;
1986 }
1987
1988 data |= pm_ctx->pm_mode;
1989
1990 nvgpu_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data);
1991
1992 if (ctxheader->gpu_va) {
1993 struct channel_gk20a *ch;
1994
1995 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
1996 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
1997 g->ops.gr.write_pm_ptr(g, &ch->ctx_header, virt_addr);
1998 }
1999 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
2000 } else {
2001 g->ops.gr.write_pm_ptr(g, gr_mem, virt_addr);
2002 }
2003
2004 /* enable channel */
2005 gk20a_enable_channel_tsg(g, c);
2006
2007 return 0;
2008}
2009
2010void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
2011 struct nvgpu_mem *mem)
2012{
2013 nvgpu_mem_wr(g, mem,
2014 ctxsw_prog_main_image_num_save_ops_o(), 0);
2015 nvgpu_mem_wr(g, mem,
2016 ctxsw_prog_main_image_num_restore_ops_o(), 0);
2017}
2018
2019	/* load a saved fresh copy of the golden image into the channel gr_ctx */
2020int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
2021 struct channel_gk20a *c)
2022{
2023 struct gr_gk20a *gr = &g->gr;
2024 struct tsg_gk20a *tsg;
2025 struct nvgpu_gr_ctx *gr_ctx;
2026 u32 virt_addr_lo;
2027 u32 virt_addr_hi;
2028 u64 virt_addr = 0;
2029 u32 v, data;
2030 int ret = 0;
2031 struct nvgpu_mem *mem;
2032
2033 nvgpu_log_fn(g, " ");
2034
2035 tsg = tsg_gk20a_from_ch(c);
2036 if (tsg == NULL) {
2037 return -EINVAL;
2038 }
2039
2040 gr_ctx = &tsg->gr_ctx;
2041 mem = &gr_ctx->mem;
2042 if (gr->ctx_vars.local_golden_image == NULL) {
2043 return -EINVAL;
2044 }
2045
2046	/* Channel gr_ctx buffer is GPU cacheable.
2047	 * Flush and invalidate before CPU update. */
2048 g->ops.mm.l2_flush(g, true);
2049
2050 nvgpu_mem_wr_n(g, mem, 0,
2051 gr->ctx_vars.local_golden_image,
2052 gr->ctx_vars.golden_image_size);
2053
2054 if (g->ops.gr.init_ctxsw_hdr_data) {
2055 g->ops.gr.init_ctxsw_hdr_data(g, mem);
2056 }
2057
2058 if ((g->ops.gr.enable_cde_in_fecs != NULL) && c->cde) {
2059 g->ops.gr.enable_cde_in_fecs(g, mem);
2060 }
2061
2062 /* set priv access map */
2063 virt_addr_lo =
2064 u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
2065 virt_addr_hi =
2066 u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
2067
2068 if (g->allow_all) {
2069 data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f();
2070 } else {
2071 data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();
2072 }
2073
2074 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(),
2075 data);
2076
2077 nvgpu_mem_wr(g, mem,
2078 ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
2079 virt_addr_lo);
2080 nvgpu_mem_wr(g, mem,
2081 ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
2082 virt_addr_hi);
2083
2084 /* disable verif features */
2085 v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o());
2086 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
2087 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
2088 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v);
2089
2090 if (g->ops.gr.update_ctxsw_preemption_mode) {
2091 g->ops.gr.update_ctxsw_preemption_mode(g, c, mem);
2092 }
2093
2094 if (g->ops.gr.update_boosted_ctx) {
2095 g->ops.gr.update_boosted_ctx(g, mem, gr_ctx);
2096 }
2097
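	/* Program the patch context buffer address and the number of valid
	 * patch entries into the main context header. */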
2098 virt_addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
2099 virt_addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
2100
2101 nvgpu_log(g, gpu_dbg_info, "write patch count = %d",
2102 gr_ctx->patch_ctx.data_count);
2103 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(),
2104 gr_ctx->patch_ctx.data_count);
2105
2106 nvgpu_mem_wr(g, mem,
2107 ctxsw_prog_main_image_patch_adr_lo_o(),
2108 virt_addr_lo);
2109 nvgpu_mem_wr(g, mem,
2110 ctxsw_prog_main_image_patch_adr_hi_o(),
2111 virt_addr_hi);
2112
2113 /* Update main header region of the context buffer with the info needed
2114 * for PM context switching, including mode and possibly a pointer to
2115 * the PM backing store.
2116 */
2117 if (gr_ctx->pm_ctx.pm_mode != ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
2118 if (gr_ctx->pm_ctx.mem.gpu_va == 0) {
2119 nvgpu_err(g,
2120 "context switched pm with no pm buffer!");
2121 return -EFAULT;
2122 }
2123
2124 virt_addr = gr_ctx->pm_ctx.mem.gpu_va;
2125 } else {
2126 virt_addr = 0;
2127 }
2128
2129 data = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_pm_o());
2130 data = data & ~ctxsw_prog_main_image_pm_mode_m();
2131 data |= gr_ctx->pm_ctx.pm_mode;
2132
2133 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data);
2134
2135 g->ops.gr.write_pm_ptr(g, mem, virt_addr);
2136
2137 return ret;
2138}
2139
2140static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
2141{
2142 nvgpu_log_fn(g, " ");
2143
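	/* Clear FECS ctxsw mailbox 0, drop the require-ctx constraint on both
	 * falcons' DMA engines, then start the GPCCS and FECS CPUs. */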
2144 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
2145 gr_fecs_ctxsw_mailbox_clear_value_f(~0));
2146
2147 gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
2148 gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
2149
2150 gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
2151 gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
2152
2153 nvgpu_log_fn(g, "done");
2154}
2155
2156static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
2157{
2158 struct mm_gk20a *mm = &g->mm;
2159 struct vm_gk20a *vm = mm->pmu.vm;
2160 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2161 int err;
2162
2163 err = g->ops.mm.alloc_inst_block(g, &ucode_info->inst_blk_desc);
2164 if (err != 0) {
2165 return err;
2166 }
2167
2168 g->ops.mm.init_inst_block(&ucode_info->inst_blk_desc, vm, 0);
2169
2170 /* Map ucode surface to GMMU */
2171 ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm,
2172 &ucode_info->surface_desc,
2173 ucode_info->surface_desc.size,
2174 0, /* flags */
2175 gk20a_mem_flag_read_only,
2176 false,
2177 ucode_info->surface_desc.aperture);
2178 if (ucode_info->surface_desc.gpu_va == 0ULL) {
2179 nvgpu_err(g, "failed to update gmmu ptes");
2180 return -ENOMEM;
2181 }
2182
2183 return 0;
2184}
2185
2186static void gr_gk20a_init_ctxsw_ucode_segment(
2187 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
2188{
2189 p_seg->offset = *offset;
2190 p_seg->size = size;
2191 *offset = ALIGN(*offset + size, BLK_SIZE);
2192}
2193
2194static void gr_gk20a_init_ctxsw_ucode_segments(
2195 struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
2196 struct gk20a_ctxsw_bootloader_desc *bootdesc,
2197 u32 code_size, u32 data_size)
2198{
2199 u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
2200 segments->boot_entry = bootdesc->entry_point;
2201 segments->boot_imem_offset = bootdesc->imem_offset;
2202 gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
2203 gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
2204 gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
2205}
2206
2207static int gr_gk20a_copy_ctxsw_ucode_segments(
2208 struct gk20a *g,
2209 struct nvgpu_mem *dst,
2210 struct gk20a_ctxsw_ucode_segments *segments,
2211 u32 *bootimage,
2212 u32 *code, u32 *data)
2213{
2214 unsigned int i;
2215
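	/* Copy the boot, code and data segments into the ucode surface at the
	 * offsets assigned by gr_gk20a_init_ctxsw_ucode_segments(). */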
2216 nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage,
2217 segments->boot.size);
2218 nvgpu_mem_wr_n(g, dst, segments->code.offset, code,
2219 segments->code.size);
2220 nvgpu_mem_wr_n(g, dst, segments->data.offset, data,
2221 segments->data.size);
2222
2223 /* compute a "checksum" for the boot binary to detect its version */
2224 segments->boot_signature = 0;
2225 for (i = 0; i < segments->boot.size / sizeof(u32); i++) {
2226 segments->boot_signature += bootimage[i];
2227 }
2228
2229 return 0;
2230}
2231
2232int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2233{
2234 struct mm_gk20a *mm = &g->mm;
2235 struct vm_gk20a *vm = mm->pmu.vm;
2236 struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
2237 struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
2238 struct nvgpu_firmware *fecs_fw;
2239 struct nvgpu_firmware *gpccs_fw;
2240 u32 *fecs_boot_image;
2241 u32 *gpccs_boot_image;
2242 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2243 u32 ucode_size;
2244 int err = 0;
2245
2246 fecs_fw = nvgpu_request_firmware(g, GK20A_FECS_UCODE_IMAGE, 0);
2247 if (fecs_fw == NULL) {
2248 nvgpu_err(g, "failed to load fecs ucode!!");
2249 return -ENOENT;
2250 }
2251
2252 fecs_boot_desc = (void *)fecs_fw->data;
2253 fecs_boot_image = (void *)(fecs_fw->data +
2254 sizeof(struct gk20a_ctxsw_bootloader_desc));
2255
2256 gpccs_fw = nvgpu_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE, 0);
2257 if (gpccs_fw == NULL) {
2258 nvgpu_release_firmware(g, fecs_fw);
2259 nvgpu_err(g, "failed to load gpccs ucode!!");
2260 return -ENOENT;
2261 }
2262
2263 gpccs_boot_desc = (void *)gpccs_fw->data;
2264 gpccs_boot_image = (void *)(gpccs_fw->data +
2265 sizeof(struct gk20a_ctxsw_bootloader_desc));
2266
2267 ucode_size = 0;
2268 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
2269 fecs_boot_desc,
2270 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
2271 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
2272 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
2273 gpccs_boot_desc,
2274 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
2275 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
2276
2277 err = nvgpu_dma_alloc_sys(g, ucode_size, &ucode_info->surface_desc);
2278 if (err != 0) {
2279 goto clean_up;
2280 }
2281
2282 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2283 &ucode_info->fecs,
2284 fecs_boot_image,
2285 g->gr.ctx_vars.ucode.fecs.inst.l,
2286 g->gr.ctx_vars.ucode.fecs.data.l);
2287
2288 nvgpu_release_firmware(g, fecs_fw);
2289 fecs_fw = NULL;
2290
2291 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2292 &ucode_info->gpccs,
2293 gpccs_boot_image,
2294 g->gr.ctx_vars.ucode.gpccs.inst.l,
2295 g->gr.ctx_vars.ucode.gpccs.data.l);
2296
2297 nvgpu_release_firmware(g, gpccs_fw);
2298 gpccs_fw = NULL;
2299
2300 err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
2301 if (err != 0) {
2302 goto clean_up;
2303 }
2304
2305 return 0;
2306
2307clean_up:
2308 if (ucode_info->surface_desc.gpu_va) {
2309 nvgpu_gmmu_unmap(vm, &ucode_info->surface_desc,
2310 ucode_info->surface_desc.gpu_va);
2311 }
2312 nvgpu_dma_free(g, &ucode_info->surface_desc);
2313
2314 nvgpu_release_firmware(g, gpccs_fw);
2315 gpccs_fw = NULL;
2316 nvgpu_release_firmware(g, fecs_fw);
2317 fecs_fw = NULL;
2318
2319 return err;
2320}
2321
2322static void gr_gk20a_wait_for_fecs_arb_idle(struct gk20a *g)
2323{
2324 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2325 u32 val;
2326
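	/* First poll until the pending arbiter command has been consumed,
	 * then poll until the FECS ctxsw status reports the arbiter idle. */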
2327 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2328 while ((gr_fecs_arb_ctx_cmd_cmd_v(val) != 0U) && (retries != 0)) {
2329 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2330 retries--;
2331 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2332 }
2333
2334 if (retries == 0) {
2335 nvgpu_err(g, "arbiter cmd timeout, fecs arb ctx cmd: 0x%08x",
2336 gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
2337 }
2338
2339 retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2340 while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2341 gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
2342 (retries != 0)) {
2343 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2344 retries--;
2345 }
2346 if (retries == 0) {
2347 nvgpu_err(g,
2348 "arbiter idle timeout, fecs ctxsw status: 0x%08x",
2349 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2350 }
2351}
2352
2353void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
2354{
2355 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2356 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2357 u64 inst_ptr;
2358
2359 while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2360 gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
2361 (retries != 0)) {
2362 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2363 retries--;
2364 }
2365 if (retries == 0) {
2366 nvgpu_err(g,
2367 "arbiter idle timeout, status: %08x",
2368 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2369 }
2370
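	/* Program the ucode instance block into the FECS new-context and
	 * arbiter-context pointers (address >> 12 plus aperture), then issue
	 * arbiter command 0x7; completion is polled below. */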
2371 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2372
2373 inst_ptr = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc);
2374 gk20a_writel(g, gr_fecs_new_ctx_r(),
2375 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2376 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2377 gr_fecs_new_ctx_target_sys_mem_ncoh_f(),
2378 gr_fecs_new_ctx_target_sys_mem_coh_f(),
2379 gr_fecs_new_ctx_target_vid_mem_f()) |
2380 gr_fecs_new_ctx_valid_m());
2381
2382 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2383 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2384 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2385 gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(),
2386 gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(),
2387 gr_fecs_arb_ctx_ptr_target_vid_mem_f()));
2388
2389 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2390
2391 /* Wait for arbiter command to complete */
2392 gr_gk20a_wait_for_fecs_arb_idle(g);
2393
2394 gk20a_writel(g, gr_fecs_current_ctx_r(),
2395 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2396 gr_fecs_current_ctx_target_m() |
2397 gr_fecs_current_ctx_valid_m());
2398 /* Send command to arbiter to flush */
2399 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2400
2401 gr_gk20a_wait_for_fecs_arb_idle(g);
2402
2403}
2404
2405void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
2406 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2407{
2408 u32 addr_code32;
2409 u32 addr_data32;
2410
2411 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2412 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2413
2414 /*
2415 * Copy falcon bootloader header into dmem at offset 0.
2416 * Configure dmem port 0 for auto-incrementing writes starting at dmem
2417 * offset 0.
2418 */
2419 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2420 gr_fecs_dmemc_offs_f(0) |
2421 gr_fecs_dmemc_blk_f(0) |
2422 gr_fecs_dmemc_aincw_f(1));
2423
2424 /* Write out the actual data */
2425 switch (segments->boot_signature) {
2426 case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED:
2427 case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
2428 case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
2429 case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
2430 case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
2431 case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
2432 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2433 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2434 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2435 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2436 /* fallthrough */
2437 case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
2438 case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
2439 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
2440 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
2441 case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
2442 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2443 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2444 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2445 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2446 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
2447 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2448 addr_code32);
2449 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2450 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2451 segments->code.size);
2452 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2453 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2454 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2455 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2456 addr_data32);
2457 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2458 segments->data.size);
2459 break;
2460 case FALCON_UCODE_SIG_T12X_FECS_OLDER:
2461 case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
2462 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2463 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2464 addr_code32);
2465 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2466 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2467 segments->code.size);
2468 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2469 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2470 addr_data32);
2471 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2472 segments->data.size);
2473 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2474 addr_code32);
2475 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2476 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2477 break;
2478 default:
2479 nvgpu_err(g,
2480 "unknown falcon ucode boot signature 0x%08x"
2481 " with reg_offset 0x%08x",
2482 segments->boot_signature, reg_offset);
2483 BUG();
2484 }
2485}
2486
2487void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
2488 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2489{
2490 u32 addr_load32;
2491 u32 blocks;
2492 u32 b;
2493 u32 dst;
2494
2495 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2496 blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2497
2498 /*
2499 * Set the base FB address for the DMA transfer. Subtract off the 256
2500 * byte IMEM block offset such that the relative FB and IMEM offsets
2501 * match, allowing the IMEM tags to be properly created.
2502 */
2503
2504 dst = segments->boot_imem_offset;
2505 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2506 (addr_load32 - (dst >> 8)));
2507
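	/* DMA the bootloader into IMEM one 256-byte block at a time: each
	 * iteration programs matching IMEM and FB offsets and issues a
	 * single dmatrfcmd. */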
2508 for (b = 0; b < blocks; b++) {
2509 /* Setup destination IMEM offset */
2510 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2511 dst + (b << 8));
2512
2513 /* Setup source offset (relative to BASE) */
2514 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2515 dst + (b << 8));
2516
2517 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2518 gr_fecs_dmatrfcmd_imem_f(0x01) |
2519 gr_fecs_dmatrfcmd_write_f(0x00) |
2520 gr_fecs_dmatrfcmd_size_f(0x06) |
2521 gr_fecs_dmatrfcmd_ctxdma_f(0));
2522 }
2523
2524 /* Specify the falcon boot vector */
2525 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2526 gr_fecs_bootvec_vec_f(segments->boot_entry));
2527}
2528
2529static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2530{
2531 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2532 u64 addr_base = ucode_info->surface_desc.gpu_va;
2533
2534 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2535
2536 gr_gk20a_load_falcon_bind_instblk(g);
2537
2538 g->ops.gr.falcon_load_ucode(g, addr_base,
2539 &g->ctxsw_ucode_info.fecs, 0);
2540
2541 g->ops.gr.falcon_load_ucode(g, addr_base,
2542 &g->ctxsw_ucode_info.gpccs,
2543 gr_gpcs_gpccs_falcon_hwcfg_r() -
2544 gr_fecs_falcon_hwcfg_r());
2545}
2546
2547int gr_gk20a_load_ctxsw_ucode(struct gk20a *g)
2548{
2549 int err;
2550
2551 nvgpu_log_fn(g, " ");
2552
2553 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
2554 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2555 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2556 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2557 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2558 }
2559
2560 /*
2561 * In case bootloader is not supported, revert to the old way of
2562 * loading gr ucode, without the faster bootstrap routine.
2563 */
2564 if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
2565 gr_gk20a_load_falcon_dmem(g);
2566 gr_gk20a_load_falcon_imem(g);
2567 gr_gk20a_start_falcon_ucode(g);
2568 } else {
2569 if (!g->gr.skip_ucode_init) {
2570 err = gr_gk20a_init_ctxsw_ucode(g);
2571
2572 if (err != 0) {
2573 return err;
2574 }
2575 }
2576 gr_gk20a_load_falcon_with_bootloader(g);
2577 g->gr.skip_ucode_init = true;
2578 }
2579 nvgpu_log_fn(g, "done");
2580 return 0;
2581}
2582
2583int gr_gk20a_set_fecs_watchdog_timeout(struct gk20a *g)
2584{
2585 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2586 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2587 gk20a_writel(g, gr_fecs_method_push_r(),
2588 gr_fecs_method_push_adr_set_watchdog_timeout_f());
2589
2590 return 0;
2591}
2592
2593static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g)
2594{
2595	int ret;
2596
2597 nvgpu_log_fn(g, " ");
2598
2599 ret = gr_gk20a_ctx_wait_ucode(g, 0, NULL,
2600 GR_IS_UCODE_OP_EQUAL,
2601 eUcodeHandshakeInitComplete,
2602 GR_IS_UCODE_OP_SKIP, 0, false);
2603 if (ret) {
2604 nvgpu_err(g, "falcon ucode init timeout");
2605 return ret;
2606 }
2607
2608 if (nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP) ||
2609 nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
2610 gk20a_writel(g, gr_fecs_current_ctx_r(),
2611 gr_fecs_current_ctx_valid_false_f());
2612 }
2613
2614 ret = g->ops.gr.set_fecs_watchdog_timeout(g);
2615 if (ret) {
2616 nvgpu_err(g, "fail to set watchdog timeout");
2617 return ret;
2618 }
2619
2620 nvgpu_log_fn(g, "done");
2621 return 0;
2622}
2623
2624int gr_gk20a_init_ctx_state(struct gk20a *g)
2625{
2626	int ret;
2627 struct fecs_method_op_gk20a op = {
2628 .mailbox = { .id = 0, .data = 0,
2629 .clr = ~0, .ok = 0, .fail = 0},
2630 .method.data = 0,
2631 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2632 .cond.fail = GR_IS_UCODE_OP_SKIP,
2633 };
2634
2635 nvgpu_log_fn(g, " ");
2636 /* query ctxsw image sizes, if golden context is not created */
2637 if (!g->gr.ctx_vars.golden_image_initialized) {
2638 op.method.addr =
2639 gr_fecs_method_push_adr_discover_image_size_v();
2640 op.mailbox.ret = &g->gr.ctx_vars.golden_image_size;
2641 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2642 if (ret) {
2643 nvgpu_err(g,
2644 "query golden image size failed");
2645 return ret;
2646 }
2647 op.method.addr =
2648 gr_fecs_method_push_adr_discover_zcull_image_size_v();
2649 op.mailbox.ret = &g->gr.ctx_vars.zcull_ctxsw_image_size;
2650 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2651 if (ret) {
2652 nvgpu_err(g,
2653 "query zcull ctx image size failed");
2654 return ret;
2655 }
2656 op.method.addr =
2657 gr_fecs_method_push_adr_discover_pm_image_size_v();
2658 op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size;
2659 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2660 if (ret) {
2661 nvgpu_err(g,
2662 "query pm ctx image size failed");
2663 return ret;
2664 }
2665 g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2666#ifdef CONFIG_GK20A_CTXSW_TRACE
2667 g->gr.ctx_vars.fecs_trace_buffer_size =
2668 gk20a_fecs_trace_buffer_size(g);
2669#endif
2670 }
2671
2672 nvgpu_log_fn(g, "done");
2673 return 0;
2674}
2675
2676void gk20a_gr_destroy_ctx_buffer(struct gk20a *g,
2677 struct gr_ctx_buffer_desc *desc)
2678{
2679 if (desc == NULL) {
2680 return;
2681 }
2682 nvgpu_dma_free(g, &desc->mem);
2683 desc->destroy = NULL;
2684}
2685
2686int gk20a_gr_alloc_ctx_buffer(struct gk20a *g,
2687 struct gr_ctx_buffer_desc *desc,
2688 size_t size)
2689{
2690 int err = 0;
2691
2692 nvgpu_log_fn(g, " ");
2693
2694 if (nvgpu_mem_is_valid(&desc->mem)) {
2695 return 0;
2696 }
2697
2698 err = nvgpu_dma_alloc_sys(g, size, &desc->mem);
2699 if (err != 0) {
2700 return err;
2701 }
2702
2703 desc->destroy = gk20a_gr_destroy_ctx_buffer;
2704
2705 return err;
2706}
2707
2708static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2709{
2710 struct gr_gk20a *gr = &g->gr;
2711 u32 i;
2712
2713 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2714 /* destroy exists iff buffer is allocated */
2715 if (gr->global_ctx_buffer[i].destroy) {
2716 gr->global_ctx_buffer[i].destroy(g,
2717 &gr->global_ctx_buffer[i]);
2718 }
2719 }
2720
2721 nvgpu_log_fn(g, "done");
2722}
2723
2724int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2725{
2726 struct gr_gk20a *gr = &g->gr;
2727 int attr_buffer_size, err;
2728
2729 u32 cb_buffer_size = gr->bundle_cb_default_size *
2730 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2731
2732 u32 pagepool_buffer_size = g->ops.gr.pagepool_default_size(g) *
2733 gr_scc_pagepool_total_pages_byte_granularity_v();
2734
2735 nvgpu_log_fn(g, " ");
2736
2737 attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2738
2739 nvgpu_log_info(g, "cb_buffer_size : %d", cb_buffer_size);
2740
2741 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[CIRCULAR],
2742 cb_buffer_size);
2743 if (err != 0) {
2744 goto clean_up;
2745 }
2746
2747 if (g->ops.secure_alloc) {
2748 err = g->ops.secure_alloc(g,
2749 &gr->global_ctx_buffer[CIRCULAR_VPR],
2750 cb_buffer_size);
2751 if (err != 0) {
2752 goto clean_up;
2753 }
2754 }
2755
2756 nvgpu_log_info(g, "pagepool_buffer_size : %d", pagepool_buffer_size);
2757
2758 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[PAGEPOOL],
2759 pagepool_buffer_size);
2760 if (err != 0) {
2761 goto clean_up;
2762 }
2763
2764 if (g->ops.secure_alloc) {
2765 err = g->ops.secure_alloc(g,
2766 &gr->global_ctx_buffer[PAGEPOOL_VPR],
2767 pagepool_buffer_size);
2768 if (err != 0) {
2769 goto clean_up;
2770 }
2771 }
2772
2773 nvgpu_log_info(g, "attr_buffer_size : %d", attr_buffer_size);
2774
2775 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[ATTRIBUTE],
2776 attr_buffer_size);
2777 if (err != 0) {
2778 goto clean_up;
2779 }
2780
2781 if (g->ops.secure_alloc) {
2782 err = g->ops.secure_alloc(g,
2783 &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2784 attr_buffer_size);
2785 if (err != 0) {
2786 goto clean_up;
2787 }
2788 }
2789
2790 nvgpu_log_info(g, "golden_image_size : %d",
2791 gr->ctx_vars.golden_image_size);
2792
2793 err = gk20a_gr_alloc_ctx_buffer(g,
2794 &gr->global_ctx_buffer[GOLDEN_CTX],
2795 gr->ctx_vars.golden_image_size);
2796 if (err != 0) {
2797 goto clean_up;
2798 }
2799
2800 nvgpu_log_info(g, "priv_access_map_size : %d",
2801 gr->ctx_vars.priv_access_map_size);
2802
2803 err = gk20a_gr_alloc_ctx_buffer(g,
2804 &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2805 gr->ctx_vars.priv_access_map_size);
2806
2807 if (err != 0) {
2808 goto clean_up;
2809 }
2810
2811#ifdef CONFIG_GK20A_CTXSW_TRACE
2812 nvgpu_log_info(g, "fecs_trace_buffer_size : %d",
2813 gr->ctx_vars.fecs_trace_buffer_size);
2814
2815 err = nvgpu_dma_alloc_sys(g,
2816 gr->ctx_vars.fecs_trace_buffer_size,
2817 &gr->global_ctx_buffer[FECS_TRACE_BUFFER].mem);
2818 if (err != 0) {
2819 goto clean_up;
2820 }
2821
2822 gr->global_ctx_buffer[FECS_TRACE_BUFFER].destroy =
2823 gk20a_gr_destroy_ctx_buffer;
2824#endif
2825
2826 nvgpu_log_fn(g, "done");
2827 return 0;
2828
2829 clean_up:
2830 nvgpu_err(g, "fail");
2831 gr_gk20a_free_global_ctx_buffers(g);
2832 return -ENOMEM;
2833}
2834
2835static void gr_gk20a_unmap_global_ctx_buffers(struct gk20a *g,
2836 struct vm_gk20a *vm,
2837 struct nvgpu_gr_ctx *gr_ctx)
2838{
2839 u64 *g_bfr_va = gr_ctx->global_ctx_buffer_va;
2840 u64 *g_bfr_size = gr_ctx->global_ctx_buffer_size;
2841 int *g_bfr_index = gr_ctx->global_ctx_buffer_index;
2842 u32 i;
2843
2844 nvgpu_log_fn(g, " ");
2845
2846 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2847 if (g_bfr_index[i]) {
2848 struct nvgpu_mem *mem;
2849
2850 /*
2851 * Translate from VA index to buffer index to determine
2852 * the correct struct nvgpu_mem to use. Handles the VPR
2853 * vs non-VPR difference in context images.
2854 */
2855 mem = &g->gr.global_ctx_buffer[g_bfr_index[i]].mem;
2856
2857 nvgpu_gmmu_unmap(vm, mem, g_bfr_va[i]);
2858 }
2859 }
2860
2861 memset(g_bfr_va, 0, sizeof(gr_ctx->global_ctx_buffer_va));
2862 memset(g_bfr_size, 0, sizeof(gr_ctx->global_ctx_buffer_size));
2863 memset(g_bfr_index, 0, sizeof(gr_ctx->global_ctx_buffer_index));
2864
2865 gr_ctx->global_ctx_buffer_mapped = false;
2866}
2867
2868int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2869 struct channel_gk20a *c)
2870{
2871 struct tsg_gk20a *tsg;
2872 struct vm_gk20a *ch_vm = c->vm;
2873 u64 *g_bfr_va;
2874 u64 *g_bfr_size;
2875 int *g_bfr_index;
2876 struct gr_gk20a *gr = &g->gr;
2877 struct nvgpu_mem *mem;
2878 u64 gpu_va;
2879
2880 nvgpu_log_fn(g, " ");
2881
2882 tsg = tsg_gk20a_from_ch(c);
2883 if (tsg == NULL) {
2884 return -EINVAL;
2885 }
2886
2887 g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va;
2888 g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size;
2889 g_bfr_index = tsg->gr_ctx.global_ctx_buffer_index;
2890
2891 /* Circular Buffer */
2892 if (c->vpr &&
2893 nvgpu_mem_is_valid(&gr->global_ctx_buffer[CIRCULAR_VPR].mem)) {
2894 mem = &gr->global_ctx_buffer[CIRCULAR_VPR].mem;
2895 g_bfr_index[CIRCULAR_VA] = CIRCULAR_VPR;
2896 } else {
2897 mem = &gr->global_ctx_buffer[CIRCULAR].mem;
2898 g_bfr_index[CIRCULAR_VA] = CIRCULAR;
2899 }
2900
2901 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2902 NVGPU_VM_MAP_CACHEABLE,
2903 gk20a_mem_flag_none, true, mem->aperture);
2904 if (gpu_va == 0ULL) {
2905 goto clean_up;
2906 }
2907 g_bfr_va[CIRCULAR_VA] = gpu_va;
2908 g_bfr_size[CIRCULAR_VA] = mem->size;
2909
2910 /* Attribute Buffer */
2911 if (c->vpr &&
2912 nvgpu_mem_is_valid(&gr->global_ctx_buffer[ATTRIBUTE_VPR].mem)) {
2913 mem = &gr->global_ctx_buffer[ATTRIBUTE_VPR].mem;
2914 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE_VPR;
2915 } else {
2916 mem = &gr->global_ctx_buffer[ATTRIBUTE].mem;
2917 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE;
2918 }
2919
2920 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2921 NVGPU_VM_MAP_CACHEABLE,
2922 gk20a_mem_flag_none, false, mem->aperture);
2923 if (gpu_va == 0ULL) {
2924 goto clean_up;
2925 }
2926 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2927 g_bfr_size[ATTRIBUTE_VA] = mem->size;
2928
2929 /* Page Pool */
2930 if (c->vpr &&
2931 nvgpu_mem_is_valid(&gr->global_ctx_buffer[PAGEPOOL_VPR].mem)) {
2932 mem = &gr->global_ctx_buffer[PAGEPOOL_VPR].mem;
2933 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL_VPR;
2934 } else {
2935 mem = &gr->global_ctx_buffer[PAGEPOOL].mem;
2936 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL;
2937 }
2938
2939 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2940 NVGPU_VM_MAP_CACHEABLE,
2941 gk20a_mem_flag_none, true, mem->aperture);
2942 if (gpu_va == 0ULL) {
2943 goto clean_up;
2944 }
2945 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2946 g_bfr_size[PAGEPOOL_VA] = mem->size;
2947
2948 /* Golden Image */
2949 mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
2950 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2951 gk20a_mem_flag_none, true, mem->aperture);
2952 if (gpu_va == 0ULL) {
2953 goto clean_up;
2954 }
2955 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2956 g_bfr_size[GOLDEN_CTX_VA] = mem->size;
2957 g_bfr_index[GOLDEN_CTX_VA] = GOLDEN_CTX;
2958
2959 /* Priv register Access Map */
2960 mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
2961 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2962 gk20a_mem_flag_none, true, mem->aperture);
2963 if (gpu_va == 0ULL) {
2964 goto clean_up;
2965 }
2966 g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2967 g_bfr_size[PRIV_ACCESS_MAP_VA] = mem->size;
2968 g_bfr_index[PRIV_ACCESS_MAP_VA] = PRIV_ACCESS_MAP;
2969
2970 tsg->gr_ctx.global_ctx_buffer_mapped = true;
2971
2972#ifdef CONFIG_GK20A_CTXSW_TRACE
2973 /* FECS trace buffer */
2974 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
2975 mem = &gr->global_ctx_buffer[FECS_TRACE_BUFFER].mem;
2976 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2977 gk20a_mem_flag_none, true, mem->aperture);
2978		if (gpu_va == 0ULL) {
2979			goto clean_up;
		}
2980 g_bfr_va[FECS_TRACE_BUFFER_VA] = gpu_va;
2981 g_bfr_size[FECS_TRACE_BUFFER_VA] = mem->size;
2982 g_bfr_index[FECS_TRACE_BUFFER_VA] = FECS_TRACE_BUFFER;
2983 }
2984#endif
2985
2986 return 0;
2987
2988clean_up:
2989 gr_gk20a_unmap_global_ctx_buffers(g, ch_vm, &tsg->gr_ctx);
2990
2991 return -ENOMEM;
2992}
2993
2994int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
2995 struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,
2996 u32 class,
2997 u32 padding)
2998{
2999 struct gr_gk20a *gr = &g->gr;
3000 int err = 0;
3001
3002 nvgpu_log_fn(g, " ");
3003
3004 if (gr->ctx_vars.buffer_size == 0) {
3005 return 0;
3006 }
3007
3008 /* alloc channel gr ctx buffer */
3009 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
3010 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
3011
3012 err = nvgpu_dma_alloc(g, gr->ctx_vars.buffer_total_size, &gr_ctx->mem);
3013 if (err != 0) {
3014 return err;
3015 }
3016
3017 gr_ctx->mem.gpu_va = nvgpu_gmmu_map(vm,
3018 &gr_ctx->mem,
3019 gr_ctx->mem.size,
3020 0, /* not GPU-cacheable */
3021 gk20a_mem_flag_none, true,
3022 gr_ctx->mem.aperture);
3023 if (gr_ctx->mem.gpu_va == 0ULL) {
3024 goto err_free_mem;
3025 }
3026
3027 return 0;
3028
3029 err_free_mem:
3030 nvgpu_dma_free(g, &gr_ctx->mem);
3031
3032 return err;
3033}
3034
3035static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
3036 struct tsg_gk20a *tsg, u32 class, u32 padding)
3037{
3038 struct nvgpu_gr_ctx *gr_ctx = &tsg->gr_ctx;
3039 int err;
3040
3041 if (tsg->vm == NULL) {
3042 nvgpu_err(tsg->g, "No address space bound");
3043 return -ENOMEM;
3044 }
3045
3046 err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, class, padding);
3047 if (err != 0) {
3048 return err;
3049 }
3050
3051 gr_ctx->tsgid = tsg->tsgid;
3052
3053 return 0;
3054}
3055
3056void gr_gk20a_free_gr_ctx(struct gk20a *g,
3057 struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx)
3058{
3059 nvgpu_log_fn(g, " ");
3060
3061 if (gr_ctx->mem.gpu_va) {
3062 gr_gk20a_unmap_global_ctx_buffers(g, vm, gr_ctx);
3063 gr_gk20a_free_channel_patch_ctx(g, vm, gr_ctx);
3064 gr_gk20a_free_channel_pm_ctx(g, vm, gr_ctx);
3065
3066 if ((g->ops.gr.dump_ctxsw_stats != NULL) &&
3067 g->gr.ctx_vars.dump_ctxsw_stats_on_channel_close) {
3068 g->ops.gr.dump_ctxsw_stats(g, vm, gr_ctx);
3069 }
3070
3071 nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer);
3072 nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer);
3073 nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer);
3074 nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer);
3075 nvgpu_dma_unmap_free(vm, &gr_ctx->mem);
3076
3077 memset(gr_ctx, 0, sizeof(*gr_ctx));
3078 }
3079}
3080
3081void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg)
3082{
3083 struct gk20a *g = tsg->g;
3084
3085 if (tsg->vm == NULL) {
3086 nvgpu_err(g, "No address space bound");
3087 return;
3088 }
3089 tsg->g->ops.gr.free_gr_ctx(g, tsg->vm, &tsg->gr_ctx);
3090}
3091
3092u32 gr_gk20a_get_patch_slots(struct gk20a *g)
3093{
3094 return PATCH_CTX_SLOTS_PER_PAGE;
3095}
3096
3097static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
3098 struct channel_gk20a *c)
3099{
3100 struct tsg_gk20a *tsg;
3101 struct patch_desc *patch_ctx;
3102 struct vm_gk20a *ch_vm = c->vm;
3103 u32 alloc_size;
3104 int err = 0;
3105
3106 nvgpu_log_fn(g, " ");
3107
3108 tsg = tsg_gk20a_from_ch(c);
3109 if (tsg == NULL) {
3110 return -EINVAL;
3111 }
3112
3113 patch_ctx = &tsg->gr_ctx.patch_ctx;
3114 alloc_size = g->ops.gr.get_patch_slots(g) *
3115 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
3116
3117 nvgpu_log(g, gpu_dbg_info, "patch buffer size in entries: %d",
3118 alloc_size);
3119
3120 err = nvgpu_dma_alloc_map_sys(ch_vm,
3121 alloc_size * sizeof(u32), &patch_ctx->mem);
3122 if (err != 0) {
3123 return err;
3124 }
3125
3126 nvgpu_log_fn(g, "done");
3127 return 0;
3128}
3129
3130static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g,
3131 struct vm_gk20a *vm,
3132 struct nvgpu_gr_ctx *gr_ctx)
3133{
3134 struct patch_desc *patch_ctx = &gr_ctx->patch_ctx;
3135
3136 nvgpu_log_fn(g, " ");
3137
3138 if (patch_ctx->mem.gpu_va) {
3139 nvgpu_gmmu_unmap(vm, &patch_ctx->mem,
3140 patch_ctx->mem.gpu_va);
3141 }
3142
3143 nvgpu_dma_free(g, &patch_ctx->mem);
3144 patch_ctx->data_count = 0;
3145}
3146
3147static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g,
3148 struct vm_gk20a *vm,
3149 struct nvgpu_gr_ctx *gr_ctx)
3150{
3151 struct pm_ctx_desc *pm_ctx = &gr_ctx->pm_ctx;
3152
3153 nvgpu_log_fn(g, " ");
3154
3155 if (pm_ctx->mem.gpu_va) {
3156 nvgpu_gmmu_unmap(vm, &pm_ctx->mem, pm_ctx->mem.gpu_va);
3157
3158 nvgpu_dma_free(g, &pm_ctx->mem);
3159 }
3160}
3161
3162int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
3163{
3164 struct gk20a *g = c->g;
3165 struct nvgpu_gr_ctx *gr_ctx;
3166 struct tsg_gk20a *tsg = NULL;
3167 int err = 0;
3168
3169 nvgpu_log_fn(g, " ");
3170
3171	/* an address space needs to have been bound at this point. */
3172 if (!gk20a_channel_as_bound(c) && (c->vm == NULL)) {
3173 nvgpu_err(g,
3174 "not bound to address space at time"
3175 " of grctx allocation");
3176 return -EINVAL;
3177 }
3178
3179 if (!g->ops.gr.is_valid_class(g, class_num)) {
3180 nvgpu_err(g,
3181 "invalid obj class 0x%x", class_num);
3182 err = -EINVAL;
3183 goto out;
3184 }
3185 c->obj_class = class_num;
3186
3187 tsg = tsg_gk20a_from_ch(c);
3188 if (tsg == NULL) {
3189 return -EINVAL;
3190 }
3191
3192 gr_ctx = &tsg->gr_ctx;
3193
3194 if (!nvgpu_mem_is_valid(&gr_ctx->mem)) {
3195 tsg->vm = c->vm;
3196 nvgpu_vm_get(tsg->vm);
3197 err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg,
3198 class_num,
3199 flags);
3200 if (err != 0) {
3201 nvgpu_err(g,
3202 "fail to allocate TSG gr ctx buffer");
3203 nvgpu_vm_put(tsg->vm);
3204 tsg->vm = NULL;
3205 goto out;
3206 }
3207
3208 /* allocate patch buffer */
3209 if (!nvgpu_mem_is_valid(&gr_ctx->patch_ctx.mem)) {
3210 gr_ctx->patch_ctx.data_count = 0;
3211 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
3212 if (err != 0) {
3213 nvgpu_err(g,
3214 "fail to allocate patch buffer");
3215 goto out;
3216 }
3217 }
3218
3219 /* map global buffer to channel gpu_va and commit */
3220 err = g->ops.gr.map_global_ctx_buffers(g, c);
3221 if (err != 0) {
3222 nvgpu_err(g,
3223 "fail to map global ctx buffer");
3224 goto out;
3225 }
3226 g->ops.gr.commit_global_ctx_buffers(g, c, true);
3227
3228 /* commit gr ctx buffer */
3229 err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
3230 if (err != 0) {
3231 nvgpu_err(g,
3232 "fail to commit gr ctx buffer");
3233 goto out;
3234 }
3235
3236 /* init golden image */
3237 err = gr_gk20a_init_golden_ctx_image(g, c);
3238 if (err != 0) {
3239 nvgpu_err(g,
3240 "fail to init golden ctx image");
3241 goto out;
3242 }
3243
3244 /* Re-enable ELPG now that golden image has been initialized.
3245 * The PMU PG init code may already have tried to enable elpg, but
3246 * would not have been able to complete this action since the golden
3247 * image hadn't been initialized yet, so do this now.
3248 */
3249 err = nvgpu_pmu_reenable_elpg(g);
3250 if (err != 0) {
3251 nvgpu_err(g, "fail to re-enable elpg");
3252 goto out;
3253 }
3254
3255 /* load golden image */
3256		err = gr_gk20a_load_golden_ctx_image(g, c);
3257 if (err != 0) {
3258 nvgpu_err(g,
3259 "fail to load golden ctx image");
3260 goto out;
3261 }
3262#ifdef CONFIG_GK20A_CTXSW_TRACE
3263 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3264 err = g->ops.fecs_trace.bind_channel(g, c);
3265 if (err != 0) {
3266 nvgpu_warn(g,
3267 "fail to bind channel for ctxsw trace");
3268 }
3269 }
3270#endif
3271
3272 if (g->ops.gr.set_czf_bypass) {
3273 g->ops.gr.set_czf_bypass(g, c);
3274 }
3275
3276 /* PM ctxt switch is off by default */
3277 gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
3278 } else {
3279 /* commit gr ctx buffer */
3280 err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
3281 if (err != 0) {
3282 nvgpu_err(g,
3283 "fail to commit gr ctx buffer");
3284 goto out;
3285 }
3286#ifdef CONFIG_GK20A_CTXSW_TRACE
3287 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3288 err = g->ops.fecs_trace.bind_channel(g, c);
3289 if (err != 0) {
3290 nvgpu_warn(g,
3291 "fail to bind channel for ctxsw trace");
3292 }
3293 }
3294#endif
3295 }
3296
3297 nvgpu_log_fn(g, "done");
3298 return 0;
3299out:
3300	/* 1. The gr_ctx, patch_ctx and global ctx buffer mappings can be
3301	 *    reused, so there is no need to release them.
3302	 * 2. Golden image init and load are one-time operations, so if they
3303	 *    passed there is nothing to undo. */
3304 nvgpu_err(g, "fail");
3305 return err;
3306}
3307
3308static void gk20a_remove_gr_support(struct gr_gk20a *gr)
3309{
3310 struct gk20a *g = gr->g;
3311
3312 nvgpu_log_fn(g, " ");
3313
3314 gr_gk20a_free_cyclestats_snapshot_data(g);
3315
3316 gr_gk20a_free_global_ctx_buffers(g);
3317
3318 nvgpu_dma_free(g, &gr->compbit_store.mem);
3319
3320 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
3321
3322 nvgpu_kfree(g, gr->gpc_tpc_count);
3323 nvgpu_kfree(g, gr->gpc_zcb_count);
3324 nvgpu_kfree(g, gr->gpc_ppc_count);
3325 nvgpu_kfree(g, gr->pes_tpc_count[0]);
3326 nvgpu_kfree(g, gr->pes_tpc_count[1]);
3327 nvgpu_kfree(g, gr->pes_tpc_mask[0]);
3328 nvgpu_kfree(g, gr->pes_tpc_mask[1]);
3329 nvgpu_kfree(g, gr->sm_to_cluster);
3330 nvgpu_kfree(g, gr->gpc_skip_mask);
3331 nvgpu_kfree(g, gr->map_tiles);
3332 nvgpu_kfree(g, gr->fbp_rop_l2_en_mask);
3333 gr->gpc_tpc_count = NULL;
3334 gr->gpc_zcb_count = NULL;
3335 gr->gpc_ppc_count = NULL;
3336 gr->pes_tpc_count[0] = NULL;
3337 gr->pes_tpc_count[1] = NULL;
3338 gr->pes_tpc_mask[0] = NULL;
3339 gr->pes_tpc_mask[1] = NULL;
3340 gr->gpc_skip_mask = NULL;
3341 gr->map_tiles = NULL;
3342 gr->fbp_rop_l2_en_mask = NULL;
3343
3344 gr->ctx_vars.valid = false;
3345 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.inst.l);
3346 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.data.l);
3347 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.inst.l);
3348 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.data.l);
3349 nvgpu_kfree(g, gr->ctx_vars.sw_bundle_init.l);
3350 nvgpu_kfree(g, gr->ctx_vars.sw_veid_bundle_init.l);
3351 nvgpu_kfree(g, gr->ctx_vars.sw_method_init.l);
3352 nvgpu_kfree(g, gr->ctx_vars.sw_ctx_load.l);
3353 nvgpu_kfree(g, gr->ctx_vars.sw_non_ctx_load.l);
3354 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.sys.l);
3355 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc.l);
3356 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.tpc.l);
3357 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
3358 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.ppc.l);
3359 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_sys.l);
3360 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_gpc.l);
3361 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_tpc.l);
3362 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ppc.l);
3363 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_sys.l);
3364 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp.l);
3365 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_gpc.l);
3366 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp_router.l);
3367 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc_router.l);
3368 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ltc.l);
3369 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_fbpa.l);
3370 nvgpu_kfree(g, gr->ctx_vars.sw_bundle64_init.l);
3371 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_cau.l);
3372
3373 nvgpu_vfree(g, gr->ctx_vars.local_golden_image);
3374 gr->ctx_vars.local_golden_image = NULL;
3375
3376 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) {
3377 nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map);
3378 }
3379 gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL;
3380
3381 gk20a_comptag_allocator_destroy(g, &gr->comp_tags);
3382
3383 nvgpu_ecc_remove_support(g);
3384}
3385
3386static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
3387{
3388 u32 gpc_index, pes_index;
3389 u32 pes_tpc_mask;
3390 u32 pes_tpc_count;
3391 u32 pes_heavy_index;
3392 u32 gpc_new_skip_mask;
3393 u32 tmp;
3394 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
3395 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
3396
3397 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
3398 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
3399
3400 tmp = gk20a_readl(g, top_num_gpcs_r());
3401 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
3402
3403 tmp = gk20a_readl(g, top_num_fbps_r());
3404 gr->max_fbps_count = top_num_fbps_value_v(tmp);
3405
3406 gr->fbp_en_mask = g->ops.gr.get_fbp_en_mask(g);
3407
3408 if (gr->fbp_rop_l2_en_mask == NULL) {
3409 gr->fbp_rop_l2_en_mask =
3410 nvgpu_kzalloc(g, gr->max_fbps_count * sizeof(u32));
3411 if (gr->fbp_rop_l2_en_mask == NULL) {
3412 goto clean_up;
3413 }
3414 } else {
3415 memset(gr->fbp_rop_l2_en_mask, 0, gr->max_fbps_count *
3416 sizeof(u32));
3417 }
3418
3419 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
3420 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
3421
3422 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
3423
3424 tmp = gk20a_readl(g, top_num_fbps_r());
3425 gr->sys_count = top_num_fbps_value_v(tmp);
3426
3427 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
3428 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
3429
3430 gr->pe_count_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
3431 if (WARN(gr->pe_count_per_gpc > GK20A_GR_MAX_PES_PER_GPC,
3432 "too many pes per gpc\n")) {
3433 goto clean_up;
3434 }
3435
3436 gr->max_zcull_per_gpc_count = nvgpu_get_litter_value(g, GPU_LIT_NUM_ZCULL_BANKS);
3437
3438 if (gr->gpc_count == 0U) {
3439 nvgpu_err(g, "gpc_count==0!");
3440 goto clean_up;
3441 }
3442
3443 if (gr->gpc_tpc_count == NULL) {
3444 gr->gpc_tpc_count = nvgpu_kzalloc(g, gr->gpc_count *
3445 sizeof(u32));
3446 } else {
3447 memset(gr->gpc_tpc_count, 0, gr->gpc_count *
3448 sizeof(u32));
3449 }
3450
3451 if (gr->gpc_tpc_mask == NULL) {
3452 gr->gpc_tpc_mask = nvgpu_kzalloc(g, gr->max_gpc_count *
3453 sizeof(u32));
3454 } else {
3455 memset(gr->gpc_tpc_mask, 0, gr->max_gpc_count *
3456 sizeof(u32));
3457 }
3458
3459 if (gr->gpc_zcb_count == NULL) {
3460 gr->gpc_zcb_count = nvgpu_kzalloc(g, gr->gpc_count *
3461 sizeof(u32));
3462 } else {
3463 memset(gr->gpc_zcb_count, 0, gr->gpc_count *
3464 sizeof(u32));
3465 }
3466
3467 if (gr->gpc_ppc_count == NULL) {
3468 gr->gpc_ppc_count = nvgpu_kzalloc(g, gr->gpc_count *
3469 sizeof(u32));
3470 } else {
3471 memset(gr->gpc_ppc_count, 0, gr->gpc_count *
3472 sizeof(u32));
3473 }
3474
3475 if (gr->gpc_skip_mask == NULL) {
3476 gr->gpc_skip_mask =
3477 nvgpu_kzalloc(g, gr_pd_dist_skip_table__size_1_v() *
3478 4 * sizeof(u32));
3479 } else {
3480 memset(gr->gpc_skip_mask, 0, gr_pd_dist_skip_table__size_1_v() *
3481 4 * sizeof(u32));
3482 }
3483
3484 if ((gr->gpc_tpc_count == NULL) || (gr->gpc_tpc_mask == NULL) ||
3485 (gr->gpc_zcb_count == NULL) || (gr->gpc_ppc_count == NULL) ||
3486 (gr->gpc_skip_mask == NULL)) {
3487 goto clean_up;
3488 }
3489
3490 for (gpc_index = 0; gpc_index < gr->max_gpc_count; gpc_index++) {
3491 if (g->ops.gr.get_gpc_tpc_mask) {
3492 gr->gpc_tpc_mask[gpc_index] =
3493 g->ops.gr.get_gpc_tpc_mask(g, gpc_index);
3494 }
3495 }
3496
3497 gr->ppc_count = 0;
3498 gr->tpc_count = 0;
3499 gr->zcb_count = 0;
3500 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3501 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r() +
3502 gpc_stride * gpc_index);
3503
3504 gr->gpc_tpc_count[gpc_index] =
3505 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3506 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3507
3508 gr->gpc_zcb_count[gpc_index] =
3509 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3510 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3511
3512 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3513 if (gr->pes_tpc_count[pes_index] == NULL) {
3514 gr->pes_tpc_count[pes_index] =
3515 nvgpu_kzalloc(g, gr->gpc_count *
3516 sizeof(u32));
3517 gr->pes_tpc_mask[pes_index] =
3518 nvgpu_kzalloc(g, gr->gpc_count *
3519 sizeof(u32));
3520 if ((gr->pes_tpc_count[pes_index] == NULL) ||
3521 (gr->pes_tpc_mask[pes_index] == NULL)) {
3522 goto clean_up;
3523 }
3524 }
3525
3526 tmp = gk20a_readl(g,
3527 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3528 gpc_index * gpc_stride);
3529
3530 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3531 pes_tpc_count = count_bits(pes_tpc_mask);
3532
3533 /* detect PES presence by seeing if there are
3534 * TPCs connected to it.
3535 */
3536 if (pes_tpc_count != 0) {
3537 gr->gpc_ppc_count[gpc_index]++;
3538 }
3539
3540 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3541 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3542 }
3543
3544 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3545
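		/* If the two PESs of this GPC carry an unbalanced TPC load
		 * (5 TPCs total, or 4 total split unevenly), skip one TPC from
		 * the heavier PES: mask ^ (mask & (mask - 1)) isolates the
		 * lowest set bit of that PES's TPC mask. */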
3546 gpc_new_skip_mask = 0;
3547 if (gr->pe_count_per_gpc > 1 &&
3548 gr->pes_tpc_count[0][gpc_index] +
3549 gr->pes_tpc_count[1][gpc_index] == 5) {
3550 pes_heavy_index =
3551 gr->pes_tpc_count[0][gpc_index] >
3552 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3553
3554 gpc_new_skip_mask =
3555 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3556 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3557 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3558
3559 } else if (gr->pe_count_per_gpc > 1 &&
3560 (gr->pes_tpc_count[0][gpc_index] +
3561 gr->pes_tpc_count[1][gpc_index] == 4) &&
3562 (gr->pes_tpc_count[0][gpc_index] !=
3563 gr->pes_tpc_count[1][gpc_index])) {
3564 pes_heavy_index =
3565 gr->pes_tpc_count[0][gpc_index] >
3566 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3567
3568 gpc_new_skip_mask =
3569 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3570 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3571 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3572 }
3573 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3574 }
3575
3576 /* allocate for max tpc per gpc */
3577 if (gr->sm_to_cluster == NULL) {
3578 gr->sm_to_cluster = nvgpu_kzalloc(g, gr->gpc_count *
3579 gr->max_tpc_per_gpc_count *
3580 sm_per_tpc * sizeof(struct sm_info));
3581		if (gr->sm_to_cluster == NULL) {
3582			goto clean_up;
		}
3583 } else {
3584 memset(gr->sm_to_cluster, 0, gr->gpc_count *
3585 gr->max_tpc_per_gpc_count *
3586 sm_per_tpc * sizeof(struct sm_info));
3587 }
3588 gr->no_of_sm = 0;
3589
3590 nvgpu_log_info(g, "fbps: %d", gr->num_fbps);
3591 nvgpu_log_info(g, "max_gpc_count: %d", gr->max_gpc_count);
3592 nvgpu_log_info(g, "max_fbps_count: %d", gr->max_fbps_count);
3593 nvgpu_log_info(g, "max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3594 nvgpu_log_info(g, "max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3595 nvgpu_log_info(g, "max_tpc_count: %d", gr->max_tpc_count);
3596 nvgpu_log_info(g, "sys_count: %d", gr->sys_count);
3597 nvgpu_log_info(g, "gpc_count: %d", gr->gpc_count);
3598 nvgpu_log_info(g, "pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3599 nvgpu_log_info(g, "tpc_count: %d", gr->tpc_count);
3600 nvgpu_log_info(g, "ppc_count: %d", gr->ppc_count);
3601
3602 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3603 nvgpu_log_info(g, "gpc_tpc_count[%d] : %d",
3604 gpc_index, gr->gpc_tpc_count[gpc_index]);
3605 }
3606 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3607 nvgpu_log_info(g, "gpc_zcb_count[%d] : %d",
3608 gpc_index, gr->gpc_zcb_count[gpc_index]);
3609 }
3610 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3611 nvgpu_log_info(g, "gpc_ppc_count[%d] : %d",
3612 gpc_index, gr->gpc_ppc_count[gpc_index]);
3613 }
3614 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3615 nvgpu_log_info(g, "gpc_skip_mask[%d] : %d",
3616 gpc_index, gr->gpc_skip_mask[gpc_index]);
3617 }
3618 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3619 for (pes_index = 0;
3620 pes_index < gr->pe_count_per_gpc;
3621 pes_index++) {
3622 nvgpu_log_info(g, "pes_tpc_count[%d][%d] : %d",
3623 pes_index, gpc_index,
3624 gr->pes_tpc_count[pes_index][gpc_index]);
3625 }
3626 }
3627
3628 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3629 for (pes_index = 0;
3630 pes_index < gr->pe_count_per_gpc;
3631 pes_index++) {
3632 nvgpu_log_info(g, "pes_tpc_mask[%d][%d] : %d",
3633 pes_index, gpc_index,
3634 gr->pes_tpc_mask[pes_index][gpc_index]);
3635 }
3636 }
3637
3638 g->ops.gr.bundle_cb_defaults(g);
3639 g->ops.gr.cb_size_default(g);
3640 g->ops.gr.calc_global_ctx_buffer_size(g);
3641 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3642
3643 nvgpu_log_info(g, "bundle_cb_default_size: %d",
3644 gr->bundle_cb_default_size);
3645 nvgpu_log_info(g, "min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3646 nvgpu_log_info(g, "bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3647 nvgpu_log_info(g, "attrib_cb_default_size: %d",
3648 gr->attrib_cb_default_size);
3649 nvgpu_log_info(g, "attrib_cb_size: %d", gr->attrib_cb_size);
3650 nvgpu_log_info(g, "alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3651 nvgpu_log_info(g, "alpha_cb_size: %d", gr->alpha_cb_size);
3652 nvgpu_log_info(g, "timeslice_mode: %d", gr->timeslice_mode);
3653
3654 return 0;
3655
3656clean_up:
3657 return -ENOMEM;
3658}
3659
3660static u32 prime_set[18] = {
3661 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3662
3663static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3664{
3665 s32 comm_denom;
3666 s32 mul_factor;
3667 s32 *init_frac = NULL;
3668 s32 *init_err = NULL;
3669 s32 *run_err = NULL;
3670 s32 *sorted_num_tpcs = NULL;
3671 s32 *sorted_to_unsorted_gpc_map = NULL;
3672 u32 gpc_index;
3673 u32 gpc_mark = 0;
3674 u32 num_tpc;
3675 u32 max_tpc_count = 0;
3676 u32 swap;
3677 u32 tile_count;
3678 u32 index;
3679 bool delete_map = false;
3680 bool gpc_sorted;
3681 int ret = 0;
3682 int num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
3683 int num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC);
3684 int map_tile_count = num_gpcs * num_tpc_per_gpc;
3685
3686 init_frac = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3687 init_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3688 run_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3689 sorted_num_tpcs =
3690 nvgpu_kzalloc(g, num_gpcs * num_tpc_per_gpc * sizeof(s32));
3691 sorted_to_unsorted_gpc_map =
3692 nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3693
3694 if (!((init_frac != NULL) &&
3695 (init_err != NULL) &&
3696 (run_err != NULL) &&
3697 (sorted_num_tpcs != NULL) &&
3698 (sorted_to_unsorted_gpc_map != NULL))) {
3699 ret = -ENOMEM;
3700 goto clean_up;
3701 }
3702
3703 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3704
3705 if (gr->tpc_count == 3) {
3706 gr->map_row_offset = 2;
3707 } else if (gr->tpc_count < 3) {
3708 gr->map_row_offset = 1;
3709 } else {
3710 gr->map_row_offset = 3;
3711
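		/*
		 * Scan for the smallest prime (3, 5, 7, ...) that does not
		 * divide tpc_count, presumably so the row offset stays
		 * co-prime with the number of TPCs.  E.g. tpc_count = 8:
		 * 8 % 3 != 0, so map_row_offset = 3.
		 */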
3712 for (index = 1; index < 18; index++) {
3713 u32 prime = prime_set[index];
3714 if ((gr->tpc_count % prime) != 0) {
3715 gr->map_row_offset = prime;
3716 break;
3717 }
3718 }
3719 }
3720
3721 switch (gr->tpc_count) {
3722 case 15:
3723 gr->map_row_offset = 6;
3724 break;
3725 case 14:
3726 gr->map_row_offset = 5;
3727 break;
3728 case 13:
3729 gr->map_row_offset = 2;
3730 break;
3731 case 11:
3732 gr->map_row_offset = 7;
3733 break;
3734 case 10:
3735 gr->map_row_offset = 6;
3736 break;
3737 case 7:
3738 case 5:
3739 gr->map_row_offset = 1;
3740 break;
3741 default:
3742 break;
3743 }
3744
3745 if (gr->map_tiles) {
3746 if (gr->map_tile_count != gr->tpc_count) {
3747 delete_map = true;
3748 }
3749
3750 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3751 if (gr_gk20a_get_map_tile_count(gr, tile_count)
3752 >= gr->tpc_count) {
3753 delete_map = true;
3754 }
3755 }
3756
3757 if (delete_map) {
3758 nvgpu_kfree(g, gr->map_tiles);
3759 gr->map_tiles = NULL;
3760 gr->map_tile_count = 0;
3761 }
3762 }
3763
3764 if (gr->map_tiles == NULL) {
3765 gr->map_tiles = nvgpu_kzalloc(g, map_tile_count * sizeof(u8));
3766 if (gr->map_tiles == NULL) {
3767 ret = -ENOMEM;
3768 goto clean_up;
3769 }
3770 gr->map_tile_count = map_tile_count;
3771
3772 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3773 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3774 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3775 }
3776
3777 gpc_sorted = false;
3778 while (!gpc_sorted) {
3779 gpc_sorted = true;
3780 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3781 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3782 gpc_sorted = false;
3783 swap = sorted_num_tpcs[gpc_index];
3784 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3785 sorted_num_tpcs[gpc_index + 1] = swap;
3786 swap = sorted_to_unsorted_gpc_map[gpc_index];
3787 sorted_to_unsorted_gpc_map[gpc_index] =
3788 sorted_to_unsorted_gpc_map[gpc_index + 1];
3789 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3790 }
3791 }
3792 }
3793
3794 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3795 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count) {
3796 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3797 }
3798 }
3799
3800 mul_factor = gr->gpc_count * max_tpc_count;
3801 if (mul_factor & 0x1) {
3802 mul_factor = 2;
3803 } else {
3804 mul_factor = 1;
3805 }
3806
3807 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3808
3809 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3810 num_tpc = sorted_num_tpcs[gpc_index];
3811
3812 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3813
3814 if (num_tpc != 0) {
3815 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3816 } else {
3817 init_err[gpc_index] = 0;
3818 }
3819
3820 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3821 }
3822
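		/*
		 * Distribute GPC indices across the screen tiles with a
		 * Bresenham-style error accumulator so each GPC shows up in
		 * proportion to its TPC count (mul_factor is bumped to 2 when
		 * gpc_count * max_tpc_count is odd so comm_denom / 2 stays
		 * exact).  Worked example: two GPCs with 2 and 1 TPCs give
		 * comm_denom = 4, init_frac = {4, 2}, and the loop below
		 * produces map_tiles = {0, 1, 0}.
		 */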
3823 while (gpc_mark < gr->tpc_count) {
3824 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3825 if ((run_err[gpc_index] * 2) >= comm_denom) {
3826 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3827 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3828 } else {
3829 run_err[gpc_index] += init_frac[gpc_index];
3830 }
3831 }
3832 }
3833 }
3834
3835clean_up:
3836 nvgpu_kfree(g, init_frac);
3837 nvgpu_kfree(g, init_err);
3838 nvgpu_kfree(g, run_err);
3839 nvgpu_kfree(g, sorted_num_tpcs);
3840 nvgpu_kfree(g, sorted_to_unsorted_gpc_map);
3841
3842 if (ret) {
3843 nvgpu_err(g, "fail");
3844 } else {
3845 nvgpu_log_fn(g, "done");
3846 }
3847
3848 return ret;
3849}
3850
3851static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3852{
3853 struct gr_zcull_gk20a *zcull = &gr->zcull;
3854
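	/*
	 * An aliquot is the basic screen region zcull tracks: 16 pixels tall
	 * and tpc_count * 16 pixels wide, e.g. 32x16 pixels with 2 TPCs.
	 */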
3855 zcull->aliquot_width = gr->tpc_count * 16;
3856 zcull->aliquot_height = 16;
3857
3858 zcull->width_align_pixels = gr->tpc_count * 16;
3859 zcull->height_align_pixels = 32;
3860
3861 zcull->aliquot_size =
3862 zcull->aliquot_width * zcull->aliquot_height;
3863
3864 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3865 zcull->pixel_squares_by_aliquots =
3866 gr->zcb_count * 16 * 16 * gr->tpc_count /
3867 (gr->gpc_count * gr->gpc_tpc_count[0]);
3868
3869 zcull->total_aliquots =
3870 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3871 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3872
3873 return 0;
3874}
3875
3876u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3877{
3878 /* assuming gr has already been initialized */
3879 return gr->ctx_vars.zcull_ctxsw_image_size;
3880}
3881
3882int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3883 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3884{
3885 struct tsg_gk20a *tsg;
3886 struct zcull_ctx_desc *zcull_ctx;
3887
3888 tsg = tsg_gk20a_from_ch(c);
3889 if (tsg == NULL) {
3890 return -EINVAL;
3891 }
3892
3893 zcull_ctx = &tsg->gr_ctx.zcull_ctx;
3894 zcull_ctx->ctx_sw_mode = mode;
3895 zcull_ctx->gpu_va = zcull_va;
3896
3897 /* TBD: don't disable channel in sw method processing */
3898 return gr_gk20a_ctx_zcull_setup(g, c);
3899}
3900
3901int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3902 struct gr_zcull_info *zcull_params)
3903{
3904 struct gr_zcull_gk20a *zcull = &gr->zcull;
3905
3906 zcull_params->width_align_pixels = zcull->width_align_pixels;
3907 zcull_params->height_align_pixels = zcull->height_align_pixels;
3908 zcull_params->pixel_squares_by_aliquots =
3909 zcull->pixel_squares_by_aliquots;
3910 zcull_params->aliquot_total = zcull->total_aliquots;
3911
3912 zcull_params->region_byte_multiplier =
3913 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3914 zcull_params->region_header_size =
3915 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3916 gr_zcull_save_restore_header_bytes_per_gpc_v();
3917
3918 zcull_params->subregion_header_size =
3919 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3920 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3921
3922 zcull_params->subregion_width_align_pixels =
3923 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3924 zcull_params->subregion_height_align_pixels =
3925 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3926 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3927
3928 return 0;
3929}
3930
3931int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3932 struct zbc_entry *color_val, u32 index)
3933{
3934 u32 i;
3935
3936 /* update l2 table */
3937 g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3938
3939 /* update ds table */
3940 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3941 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3942 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3943 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3944 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3945 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3946 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3947 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3948
3949 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3950 gr_ds_zbc_color_fmt_val_f(color_val->format));
3951
3952 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3953 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3954
3955 /* trigger the write */
3956 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3957 gr_ds_zbc_tbl_ld_select_c_f() |
3958 gr_ds_zbc_tbl_ld_action_write_f() |
3959 gr_ds_zbc_tbl_ld_trigger_active_f());
3960
3961 /* update local copy */
3962 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3963 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3964 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3965 }
3966 gr->zbc_col_tbl[index].format = color_val->format;
3967 gr->zbc_col_tbl[index].ref_cnt++;
3968
3969 return 0;
3970}
3971
3972int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3973 struct zbc_entry *depth_val, u32 index)
3974{
3975 /* update l2 table */
3976 g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3977
3978 /* update ds table */
3979 gk20a_writel(g, gr_ds_zbc_z_r(),
3980 gr_ds_zbc_z_val_f(depth_val->depth));
3981
3982 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3983 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3984
3985 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3986 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3987
3988 /* trigger the write */
3989 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3990 gr_ds_zbc_tbl_ld_select_z_f() |
3991 gr_ds_zbc_tbl_ld_action_write_f() |
3992 gr_ds_zbc_tbl_ld_trigger_active_f());
3993
3994 /* update local copy */
3995 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3996 gr->zbc_dep_tbl[index].format = depth_val->format;
3997 gr->zbc_dep_tbl[index].ref_cnt++;
3998
3999 return 0;
4000}
4001
4002void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
4003{
4004 struct fifo_gk20a *f = &g->fifo;
4005 struct fifo_engine_info_gk20a *gr_info = NULL;
4006 u32 ret;
4007 u32 engine_id;
4008
4009 engine_id = gk20a_fifo_get_gr_engine_id(g);
4010 gr_info = (f->engine_info + engine_id);
4011
4012 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
4013 if (ret) {
4014 nvgpu_err(g,
4015 "failed to disable gr engine activity");
4016 return;
4017 }
4018
4019 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
4020 GR_IDLE_CHECK_DEFAULT);
4021 if (ret) {
4022 nvgpu_err(g,
4023 "failed to idle graphics");
4024 goto clean_up;
4025 }
4026
4027 /* update zbc */
4028 g->ops.gr.pmu_save_zbc(g, entries);
4029
4030clean_up:
4031 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
4032 if (ret) {
4033 nvgpu_err(g,
4034 "failed to enable gr engine activity");
4035 }
4036}
4037
4038int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
4039 struct zbc_entry *zbc_val)
4040{
4041 struct zbc_color_table *c_tbl;
4042 struct zbc_depth_table *d_tbl;
4043 u32 i;
4044 int ret = -ENOSPC;
4045 bool added = false;
4046 u32 entries;
4047
4048 /* no endian swap ? */
4049
4050 nvgpu_mutex_acquire(&gr->zbc_lock);
4051 nvgpu_speculation_barrier();
4052 switch (zbc_val->type) {
4053 case GK20A_ZBC_TYPE_COLOR:
4054 /* search existing tables */
4055 for (i = 0; i < gr->max_used_color_index; i++) {
4056
4057 c_tbl = &gr->zbc_col_tbl[i];
4058
4059 if ((c_tbl->ref_cnt != 0U) &&
4060 (c_tbl->format == zbc_val->format) &&
4061 (memcmp(c_tbl->color_ds, zbc_val->color_ds,
4062 sizeof(zbc_val->color_ds)) == 0) &&
4063 (memcmp(c_tbl->color_l2, zbc_val->color_l2,
4064 sizeof(zbc_val->color_l2)) == 0)) {
4065
4066 added = true;
4067 c_tbl->ref_cnt++;
4068 ret = 0;
4069 break;
4070 }
4071 }
4072 /* add new table */
4073 if (!added &&
4074 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
4075
4076 c_tbl =
4077 &gr->zbc_col_tbl[gr->max_used_color_index];
4078 WARN_ON(c_tbl->ref_cnt != 0);
4079
4080 ret = g->ops.gr.add_zbc_color(g, gr,
4081 zbc_val, gr->max_used_color_index);
4082
4083 if (ret == 0) {
4084 gr->max_used_color_index++;
4085 }
4086 }
4087 break;
4088 case GK20A_ZBC_TYPE_DEPTH:
4089 /* search existing tables */
4090 for (i = 0; i < gr->max_used_depth_index; i++) {
4091
4092 d_tbl = &gr->zbc_dep_tbl[i];
4093
4094 if ((d_tbl->ref_cnt != 0U) &&
4095 (d_tbl->depth == zbc_val->depth) &&
4096 (d_tbl->format == zbc_val->format)) {
4097 added = true;
4098 d_tbl->ref_cnt++;
4099 ret = 0;
4100 break;
4101 }
4102 }
4103 /* add new table */
4104 if (!added &&
4105 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
4106
4107 d_tbl =
4108 &gr->zbc_dep_tbl[gr->max_used_depth_index];
4109 WARN_ON(d_tbl->ref_cnt != 0);
4110
4111 ret = g->ops.gr.add_zbc_depth(g, gr,
4112 zbc_val, gr->max_used_depth_index);
4113
4114 if (ret == 0) {
4115 gr->max_used_depth_index++;
4116 }
4117 }
4118 break;
4119 case T19X_ZBC:
4120 if (g->ops.gr.add_zbc_type_s) {
4121 added = g->ops.gr.add_zbc_type_s(g, gr, zbc_val, &ret);
4122 } else {
4123 nvgpu_err(g,
4124 "invalid zbc table type %d", zbc_val->type);
4125 ret = -EINVAL;
4126 goto err_mutex;
4127 }
4128 break;
4129 default:
4130 nvgpu_err(g,
4131 "invalid zbc table type %d", zbc_val->type);
4132 ret = -EINVAL;
4133 goto err_mutex;
4134 }
4135
4136 if (!added && ret == 0) {
4137 /* update zbc for elpg only when new entry is added */
4138 entries = max(gr->max_used_color_index,
4139 gr->max_used_depth_index);
4140 g->ops.gr.pmu_save_zbc(g, entries);
4141 }
4142
4143err_mutex:
4144 nvgpu_mutex_release(&gr->zbc_lock);
4145 return ret;
4146}
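
/*
 * Illustrative sketch (not a call site in this driver): adding an opaque-red
 * A8B8G8R8 clear value, assuming color_ds[] is laid out R,G,B,A as the
 * default tables loaded below suggest.
 *
 *	struct zbc_entry red = { 0 };
 *
 *	red.type = GK20A_ZBC_TYPE_COLOR;
 *	red.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v();
 *	red.color_ds[0] = 0x3f800000;	(R = 1.0f)
 *	red.color_ds[3] = 0x3f800000;	(A = 1.0f)
 *	red.color_l2[0] = 0xff0000ff;	(packed A8B8G8R8)
 *	err = gr_gk20a_add_zbc(g, gr, &red);
 */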
4147
4148/* get a zbc table entry specified by index
4149 * return table size when type is invalid */
4150int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
4151 struct zbc_query_params *query_params)
4152{
4153 u32 index = query_params->index_size;
4154 u32 i;
4155
4156 nvgpu_speculation_barrier();
4157 switch (query_params->type) {
4158 case GK20A_ZBC_TYPE_INVALID:
4159 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
4160 break;
4161 case GK20A_ZBC_TYPE_COLOR:
4162 if (index >= GK20A_ZBC_TABLE_SIZE) {
4163 nvgpu_err(g,
4164 "invalid zbc color table index");
4165 return -EINVAL;
4166 }
4167
4168 nvgpu_speculation_barrier();
4169 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4170 query_params->color_l2[i] =
4171 gr->zbc_col_tbl[index].color_l2[i];
4172 query_params->color_ds[i] =
4173 gr->zbc_col_tbl[index].color_ds[i];
4174 }
4175 query_params->format = gr->zbc_col_tbl[index].format;
4176 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
4177 break;
4178 case GK20A_ZBC_TYPE_DEPTH:
4179 if (index >= GK20A_ZBC_TABLE_SIZE) {
4180 nvgpu_err(g,
4181 "invalid zbc depth table index");
4182 return -EINVAL;
4183 }
4184
4185 nvgpu_speculation_barrier();
4186 query_params->depth = gr->zbc_dep_tbl[index].depth;
4187 query_params->format = gr->zbc_dep_tbl[index].format;
4188 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
4189 break;
4190 case T19X_ZBC:
4191 if (g->ops.gr.zbc_s_query_table) {
4192 return g->ops.gr.zbc_s_query_table(g, gr,
4193 query_params);
4194 } else {
4195 nvgpu_err(g,
4196 "invalid zbc table type");
4197 return -EINVAL;
4198 }
4199 break;
4200 default:
4201 nvgpu_err(g,
4202 "invalid zbc table type");
4203 return -EINVAL;
4204 }
4205
4206 return 0;
4207}
4208
4209static int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
4210{
4211 unsigned int i;
4212 int ret;
4213
4214 for (i = 0; i < gr->max_used_color_index; i++) {
4215 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
4216 struct zbc_entry zbc_val;
4217
4218 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
4219 memcpy(zbc_val.color_ds,
4220 c_tbl->color_ds, sizeof(zbc_val.color_ds));
4221 memcpy(zbc_val.color_l2,
4222 c_tbl->color_l2, sizeof(zbc_val.color_l2));
4223 zbc_val.format = c_tbl->format;
4224
4225 ret = g->ops.gr.add_zbc_color(g, gr, &zbc_val, i);
4226
4227 if (ret) {
4228 return ret;
4229 }
4230 }
4231 for (i = 0; i < gr->max_used_depth_index; i++) {
4232 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
4233 struct zbc_entry zbc_val;
4234
4235 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4236 zbc_val.depth = d_tbl->depth;
4237 zbc_val.format = d_tbl->format;
4238
4239 ret = g->ops.gr.add_zbc_depth(g, gr, &zbc_val, i);
4240 if (ret) {
4241 return ret;
4242 }
4243 }
4244
4245 if (g->ops.gr.load_zbc_s_tbl) {
4246 ret = g->ops.gr.load_zbc_s_tbl(g, gr);
4247 if (ret) {
4248 return ret;
4249 }
4250 }
4251
4252 return 0;
4253}
4254
4255int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
4256{
4257 struct zbc_entry zbc_val;
4258 u32 i = 0;
4259 int err = 0;
4260
4261 err = nvgpu_mutex_init(&gr->zbc_lock);
4262 if (err != 0) {
4263 nvgpu_err(g, "Error in zbc_lock mutex initialization");
4264 return err;
4265 }
4266
4267 /* load default color table */
4268 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
4269
4270 /* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
4271 zbc_val.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v();
4272 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4273 zbc_val.color_ds[i] = 0;
4274 zbc_val.color_l2[i] = 0;
4275 }
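	/* 0xff000000 packs A = 255 into the L2 (A8B8G8R8) word, and
	 * 0x3f800000 is 1.0f in IEEE-754, i.e. A = 1.0 in the float DS table */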
4276 zbc_val.color_l2[0] = 0xff000000;
4277 zbc_val.color_ds[3] = 0x3f800000;
4278 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4279 if (err != 0) {
4280 goto color_fail;
4281 }
4282
4283 /* Transparent black = (fmt 1 = zero) */
4284 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
4285 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4286 zbc_val.color_ds[i] = 0;
4287 zbc_val.color_l2[i] = 0;
4288 }
4289 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4290 if (err != 0) {
4291 goto color_fail;
4292 }
4293
4294 /* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
4295 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
4296 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4297 zbc_val.color_ds[i] = 0x3f800000;
4298 zbc_val.color_l2[i] = 0xffffffff;
4299 }
4300 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4301 if (err != 0) {
4302 goto color_fail;
4303 }
4304
4305 gr->max_default_color_index = 3;
4306
4307 /* load default depth table */
4308 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4309
4310 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4311 zbc_val.depth = 0x3f800000;
4312 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4313 if (err != 0) {
4314 goto depth_fail;
4315 }
4316
4317 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4318 zbc_val.depth = 0;
4319 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4320 if (err != 0) {
4321 goto depth_fail;
4322 }
4323
4324 gr->max_default_depth_index = 2;
4325
4326 if (g->ops.gr.load_zbc_s_default_tbl) {
4327 err = g->ops.gr.load_zbc_s_default_tbl(g, gr);
4328 if (err != 0) {
4329 return err;
4330 }
4331 }
4332
4333 return 0;
4334
4335color_fail:
4336 nvgpu_err(g, "fail to load default zbc color table");
4337 return err;
4338depth_fail:
4339 nvgpu_err(g, "fail to load default zbc depth table");
4340 return err;
4341}
4342
4343int _gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4344 struct zbc_entry *zbc_val)
4345{
4346 struct fifo_gk20a *f = &g->fifo;
4347 struct fifo_engine_info_gk20a *gr_info = NULL;
4348 int ret;
4349 u32 engine_id;
4350
4351 engine_id = gk20a_fifo_get_gr_engine_id(g);
4352 gr_info = (f->engine_info + engine_id);
4353
4354 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
4355 if (ret) {
4356 nvgpu_err(g,
4357 "failed to disable gr engine activity");
4358 return ret;
4359 }
4360
4361 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
4362 GR_IDLE_CHECK_DEFAULT);
4363 if (ret) {
4364 nvgpu_err(g,
4365 "failed to idle graphics");
4366 goto clean_up;
4367 }
4368
4369 ret = gr_gk20a_add_zbc(g, gr, zbc_val);
4370
4371clean_up:
4372 if (gk20a_fifo_enable_engine_activity(g, gr_info)) {
4373 nvgpu_err(g,
4374 "failed to enable gr engine activity");
4375 }
4376
4377 return ret;
4378}
4379
4380int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4381 struct zbc_entry *zbc_val)
4382{
4383 nvgpu_log_fn(g, " ");
4384
4385 return gr_gk20a_elpg_protected_call(g,
4386 gr_gk20a_add_zbc(g, gr, zbc_val));
4387}
4388
4389void gr_gk20a_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
4390 u32 *zcull_map_tiles)
4391{
4392 u32 val;
4393
4394 nvgpu_log_fn(g, " ");
4395
4396 if (zcull_num_entries >= 8) {
4397 nvgpu_log_fn(g, "map0");
4398 val =
4399 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(
4400 zcull_map_tiles[0]) |
4401 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(
4402 zcull_map_tiles[1]) |
4403 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(
4404 zcull_map_tiles[2]) |
4405 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(
4406 zcull_map_tiles[3]) |
4407 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(
4408 zcull_map_tiles[4]) |
4409 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(
4410 zcull_map_tiles[5]) |
4411 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(
4412 zcull_map_tiles[6]) |
4413 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(
4414 zcull_map_tiles[7]);
4415
4416 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(), val);
4417 }
4418
4419 if (zcull_num_entries >= 16) {
4420 nvgpu_log_fn(g, "map1");
4421 val =
4422 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(
4423 zcull_map_tiles[8]) |
4424 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(
4425 zcull_map_tiles[9]) |
4426 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(
4427 zcull_map_tiles[10]) |
4428 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(
4429 zcull_map_tiles[11]) |
4430 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(
4431 zcull_map_tiles[12]) |
4432 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(
4433 zcull_map_tiles[13]) |
4434 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(
4435 zcull_map_tiles[14]) |
4436 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(
4437 zcull_map_tiles[15]);
4438
4439 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(), val);
4440 }
4441
4442 if (zcull_num_entries >= 24) {
4443 nvgpu_log_fn(g, "map2");
4444 val =
4445 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(
4446 zcull_map_tiles[16]) |
4447 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(
4448 zcull_map_tiles[17]) |
4449 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(
4450 zcull_map_tiles[18]) |
4451 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(
4452 zcull_map_tiles[19]) |
4453 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(
4454 zcull_map_tiles[20]) |
4455 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(
4456 zcull_map_tiles[21]) |
4457 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(
4458 zcull_map_tiles[22]) |
4459 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(
4460 zcull_map_tiles[23]);
4461
4462 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(), val);
4463 }
4464
4465 if (zcull_num_entries >= 32) {
4466 nvgpu_log_fn(g, "map3");
4467 val =
4468 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(
4469 zcull_map_tiles[24]) |
4470 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(
4471 zcull_map_tiles[25]) |
4472 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(
4473 zcull_map_tiles[26]) |
4474 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(
4475 zcull_map_tiles[27]) |
4476 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(
4477 zcull_map_tiles[28]) |
4478 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(
4479 zcull_map_tiles[29]) |
4480 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(
4481 zcull_map_tiles[30]) |
4482 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(
4483 zcull_map_tiles[31]);
4484
4485 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(), val);
4486 }
4487
4488}
4489
4490static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
4491{
4492 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
4493 u32 *zcull_map_tiles, *zcull_bank_counters;
4494 u32 map_counter;
4495 u32 rcp_conserv;
4496 u32 offset;
4497 bool floorsweep = false;
4498 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
4499 u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
4500 u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
4501 GPU_LIT_NUM_TPC_PER_GPC);
4502 u32 zcull_alloc_num = num_gpcs * num_tpc_per_gpc;
4503 u32 map_tile_count;
4504
4505 if (gr->map_tiles == NULL) {
4506 return -1;
4507 }
4508
4509 if (zcull_alloc_num % 8 != 0) {
4510 /* Total 8 fields per map reg i.e. tile_0 to tile_7*/
4511		zcull_alloc_num += 8 - (zcull_alloc_num % 8);
4512 }
4513 zcull_map_tiles = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4514
4515 if (zcull_map_tiles == NULL) {
4516 nvgpu_err(g,
4517			"failed to allocate zcull map tiles");
4518 return -ENOMEM;
4519 }
4520
4521 zcull_bank_counters = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4522
4523 if (zcull_bank_counters == NULL) {
4524 nvgpu_err(g,
4525 "failed to allocate zcull bank counters");
4526 nvgpu_kfree(g, zcull_map_tiles);
4527 return -ENOMEM;
4528 }
4529
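	/*
	 * zcull_bank_counters[t] counts how many TPCs have been mapped to
	 * tile t so far, so each TPC is handed the next free bank slot within
	 * its tile.  E.g. with map_tiles = {0, 1, 0} the three TPCs get bank
	 * slots 0, 0 and 1 respectively.
	 */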
4530 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
4531 map_tile_count = gr_gk20a_get_map_tile_count(gr, map_counter);
4532 zcull_map_tiles[map_counter] =
4533 zcull_bank_counters[map_tile_count];
4534 zcull_bank_counters[map_tile_count]++;
4535 }
4536
4537 if (g->ops.gr.program_zcull_mapping != NULL) {
4538 g->ops.gr.program_zcull_mapping(g, zcull_alloc_num,
4539 zcull_map_tiles);
4540 }
4541
4542 nvgpu_kfree(g, zcull_map_tiles);
4543 nvgpu_kfree(g, zcull_bank_counters);
4544
4545 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4546 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
4547 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
4548
4549 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4550 gpc_zcull_count < gpc_tpc_count) {
4551 nvgpu_err(g,
4552 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
4553 gpc_zcull_count, gpc_tpc_count, gpc_index);
4554 return -EINVAL;
4555 }
4556 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4557 gpc_zcull_count != 0) {
4558 floorsweep = true;
4559 }
4560 }
4561
4562 /* ceil(1.0f / SM_NUM * gr_gpc0_zcull_sm_num_rcp_conservative__max_v()) */
4563 rcp_conserv = DIV_ROUND_UP(gr_gpc0_zcull_sm_num_rcp_conservative__max_v(),
4564 gr->gpc_tpc_count[0]);
4565
4566 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4567 offset = gpc_index * gpc_stride;
4568
4569 if (floorsweep) {
4570 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4571 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4572 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4573 gr->max_zcull_per_gpc_count));
4574 } else {
4575 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4576 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4577 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4578 gr->gpc_tpc_count[gpc_index]));
4579 }
4580
4581 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4582 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4583 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4584
4585 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4586 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4587 }
4588
4589 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4590 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4591
4592 return 0;
4593}
4594
4595void gk20a_gr_enable_exceptions(struct gk20a *g)
4596{
4597 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4598 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4599 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4600 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4601 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4602 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4603}
4604
4605void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4606{
4607 struct gr_gk20a *gr = &g->gr;
4608 u32 tpc_mask;
4609
4610 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(),
4611 gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f() |
4612 gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f());
4613
4614 tpc_mask =
4615 gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->max_tpc_per_gpc_count) - 1);
4616
4617 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
4618}
4619
4620
4621void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4622{
4623 /* enable exceptions */
4624 gk20a_writel(g, gr_fe_hww_esr_r(),
4625 gr_fe_hww_esr_en_enable_f() |
4626 gr_fe_hww_esr_reset_active_f());
4627 gk20a_writel(g, gr_memfmt_hww_esr_r(),
4628 gr_memfmt_hww_esr_en_enable_f() |
4629 gr_memfmt_hww_esr_reset_active_f());
4630}
4631
4632void gr_gk20a_fecs_host_int_enable(struct gk20a *g)
4633{
4634 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4635 gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
4636 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4637 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4638 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4639 gr_fecs_host_int_enable_watchdog_enable_f());
4640}
4641
4642static int gk20a_init_gr_setup_hw(struct gk20a *g)
4643{
4644 struct gr_gk20a *gr = &g->gr;
4645 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4646 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4647 u32 data;
4648 u32 last_method_data = 0;
4649 u32 i, err;
4650
4651 nvgpu_log_fn(g, " ");
4652
4653 if (g->ops.gr.init_gpc_mmu) {
4654 g->ops.gr.init_gpc_mmu(g);
4655 }
4656
4657 /* load gr floorsweeping registers */
4658 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4659 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4660 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4661 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4662
4663 gr_gk20a_zcull_init_hw(g, gr);
4664
4665 if (g->ops.priv_ring.set_ppriv_timeout_settings != NULL) {
4666 g->ops.priv_ring.set_ppriv_timeout_settings(g);
4667 }
4668
4669 /* enable fifo access */
4670 gk20a_writel(g, gr_gpfifo_ctl_r(),
4671 gr_gpfifo_ctl_access_enabled_f() |
4672 gr_gpfifo_ctl_semaphore_access_enabled_f());
4673
4674 /* TBD: reload gr ucode when needed */
4675
4676 /* enable interrupts */
4677 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4678 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4679
4680 /* enable fecs error interrupts */
4681 g->ops.gr.fecs_host_int_enable(g);
4682
4683 g->ops.gr.enable_hww_exceptions(g);
4684 g->ops.gr.set_hww_esr_report_mask(g);
4685
4686 /* enable TPC exceptions per GPC */
4687 if (g->ops.gr.enable_gpc_exceptions) {
4688 g->ops.gr.enable_gpc_exceptions(g);
4689 }
4690
4691 /* enable ECC for L1/SM */
4692 if (g->ops.gr.ecc_init_scrub_reg) {
4693 g->ops.gr.ecc_init_scrub_reg(g);
4694 }
4695
4696 /* TBD: enable per BE exceptions */
4697
4698 /* reset and enable exceptions */
4699 g->ops.gr.enable_exceptions(g);
4700
4701 gr_gk20a_load_zbc_table(g, gr);
4702
4703 if (g->ops.ltc.init_cbc) {
4704 g->ops.ltc.init_cbc(g, gr);
4705 }
4706
4707 if (g->ops.fb.init_cbc) {
4708 g->ops.fb.init_cbc(g, gr);
4709 }
4710
4711 /* load ctx init */
4712 for (i = 0; i < sw_ctx_load->count; i++) {
4713 gk20a_writel(g, sw_ctx_load->l[i].addr,
4714 sw_ctx_load->l[i].value);
4715 }
4716
4717 if (g->ops.gr.disable_rd_coalesce) {
4718 g->ops.gr.disable_rd_coalesce(g);
4719 }
4720
4721 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4722 GR_IDLE_CHECK_DEFAULT);
4723 if (err != 0U) {
4724 goto out;
4725 }
4726
4727 if (g->ops.gr.init_preemption_state) {
4728 err = g->ops.gr.init_preemption_state(g);
4729 if (err != 0U) {
4730 goto out;
4731 }
4732 }
4733
4734 /* disable fe_go_idle */
4735 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4736 gr_fe_go_idle_timeout_count_disabled_f());
4737
4738 /* override a few ctx state registers */
4739 g->ops.gr.commit_global_timeslice(g, NULL);
4740
4741 /* floorsweep anything left */
4742 err = g->ops.gr.init_fs_state(g);
4743 if (err != 0U) {
4744 goto out;
4745 }
4746
4747 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4748 GR_IDLE_CHECK_DEFAULT);
4749 if (err != 0U) {
4750 goto restore_fe_go_idle;
4751 }
4752
4753restore_fe_go_idle:
4754 /* restore fe_go_idle */
4755 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4756 gr_fe_go_idle_timeout_count_prod_f());
4757
4758 if ((err != 0U) || (gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4759 GR_IDLE_CHECK_DEFAULT) != 0)) {
4760 goto out;
4761 }
4762
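	/*
	 * The MME shadow RAM is loaded by writing the data register once and
	 * then triggering a write for each method index; the data register is
	 * only rewritten when the value changes, which is what
	 * last_method_data tracks.
	 */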
4763 /* load method init */
4764 if (sw_method_init->count) {
4765 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4766 sw_method_init->l[0].value);
4767 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4768 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4769 sw_method_init->l[0].addr);
4770 last_method_data = sw_method_init->l[0].value;
4771 }
4772 for (i = 1; i < sw_method_init->count; i++) {
4773 if (sw_method_init->l[i].value != last_method_data) {
4774 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4775 sw_method_init->l[i].value);
4776 last_method_data = sw_method_init->l[i].value;
4777 }
4778 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4779 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4780 sw_method_init->l[i].addr);
4781 }
4782
4783 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4784 GR_IDLE_CHECK_DEFAULT);
4785out:
4786 nvgpu_log_fn(g, "done");
4787 return err;
4788}
4789
4790static int gk20a_init_gr_prepare(struct gk20a *g)
4791{
4792 u32 err = 0;
4793
4794 /* reset gr engine */
4795 g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_GRAPH) |
4796 g->ops.mc.reset_mask(g, NVGPU_UNIT_BLG) |
4797 g->ops.mc.reset_mask(g, NVGPU_UNIT_PERFMON));
4798
4799 nvgpu_cg_init_gr_load_gating_prod(g);
4800
4801 /* Disable elcg until it gets enabled later in the init*/
4802 nvgpu_cg_elcg_disable_no_wait(g);
4803
4804 /* enable fifo access */
4805 gk20a_writel(g, gr_gpfifo_ctl_r(),
4806 gr_gpfifo_ctl_access_enabled_f() |
4807 gr_gpfifo_ctl_semaphore_access_enabled_f());
4808
4809 if (!g->gr.ctx_vars.valid) {
4810 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4811 if (err != 0U) {
4812 nvgpu_err(g,
4813 "fail to load gr init ctx");
4814 }
4815 }
4816 return err;
4817}
4818
4819static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4820{
4821 struct nvgpu_timeout timeout;
4822 bool fecs_scrubbing;
4823 bool gpccs_scrubbing;
4824
4825 nvgpu_log_fn(g, " ");
4826
4827 nvgpu_timeout_init(g, &timeout,
4828 CTXSW_MEM_SCRUBBING_TIMEOUT_MAX /
4829 CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT,
4830 NVGPU_TIMER_RETRY_TIMER);
4831 do {
4832 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4833 (gr_fecs_dmactl_imem_scrubbing_m() |
4834 gr_fecs_dmactl_dmem_scrubbing_m());
4835
4836 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4837 (gr_gpccs_dmactl_imem_scrubbing_m() |
4838			 gr_gpccs_dmactl_dmem_scrubbing_m()); /* was imem twice; the dmem accessor is assumed to exist in the gk20a hw header */
4839
4840 if (!fecs_scrubbing && !gpccs_scrubbing) {
4841 nvgpu_log_fn(g, "done");
4842 return 0;
4843 }
4844
4845 nvgpu_udelay(CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT);
4846 } while (nvgpu_timeout_expired(&timeout) == 0);
4847
4848 nvgpu_err(g, "Falcon mem scrubbing timeout");
4849 return -ETIMEDOUT;
4850}
4851
4852static int gr_gk20a_init_ctxsw(struct gk20a *g)
4853{
4854 u32 err = 0;
4855
4856 err = g->ops.gr.load_ctxsw_ucode(g);
4857 if (err != 0U) {
4858 goto out;
4859 }
4860
4861 err = gr_gk20a_wait_ctxsw_ready(g);
4862 if (err != 0U) {
4863 goto out;
4864 }
4865
4866out:
4867 if (err != 0U) {
4868 nvgpu_err(g, "fail");
4869 } else {
4870 nvgpu_log_fn(g, "done");
4871 }
4872
4873 return err;
4874}
4875
4876static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4877{
4878 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4879 u32 i, err = 0;
4880
4881 nvgpu_log_fn(g, " ");
4882
4883 /* enable interrupts */
4884 gk20a_writel(g, gr_intr_r(), ~0);
4885 gk20a_writel(g, gr_intr_en_r(), ~0);
4886
4887 /* load non_ctx init */
4888 for (i = 0; i < sw_non_ctx_load->count; i++) {
4889 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4890 sw_non_ctx_load->l[i].value);
4891 }
4892
4893 err = gr_gk20a_wait_mem_scrubbing(g);
4894 if (err != 0U) {
4895 goto out;
4896 }
4897
4898 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4899 GR_IDLE_CHECK_DEFAULT);
4900 if (err != 0U) {
4901 goto out;
4902 }
4903
4904out:
4905 if (err != 0U) {
4906 nvgpu_err(g, "fail");
4907 } else {
4908 nvgpu_log_fn(g, "done");
4909 }
4910
4911	return err;
4912}
4913
4914static int gr_gk20a_init_access_map(struct gk20a *g)
4915{
4916 struct gr_gk20a *gr = &g->gr;
4917 struct nvgpu_mem *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
4918 u32 nr_pages =
4919 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4920 PAGE_SIZE);
4921 u32 *whitelist = NULL;
4922 int w, num_entries = 0;
4923
4924 nvgpu_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages);
4925
4926 g->ops.gr.get_access_map(g, &whitelist, &num_entries);
4927
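	/*
	 * One bit per 4-byte-aligned priv register: map_bit = addr >> 2, the
	 * byte holding it is map_bit >> 3, and the bit within that byte is
	 * map_bit & 0x7.  E.g. an address of 0x00419000 lands on bit 0 of
	 * byte 0x20c80 in the access map.
	 */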
4928 for (w = 0; w < num_entries; w++) {
4929 u32 map_bit, map_byte, map_shift, x;
4930 map_bit = whitelist[w] >> 2;
4931 map_byte = map_bit >> 3;
4932 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4933 nvgpu_log_info(g, "access map addr:0x%x byte:0x%x bit:%d",
4934 whitelist[w], map_byte, map_shift);
4935 x = nvgpu_mem_rd32(g, mem, map_byte / sizeof(u32));
4936 x |= 1 << (
4937 (map_byte % sizeof(u32) * BITS_PER_BYTE)
4938 + map_shift);
4939 nvgpu_mem_wr32(g, mem, map_byte / sizeof(u32), x);
4940 }
4941
4942 return 0;
4943}
4944
4945static int gk20a_init_gr_setup_sw(struct gk20a *g)
4946{
4947 struct gr_gk20a *gr = &g->gr;
4948 int err = 0;
4949
4950 nvgpu_log_fn(g, " ");
4951
4952 if (gr->sw_ready) {
4953 nvgpu_log_fn(g, "skip init");
4954 return 0;
4955 }
4956
4957 gr->g = g;
4958
4959#if defined(CONFIG_GK20A_CYCLE_STATS)
4960 err = nvgpu_mutex_init(&g->gr.cs_lock);
4961 if (err != 0) {
4962 nvgpu_err(g, "Error in gr.cs_lock mutex initialization");
4963 return err;
4964 }
4965#endif
4966
4967 err = gr_gk20a_init_gr_config(g, gr);
4968 if (err != 0) {
4969 goto clean_up;
4970 }
4971
4972 err = gr_gk20a_init_map_tiles(g, gr);
4973 if (err != 0) {
4974 goto clean_up;
4975 }
4976
4977 if (g->ops.ltc.init_comptags) {
4978 err = g->ops.ltc.init_comptags(g, gr);
4979 if (err != 0) {
4980 goto clean_up;
4981 }
4982 }
4983
4984 err = gr_gk20a_init_zcull(g, gr);
4985 if (err != 0) {
4986 goto clean_up;
4987 }
4988
4989 err = g->ops.gr.alloc_global_ctx_buffers(g);
4990 if (err != 0) {
4991 goto clean_up;
4992 }
4993
4994 err = gr_gk20a_init_access_map(g);
4995 if (err != 0) {
4996 goto clean_up;
4997 }
4998
4999 gr_gk20a_load_zbc_default_table(g, gr);
5000
5001 if (g->ops.gr.init_czf_bypass) {
5002 g->ops.gr.init_czf_bypass(g);
5003 }
5004
5005 if (g->ops.gr.init_gfxp_wfi_timeout_count) {
5006 g->ops.gr.init_gfxp_wfi_timeout_count(g);
5007 }
5008
5009 err = nvgpu_mutex_init(&gr->ctx_mutex);
5010 if (err != 0) {
5011 nvgpu_err(g, "Error in gr.ctx_mutex initialization");
5012 goto clean_up;
5013 }
5014
5015 nvgpu_spinlock_init(&gr->ch_tlb_lock);
5016
5017 gr->remove_support = gk20a_remove_gr_support;
5018 gr->sw_ready = true;
5019
5020 err = nvgpu_ecc_init_support(g);
5021 if (err != 0) {
5022 goto clean_up;
5023 }
5024
5025 nvgpu_log_fn(g, "done");
5026 return 0;
5027
5028clean_up:
5029 nvgpu_err(g, "fail");
5030 gk20a_remove_gr_support(gr);
5031 return err;
5032}
5033
5034static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g)
5035{
5036 struct nvgpu_pmu *pmu = &g->pmu;
5037 struct mm_gk20a *mm = &g->mm;
5038 struct vm_gk20a *vm = mm->pmu.vm;
5039 int err = 0;
5040
5041 u32 size;
5042
5043 nvgpu_log_fn(g, " ");
5044
5045 size = 0;
5046
5047 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
5048 if (err != 0) {
5049 nvgpu_err(g,
5050 "fail to query fecs pg buffer size");
5051 return err;
5052 }
5053
5054 if (pmu->pg_buf.cpu_va == NULL) {
5055 err = nvgpu_dma_alloc_map_sys(vm, size, &pmu->pg_buf);
5056 if (err != 0) {
5057 nvgpu_err(g, "failed to allocate memory");
5058 return -ENOMEM;
5059 }
5060 }
5061
5062
5063 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &mm->pmu.inst_block);
5064 if (err != 0) {
5065 nvgpu_err(g,
5066 "fail to bind pmu inst to gr");
5067 return err;
5068 }
5069
5070 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.gpu_va);
5071 if (err != 0) {
5072 nvgpu_err(g,
5073 "fail to set pg buffer pmu va");
5074 return err;
5075 }
5076
5077 return err;
5078}
5079
5080int gk20a_init_gr_support(struct gk20a *g)
5081{
5082 int err = 0;
5083
5084 nvgpu_log_fn(g, " ");
5085
5086 g->gr.initialized = false;
5087
5088 /* this is required before gr_gk20a_init_ctx_state */
5089 err = nvgpu_mutex_init(&g->gr.fecs_mutex);
5090 if (err != 0) {
5091 nvgpu_err(g, "Error in gr.fecs_mutex initialization");
5092 return err;
5093 }
5094
5095 err = gr_gk20a_init_ctxsw(g);
5096 if (err != 0) {
5097 return err;
5098 }
5099
5100	/* this appears to query sw state, but FECS actually initializes the
5101	   ramchain etc., so this is really hw init */
5102 err = g->ops.gr.init_ctx_state(g);
5103 if (err != 0) {
5104 return err;
5105 }
5106
5107 err = gk20a_init_gr_setup_sw(g);
5108 if (err != 0) {
5109 return err;
5110 }
5111
5112 err = gk20a_init_gr_setup_hw(g);
5113 if (err != 0) {
5114 return err;
5115 }
5116
5117 if (g->can_elpg) {
5118 err = gk20a_init_gr_bind_fecs_elpg(g);
5119 if (err != 0) {
5120 return err;
5121 }
5122 }
5123
5124	/* GR is initialized, signal possible waiters */
5125 g->gr.initialized = true;
5126 nvgpu_cond_signal(&g->gr.init_wq);
5127
5128 return 0;
5129}
5130
5131/* Wait until GR is initialized */
5132void gk20a_gr_wait_initialized(struct gk20a *g)
5133{
5134 NVGPU_COND_WAIT(&g->gr.init_wq, g->gr.initialized, 0);
5135}
5136
5137#define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
5138#define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
5139#define NVA297_SET_SHADER_EXCEPTIONS 0x1528
5140#define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
5141
5142#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
5143
5144void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
5145{
5146 nvgpu_log_fn(g, " ");
5147
5148 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
5149 gk20a_writel(g,
5150 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
5151 gk20a_writel(g,
5152 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
5153 } else {
5154 /* setup sm warp esr report masks */
5155 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
5156 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
5157 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
5158 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
5159 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
5160 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
5161 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
5162 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
5163 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
5164 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
5165 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
5166 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
5167 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
5168 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
5169 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
5170 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
5171 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
5172 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
5173 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
5174 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
5175 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
5176
5177 /* setup sm global esr report mask */
5178 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
5179 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
5180 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
5181 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
5182 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
5183 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
5184 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
5185 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
5186 }
5187}
5188
5189int gk20a_enable_gr_hw(struct gk20a *g)
5190{
5191 int err;
5192
5193 nvgpu_log_fn(g, " ");
5194
5195 err = gk20a_init_gr_prepare(g);
5196 if (err != 0) {
5197 return err;
5198 }
5199
5200 err = gk20a_init_gr_reset_enable_hw(g);
5201 if (err != 0) {
5202 return err;
5203 }
5204
5205 nvgpu_log_fn(g, "done");
5206
5207 return 0;
5208}
5209
5210int gk20a_gr_reset(struct gk20a *g)
5211{
5212 int err;
5213 u32 size;
5214
5215 g->gr.initialized = false;
5216
5217 nvgpu_mutex_acquire(&g->gr.fecs_mutex);
5218
5219 err = gk20a_enable_gr_hw(g);
5220 if (err != 0) {
5221 nvgpu_mutex_release(&g->gr.fecs_mutex);
5222 return err;
5223 }
5224
5225 err = gk20a_init_gr_setup_hw(g);
5226 if (err != 0) {
5227 nvgpu_mutex_release(&g->gr.fecs_mutex);
5228 return err;
5229 }
5230
5231 err = gr_gk20a_init_ctxsw(g);
5232 if (err != 0) {
5233 nvgpu_mutex_release(&g->gr.fecs_mutex);
5234 return err;
5235 }
5236
5237 nvgpu_mutex_release(&g->gr.fecs_mutex);
5238
5239	/* this appears to query sw state, but FECS actually initializes the
5240	   ramchain etc., so this is really hw init */
5241 err = g->ops.gr.init_ctx_state(g);
5242 if (err != 0) {
5243 return err;
5244 }
5245
5246 size = 0;
5247 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
5248 if (err != 0) {
5249 nvgpu_err(g,
5250 "fail to query fecs pg buffer size");
5251 return err;
5252 }
5253
5254 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &g->mm.pmu.inst_block);
5255 if (err != 0) {
5256 nvgpu_err(g,
5257 "fail to bind pmu inst to gr");
5258 return err;
5259 }
5260
5261 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.gpu_va);
5262 if (err != 0) {
5263 nvgpu_err(g,
5264 "fail to set pg buffer pmu va");
5265 return err;
5266 }
5267
5268 nvgpu_cg_init_gr_load_gating_prod(g);
5269 nvgpu_cg_elcg_enable_no_wait(g);
5270
5271	/* GR is initialized, signal possible waiters */
5272 g->gr.initialized = true;
5273 nvgpu_cond_signal(&g->gr.init_wq);
5274
5275 return err;
5276}
5277
5278static void gk20a_gr_set_error_notifier(struct gk20a *g,
5279 struct gr_gk20a_isr_data *isr_data, u32 error_notifier)
5280{
5281 struct channel_gk20a *ch;
5282 struct tsg_gk20a *tsg;
5283 struct channel_gk20a *ch_tsg;
5284
5285 ch = isr_data->ch;
5286
5287 if (ch == NULL) {
5288 return;
5289 }
5290
5291 tsg = tsg_gk20a_from_ch(ch);
5292 if (tsg != NULL) {
5293 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
5294 nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
5295 channel_gk20a, ch_entry) {
5296 if (gk20a_channel_get(ch_tsg)) {
5297 g->ops.fifo.set_error_notifier(ch_tsg,
5298 error_notifier);
5299 gk20a_channel_put(ch_tsg);
5300 }
5301
5302 }
5303 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
5304 } else {
5305 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
5306 }
5307}
5308
5309static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
5310 struct gr_gk20a_isr_data *isr_data)
5311{
5312 nvgpu_log_fn(g, " ");
5313 gk20a_gr_set_error_notifier(g, isr_data,
5314 NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT);
5315 nvgpu_err(g,
5316 "gr semaphore timeout");
5317 return -EINVAL;
5318}
5319
5320static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
5321 struct gr_gk20a_isr_data *isr_data)
5322{
5323 nvgpu_log_fn(g, " ");
5324 gk20a_gr_set_error_notifier(g, isr_data,
5325 NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
5326 /* This is an unrecoverable error, reset is needed */
5327 nvgpu_err(g,
5328		"gr illegal notify pending");
5329 return -EINVAL;
5330}
5331
5332static int gk20a_gr_handle_illegal_method(struct gk20a *g,
5333 struct gr_gk20a_isr_data *isr_data)
5334{
5335 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
5336 isr_data->class_num, isr_data->offset,
5337 isr_data->data_lo);
5338 if (ret) {
5339 gk20a_gr_set_error_notifier(g, isr_data,
5340 NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
5341 nvgpu_err(g, "invalid method class 0x%08x"
5342 ", offset 0x%08x address 0x%08x",
5343 isr_data->class_num, isr_data->offset, isr_data->addr);
5344 }
5345 return ret;
5346}
5347
5348static int gk20a_gr_handle_illegal_class(struct gk20a *g,
5349 struct gr_gk20a_isr_data *isr_data)
5350{
5351 nvgpu_log_fn(g, " ");
5352 gk20a_gr_set_error_notifier(g, isr_data,
5353 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5354 nvgpu_err(g,
5355 "invalid class 0x%08x, offset 0x%08x",
5356 isr_data->class_num, isr_data->offset);
5357 return -EINVAL;
5358}
5359
5360int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
5361 struct gr_gk20a_isr_data *isr_data)
5362{
5363 u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_host_int_status_r());
5364 int ret = 0;
5365 u32 chid = isr_data->ch != NULL ?
5366 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5367
5368 if (gr_fecs_intr == 0U) {
5369 return 0;
5370 }
5371
5372 if (gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) {
5373 gk20a_gr_set_error_notifier(g, isr_data,
5374 NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD);
5375 nvgpu_err(g,
5376 "firmware method error 0x%08x for offset 0x%04x",
5377 gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)),
5378 isr_data->data_lo);
5379 ret = -1;
5380 } else if ((gr_fecs_intr &
5381 gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
5382 /* currently, recovery is not initiated */
5383 nvgpu_err(g, "fecs watchdog triggered for channel %u", chid);
5384 gk20a_fecs_dump_falcon_stats(g);
5385 gk20a_gpccs_dump_falcon_stats(g);
5386 gk20a_gr_debug_dump(g);
5387 } else if ((gr_fecs_intr &
5388 gr_fecs_host_int_status_ctxsw_intr_f(CTXSW_INTR0)) != 0U) {
5389 u32 mailbox_value = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6));
5390
5391 if (mailbox_value == MAILBOX_VALUE_TIMESTAMP_BUFFER_FULL) {
5392 nvgpu_info(g, "ctxsw intr0 set by ucode, "
5393 "timestamp buffer full");
5394#ifdef CONFIG_GK20A_CTXSW_TRACE
5395 gk20a_fecs_trace_reset_buffer(g);
5396#else
5397 ret = -1;
5398#endif
5399 } else {
5400 nvgpu_err(g,
5401 "ctxsw intr0 set by ucode, error_code: 0x%08x",
5402 mailbox_value);
5403 ret = -1;
5404 }
5405 } else {
5406 nvgpu_err(g,
5407 "unhandled fecs error interrupt 0x%08x for channel %u",
5408 gr_fecs_intr, chid);
5409 gk20a_fecs_dump_falcon_stats(g);
5410 gk20a_gpccs_dump_falcon_stats(g);
5411 }
5412
5413 gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr);
5414 return ret;
5415}
5416
5417static int gk20a_gr_handle_class_error(struct gk20a *g,
5418 struct gr_gk20a_isr_data *isr_data)
5419{
5420 u32 gr_class_error;
5421 u32 chid = isr_data->ch != NULL ?
5422 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5423
5424 nvgpu_log_fn(g, " ");
5425
5426 gr_class_error =
5427 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
5428 gk20a_gr_set_error_notifier(g, isr_data,
5429 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5430	nvgpu_err(g, "class error 0x%08x, offset 0x%08x, "
5431		"sub channel 0x%08x, mme generated %d, "
5432		"mme pc 0x%08x, data high %d, priv status %d, "
5433		"unhandled intr 0x%08x for channel %u",
5434 isr_data->class_num, (isr_data->offset << 2),
5435 gr_trapped_addr_subch_v(isr_data->addr),
5436 gr_trapped_addr_mme_generated_v(isr_data->addr),
5437 gr_trapped_data_mme_pc_v(
5438 gk20a_readl(g, gr_trapped_data_mme_r())),
5439 gr_trapped_addr_datahigh_v(isr_data->addr),
5440 gr_trapped_addr_priv_v(isr_data->addr),
5441 gr_class_error, chid);
5442
5443 nvgpu_err(g, "trapped data low 0x%08x",
5444 gk20a_readl(g, gr_trapped_data_lo_r()));
5445 if (gr_trapped_addr_datahigh_v(isr_data->addr)) {
5446 nvgpu_err(g, "trapped data high 0x%08x",
5447 gk20a_readl(g, gr_trapped_data_hi_r()));
5448 }
5449
5450 return -EINVAL;
5451}
5452
5453static int gk20a_gr_handle_firmware_method(struct gk20a *g,
5454 struct gr_gk20a_isr_data *isr_data)
5455{
5456 u32 chid = isr_data->ch != NULL ?
5457 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5458
5459 nvgpu_log_fn(g, " ");
5460
5461 gk20a_gr_set_error_notifier(g, isr_data,
5462 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5463 nvgpu_err(g,
5464 "firmware method 0x%08x, offset 0x%08x for channel %u",
5465 isr_data->class_num, isr_data->offset,
5466 chid);
5467 return -EINVAL;
5468}
5469
5470int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
5471 struct gr_gk20a_isr_data *isr_data)
5472{
5473 struct channel_gk20a *ch = isr_data->ch;
5474 struct tsg_gk20a *tsg;
5475
5476 if (ch == NULL) {
5477 return 0;
5478 }
5479
5480 tsg = tsg_gk20a_from_ch(ch);
5481 if (tsg != NULL) {
5482 g->ops.fifo.post_event_id(tsg,
5483 NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN);
5484
5485 nvgpu_cond_broadcast(&ch->semaphore_wq);
5486 } else {
5487 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
5488 }
5489
5490 return 0;
5491}
5492
5493#if defined(CONFIG_GK20A_CYCLE_STATS)
5494static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
5495 u32 offset)
5496{
5497 /* support only 24-bit 4-byte aligned offsets */
5498 bool valid = !(offset & 0xFF000003);
5499
5500 if (g->allow_all)
5501 return true;
5502
5503 /* whitelist check */
5504 valid = valid &&
5505 is_bar0_global_offset_whitelisted_gk20a(g, offset);
5506 /* resource size check in case there was a problem
5507 * with allocating the assumed size of bar0 */
5508 valid = valid && gk20a_io_valid_reg(g, offset);
5509 return valid;
5510}
5511#endif
5512
5513int gk20a_gr_handle_notify_pending(struct gk20a *g,
5514 struct gr_gk20a_isr_data *isr_data)
5515{
5516 struct channel_gk20a *ch = isr_data->ch;
5517
5518#if defined(CONFIG_GK20A_CYCLE_STATS)
5519 void *virtual_address;
5520 u32 buffer_size;
5521 u32 offset;
5522 bool exit;
5523#endif
5524 if (ch == NULL || tsg_gk20a_from_ch(ch) == NULL) {
5525 return 0;
5526 }
5527
5528#if defined(CONFIG_GK20A_CYCLE_STATS)
5529 /* GL will never use payload 0 for cycle state */
5530 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
5531 return 0;
5532
5533 nvgpu_mutex_acquire(&ch->cyclestate.cyclestate_buffer_mutex);
5534
5535 virtual_address = ch->cyclestate.cyclestate_buffer;
5536 buffer_size = ch->cyclestate.cyclestate_buffer_size;
5537 offset = isr_data->data_lo;
5538 exit = false;
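	/*
	 * Walk the shared cyclestats buffer: every record starts with a
	 * share_buffer_head, and BAR0_READ32/BAR0_WRITE32 records carry a
	 * full gk20a_cyclestate_buffer_elem describing a masked register
	 * access against a whitelisted offset.  OP_END terminates the walk.
	 */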
5539 while (!exit) {
5540 struct share_buffer_head *sh_hdr;
5541 u32 min_element_size;
5542
5543 /* validate offset */
5544 if (offset + sizeof(struct share_buffer_head) > buffer_size ||
5545 offset + sizeof(struct share_buffer_head) < offset) {
5546 nvgpu_err(g,
5547 "cyclestats buffer overrun at offset 0x%x",
5548 offset);
5549 break;
5550 }
5551
5552 sh_hdr = (struct share_buffer_head *)
5553 ((char *)virtual_address + offset);
5554
5555 min_element_size =
5556 (sh_hdr->operation == OP_END ?
5557 sizeof(struct share_buffer_head) :
5558 sizeof(struct gk20a_cyclestate_buffer_elem));
5559
5560 /* validate sh_hdr->size */
5561 if (sh_hdr->size < min_element_size ||
5562 offset + sh_hdr->size > buffer_size ||
5563 offset + sh_hdr->size < offset) {
5564 nvgpu_err(g,
5565 "bad cyclestate buffer header size at offset 0x%x",
5566 offset);
5567 sh_hdr->failed = true;
5568 break;
5569 }
5570
5571 switch (sh_hdr->operation) {
5572 case OP_END:
5573 exit = true;
5574 break;
5575
5576 case BAR0_READ32:
5577 case BAR0_WRITE32:
5578 {
5579 struct gk20a_cyclestate_buffer_elem *op_elem =
5580 (struct gk20a_cyclestate_buffer_elem *)sh_hdr;
5581 bool valid = is_valid_cyclestats_bar0_offset_gk20a(
5582 g, op_elem->offset_bar0);
5583 u32 raw_reg;
5584 u64 mask_orig;
5585 u64 v;
5586
5587 if (!valid) {
5588 nvgpu_err(g,
5589				"invalid cyclestats op offset: 0x%x",
5590 op_elem->offset_bar0);
5591
5592 sh_hdr->failed = exit = true;
5593 break;
5594 }
5595
5596
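			/*
			 * Build a mask covering bits first_bit..last_bit
			 * inclusive, e.g. first_bit = 4, last_bit = 7
			 * gives 0xf0.
			 */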
5597 mask_orig =
5598 ((1ULL <<
5599 (op_elem->last_bit + 1))
5600 -1)&~((1ULL <<
5601 op_elem->first_bit)-1);
5602
5603 raw_reg =
5604 gk20a_readl(g,
5605 op_elem->offset_bar0);
5606
5607 switch (sh_hdr->operation) {
5608 case BAR0_READ32:
5609 op_elem->data =
5610 (raw_reg & mask_orig)
5611 >> op_elem->first_bit;
5612 break;
5613
5614 case BAR0_WRITE32:
5615 v = 0;
5616 if ((unsigned int)mask_orig !=
5617 (unsigned int)~0) {
5618 v = (unsigned int)
5619 (raw_reg & ~mask_orig);
5620 }
5621
5622 v |= ((op_elem->data
5623 << op_elem->first_bit)
5624 & mask_orig);
5625
5626 gk20a_writel(g,
5627 op_elem->offset_bar0,
5628 (unsigned int)v);
5629 break;
5630 default:
5631				/* nop ok? */
5632 break;
5633 }
5634 }
5635 break;
5636
5637 default:
5638 /* no operation content case */
5639 exit = true;
5640 break;
5641 }
5642 sh_hdr->completed = true;
5643 offset += sh_hdr->size;
5644 }
5645 nvgpu_mutex_release(&ch->cyclestate.cyclestate_buffer_mutex);
5646#endif
5647 nvgpu_log_fn(g, " ");
5648 nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
5649 return 0;
5650}
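/*
 * Illustrative worked example (not driver code) of the BAR0_READ32 /
 * BAR0_WRITE32 bit-field handling above, assuming first_bit = 4 and
 * last_bit = 7:
 *
 *   mask_orig = ((1ULL << (7 + 1)) - 1) & ~((1ULL << 4) - 1)
 *             = 0xFF & ~0x0F
 *             = 0xF0
 *
 *   BAR0_READ32:  data = (raw_reg & 0xF0) >> 4    extracts bits [7:4]
 *   BAR0_WRITE32: v  = raw_reg & ~0xF0            keeps the other bits
 *                 v |= (data << 4) & 0xF0         inserts the new field
 *
 * When the mask covers the whole register (~0), the read-modify-write
 * step is skipped and the register is simply overwritten.
 */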
5651
5652/* Used by the sw interrupt thread to translate the current ctx to a chid.
5653 * Also used by regops to translate the current ctx to a chid and tsgid.
5654 * For performance, we don't want to walk all 128 channels every time.
5655 * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5656 * A small TLB is used here to cache translations.
5657 *
5658 * The returned channel must be freed with gk20a_channel_put() */
5659static struct channel_gk20a *gk20a_gr_get_channel_from_ctx(
5660 struct gk20a *g, u32 curr_ctx, u32 *curr_tsgid)
5661{
5662 struct fifo_gk20a *f = &g->fifo;
5663 struct gr_gk20a *gr = &g->gr;
5664 u32 chid = -1;
5665 u32 tsgid = NVGPU_INVALID_TSG_ID;
5666 u32 i;
5667 struct channel_gk20a *ret = NULL;
5668
5669 /* when contexts are unloaded from GR, the valid bit is reset
5670 * but the instance pointer information remains intact.
5671 * This might be called from gr_isr where contexts might be
5672 * unloaded. No need to check ctx_valid bit
5673 */
5674
5675 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
5676
5677 /* check cache first */
5678 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5679 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5680 chid = gr->chid_tlb[i].chid;
5681 tsgid = gr->chid_tlb[i].tsgid;
5682 ret = gk20a_channel_from_id(g, chid);
5683 goto unlock;
5684 }
5685 }
5686
5687 /* slow path */
5688 for (chid = 0; chid < f->num_channels; chid++) {
5689 struct channel_gk20a *ch = gk20a_channel_from_id(g, chid);
5690
5691 if (ch == NULL) {
5692 continue;
5693 }
5694
5695 if ((u32)(nvgpu_inst_block_addr(g, &ch->inst_block) >>
5696 ram_in_base_shift_v()) ==
5697 gr_fecs_current_ctx_ptr_v(curr_ctx)) {
5698 tsgid = ch->tsgid;
5699 /* found it */
5700 ret = ch;
5701 break;
5702 }
5703 gk20a_channel_put(ch);
5704 }
5705
5706 if (ret == NULL) {
5707 goto unlock;
5708 }
5709
5710 /* add to free tlb entry */
5711 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5712 if (gr->chid_tlb[i].curr_ctx == 0) {
5713 gr->chid_tlb[i].curr_ctx = curr_ctx;
5714 gr->chid_tlb[i].chid = chid;
5715 gr->chid_tlb[i].tsgid = tsgid;
5716 goto unlock;
5717 }
5718 }
5719
5720 /* no free entry, flush one */
5721 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5722 gr->chid_tlb[gr->channel_tlb_flush_index].chid = chid;
5723 gr->chid_tlb[gr->channel_tlb_flush_index].tsgid = tsgid;
5724
5725 gr->channel_tlb_flush_index =
5726 (gr->channel_tlb_flush_index + 1) &
5727 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5728
5729unlock:
5730 nvgpu_spinlock_release(&gr->ch_tlb_lock);
5731 if (curr_tsgid) {
5732 *curr_tsgid = tsgid;
5733 }
5734 return ret;
5735}
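/*
 * Minimal standalone sketch (assumptions only, not driver code) of the
 * ctx -> chid TLB pattern used above: a small fully-associative table,
 * searched linearly, filled into the first free slot (curr_ctx == 0),
 * and overwritten round-robin once full. `size` is assumed to be a
 * power of two, as the & (size - 1) wrap in the driver code implies.
 * The names below (tlb_entry, tlb_lookup_or_insert) are hypothetical.
 *
 *   struct tlb_entry { u32 curr_ctx; u32 chid; };
 *
 *   static u32 tlb_lookup_or_insert(struct tlb_entry *tlb, u32 size,
 *                                   u32 *flush_idx, u32 curr_ctx,
 *                                   u32 chid_on_miss)
 *   {
 *       u32 i;
 *
 *       for (i = 0; i < size; i++)
 *           if (tlb[i].curr_ctx == curr_ctx)
 *               return tlb[i].chid;
 *
 *       for (i = 0; i < size; i++)
 *           if (tlb[i].curr_ctx == 0) {
 *               tlb[i].curr_ctx = curr_ctx;
 *               tlb[i].chid = chid_on_miss;
 *               return chid_on_miss;
 *           }
 *
 *       tlb[*flush_idx].curr_ctx = curr_ctx;
 *       tlb[*flush_idx].chid = chid_on_miss;
 *       *flush_idx = (*flush_idx + 1) & (size - 1);
 *       return chid_on_miss;
 *   }
 */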
5736
5737int gk20a_gr_lock_down_sm(struct gk20a *g,
5738 u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask,
5739 bool check_errors)
5740{
5741 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5742 u32 dbgr_control0;
5743
5744 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5745 "GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm);
5746
5747 /* assert stop trigger */
5748 dbgr_control0 =
5749 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
5750 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5751 gk20a_writel(g,
5752 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
5753
5754 return g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask,
5755 check_errors);
5756}
5757
5758bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5759{
5760 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5761
5762 /* check if an sm debugger is attached.
5763 * assumption: all SMs will have debug mode enabled/disabled
5764 * uniformly. */
5765 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5766 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v()) {
5767 return true;
5768 }
5769
5770 return false;
5771}
5772
5773int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
5774 bool *post_event, struct channel_gk20a *fault_ch,
5775 u32 *hww_global_esr)
5776{
5777 int ret = 0;
5778 bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
5779 bool disable_sm_exceptions = true;
5780 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5781 bool sm_debugger_attached;
5782 u32 global_esr, warp_esr, global_mask;
5783
5784 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
5785
5786 sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
5787
5788 global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
5789 *hww_global_esr = global_esr;
5790 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
5791 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
5792
5793 if (!sm_debugger_attached) {
5794 nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
5795 global_esr, warp_esr);
5796 return -EFAULT;
5797 }
5798
5799 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5800 "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
5801
5802 gr_gk20a_elpg_protected_call(g,
5803 g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
5804
5805 if (g->ops.gr.pre_process_sm_exception) {
5806 ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
5807 global_esr, warp_esr,
5808 sm_debugger_attached,
5809 fault_ch,
5810 &early_exit,
5811 &ignore_debugger);
5812 if (ret) {
5813 nvgpu_err(g, "could not pre-process sm error!");
5814 return ret;
5815 }
5816 }
5817
5818 if (early_exit) {
5819 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5820 "returning early");
5821 return ret;
5822 }
5823
5824 /*
5825 * Disable forwarding of tpc exceptions,
5826 * the debugger will reenable exceptions after servicing them.
5827 *
5828 * Do not disable exceptions if the only SM exception is BPT_INT
5829 */
5830 if ((global_esr == gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f())
5831 && (warp_esr == 0)) {
5832 disable_sm_exceptions = false;
5833 }
5834
5835 if (!ignore_debugger && disable_sm_exceptions) {
5836 u32 tpc_exception_en = gk20a_readl(g,
5837 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
5838 offset);
5839 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5840 gk20a_writel(g,
5841 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
5842 tpc_exception_en);
5843 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM Exceptions disabled");
5844 }
5845
5846 /* if a debugger is present and an error has occurred, do a warp sync */
5847 if (!ignore_debugger &&
5848 ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5849 nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
5850 do_warp_sync = true;
5851 }
5852
5853 if (do_warp_sync) {
5854 ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
5855 global_mask, true);
5856 if (ret) {
5857 nvgpu_err(g, "sm did not lock down!");
5858 return ret;
5859 }
5860 }
5861
5862 if (ignore_debugger) {
5863 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5864 "ignore_debugger set, skipping event posting");
5865 } else {
5866 *post_event = true;
5867 }
5868
5869 return ret;
5870}
5871
5872int gr_gk20a_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
5873 bool *post_event)
5874{
5875 int ret = 0;
5876 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
5877 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
5878 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
5879 u32 esr;
5880
5881 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
5882
5883 esr = gk20a_readl(g,
5884 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset);
5885 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr);
5886
5887 gk20a_writel(g,
5888 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
5889 esr);
5890
5891 return ret;
5892}
5893
5894void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
5895 u32 *esr_sm_sel)
5896{
5897 *esr_sm_sel = 1;
5898}
5899
5900static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5901 bool *post_event, struct channel_gk20a *fault_ch,
5902 u32 *hww_global_esr)
5903{
5904 int ret = 0;
5905 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5906 u32 tpc_exception = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_r()
5907 + offset);
5908 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
5909
5910 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5911 "GPC%d TPC%d: pending exception 0x%x",
5912 gpc, tpc, tpc_exception);
5913
5914	/* check if an SM exception is pending */
5915 if (gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(tpc_exception) ==
5916 gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v()) {
5917 u32 esr_sm_sel, sm;
5918
5919 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5920 "GPC%d TPC%d: SM exception pending", gpc, tpc);
5921
5922 if (g->ops.gr.handle_tpc_sm_ecc_exception) {
5923 g->ops.gr.handle_tpc_sm_ecc_exception(g, gpc, tpc,
5924 post_event, fault_ch, hww_global_esr);
5925 }
5926
5927 g->ops.gr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel);
5928
5929 for (sm = 0; sm < sm_per_tpc; sm++) {
5930
5931 if ((esr_sm_sel & BIT32(sm)) == 0U) {
5932 continue;
5933 }
5934
5935 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5936 "GPC%d TPC%d: SM%d exception pending",
5937 gpc, tpc, sm);
5938
5939 ret |= g->ops.gr.handle_sm_exception(g,
5940 gpc, tpc, sm, post_event, fault_ch,
5941 hww_global_esr);
5942			/* clear the HWWs; this also causes TPC and GPC
5943			 * exceptions to be cleared. They should be cleared
5944			 * only if the SM is locked down or empty.
5945			 */
5946 g->ops.gr.clear_sm_hww(g,
5947 gpc, tpc, sm, *hww_global_esr);
5948
5949 }
5950
5951 }
5952
5953	/* check if a TEX exception is pending */
5954 if (gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(tpc_exception) ==
5955 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) {
5956 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5957 "GPC%d TPC%d: TEX exception pending", gpc, tpc);
5958 ret |= g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
5959 }
5960
5961 if (g->ops.gr.handle_tpc_mpc_exception) {
5962 ret |= g->ops.gr.handle_tpc_mpc_exception(g,
5963 gpc, tpc, post_event);
5964 }
5965
5966 return ret;
5967}
5968
5969static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5970 struct channel_gk20a *fault_ch, u32 *hww_global_esr)
5971{
5972 int ret = 0;
5973 u32 gpc_offset, gpc, tpc;
5974 struct gr_gk20a *gr = &g->gr;
5975 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5976 u32 gpc_exception;
5977
5978 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, " ");
5979
5980 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
5981 if ((exception1 & (1 << gpc)) == 0) {
5982 continue;
5983 }
5984
5985 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5986 "GPC%d exception pending", gpc);
5987
5988 gpc_offset = gk20a_gr_gpc_offset(g, gpc);
5989
5990 gpc_exception = gk20a_readl(g, gr_gpc0_gpccs_gpc_exception_r()
5991 + gpc_offset);
5992
5993 /* check if any tpc has an exception */
5994 for (tpc = 0; tpc < gr->gpc_tpc_count[gpc]; tpc++) {
5995 if ((gr_gpc0_gpccs_gpc_exception_tpc_v(gpc_exception) &
5996 (1 << tpc)) == 0) {
5997 continue;
5998 }
5999
6000 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6001 "GPC%d: TPC%d exception pending", gpc, tpc);
6002
6003 ret |= gk20a_gr_handle_tpc_exception(g, gpc, tpc,
6004 post_event, fault_ch, hww_global_esr);
6005
6006 }
6007
6008 /* Handle GCC exception */
6009 if ((gr_gpc0_gpccs_gpc_exception_gcc_v(gpc_exception) != 0U) &&
6010 (g->ops.gr.handle_gcc_exception != NULL)) {
6011 int gcc_ret = 0;
6012 gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc,
6013 post_event, fault_ch, hww_global_esr);
6014 ret |= (ret != 0) ? ret : gcc_ret;
6015 }
6016
6017 /* Handle GPCCS exceptions */
6018 if (g->ops.gr.handle_gpc_gpccs_exception) {
6019 int ret_ecc = 0;
6020 ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
6021 gpc_exception);
6022 ret |= (ret != 0) ? ret : ret_ecc;
6023 }
6024
6025 /* Handle GPCMMU exceptions */
6026 if (g->ops.gr.handle_gpc_gpcmmu_exception) {
6027 int ret_mmu = 0;
6028
6029 ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc,
6030 gpc_exception);
6031 ret |= (ret != 0) ? ret : ret_mmu;
6032 }
6033
6034 }
6035
6036 return ret;
6037}
6038
6039static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg,
6040 u32 global_esr)
6041{
6042 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) {
6043 g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT);
6044 }
6045
6046 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) {
6047 g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE);
6048 }
6049
6050 return 0;
6051}
6052
6053int gk20a_gr_isr(struct gk20a *g)
6054{
6055 struct gr_gk20a_isr_data isr_data;
6056 u32 grfifo_ctl;
6057 u32 obj_table;
6058 bool need_reset = false;
6059 u32 gr_intr = gk20a_readl(g, gr_intr_r());
6060 struct channel_gk20a *ch = NULL;
6061 struct channel_gk20a *fault_ch = NULL;
6062 u32 tsgid = NVGPU_INVALID_TSG_ID;
6063 struct tsg_gk20a *tsg = NULL;
6064 u32 gr_engine_id;
6065 u32 global_esr = 0;
6066 u32 chid;
6067
6068 nvgpu_log_fn(g, " ");
6069 nvgpu_log(g, gpu_dbg_intr, "pgraph intr 0x%08x", gr_intr);
6070
6071 if (gr_intr == 0U) {
6072 return 0;
6073 }
6074
6075 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
6076 if (gr_engine_id != FIFO_INVAL_ENGINE_ID) {
6077 gr_engine_id = BIT(gr_engine_id);
6078 }
6079
6080 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
6081 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
6082 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
6083
6084 gk20a_writel(g, gr_gpfifo_ctl_r(),
6085 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
6086 gr_gpfifo_ctl_semaphore_access_f(0));
6087
6088 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
6089 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
6090 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
6091 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
6092 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
6093 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
6094 obj_table = (isr_data.sub_chan < 4) ? gk20a_readl(g,
6095 gr_fe_object_table_r(isr_data.sub_chan)) : 0;
6096 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
6097
6098 ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, &tsgid);
6099 isr_data.ch = ch;
6100 chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
6101
6102 if (ch == NULL) {
6103 nvgpu_err(g, "pgraph intr: 0x%08x, chid: INVALID", gr_intr);
6104 } else {
6105 tsg = tsg_gk20a_from_ch(ch);
6106 if (tsg == NULL) {
6107 nvgpu_err(g, "pgraph intr: 0x%08x, chid: %d "
6108 "not bound to tsg", gr_intr, chid);
6109 }
6110 }
6111
6112 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6113 "channel %d: addr 0x%08x, "
6114		  "data 0x%08x 0x%08x, "
6115 "ctx 0x%08x, offset 0x%08x, "
6116 "subchannel 0x%08x, class 0x%08x",
6117 chid, isr_data.addr,
6118 isr_data.data_hi, isr_data.data_lo,
6119 isr_data.curr_ctx, isr_data.offset,
6120 isr_data.sub_chan, isr_data.class_num);
6121
6122 if (gr_intr & gr_intr_notify_pending_f()) {
6123 g->ops.gr.handle_notify_pending(g, &isr_data);
6124 gk20a_writel(g, gr_intr_r(),
6125 gr_intr_notify_reset_f());
6126 gr_intr &= ~gr_intr_notify_pending_f();
6127 }
6128
6129 if (gr_intr & gr_intr_semaphore_pending_f()) {
6130 g->ops.gr.handle_semaphore_pending(g, &isr_data);
6131 gk20a_writel(g, gr_intr_r(),
6132 gr_intr_semaphore_reset_f());
6133 gr_intr &= ~gr_intr_semaphore_pending_f();
6134 }
6135
6136 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
6137 if (gk20a_gr_handle_semaphore_timeout_pending(g,
6138 &isr_data) != 0) {
6139 need_reset = true;
6140 }
6141 gk20a_writel(g, gr_intr_r(),
6142 gr_intr_semaphore_reset_f());
6143 gr_intr &= ~gr_intr_semaphore_pending_f();
6144 }
6145
6146 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
6147 if (gk20a_gr_intr_illegal_notify_pending(g,
6148 &isr_data) != 0) {
6149 need_reset = true;
6150 }
6151 gk20a_writel(g, gr_intr_r(),
6152 gr_intr_illegal_notify_reset_f());
6153 gr_intr &= ~gr_intr_illegal_notify_pending_f();
6154 }
6155
6156 if (gr_intr & gr_intr_illegal_method_pending_f()) {
6157 if (gk20a_gr_handle_illegal_method(g, &isr_data) != 0) {
6158 need_reset = true;
6159 }
6160 gk20a_writel(g, gr_intr_r(),
6161 gr_intr_illegal_method_reset_f());
6162 gr_intr &= ~gr_intr_illegal_method_pending_f();
6163 }
6164
6165 if (gr_intr & gr_intr_illegal_class_pending_f()) {
6166 if (gk20a_gr_handle_illegal_class(g, &isr_data) != 0) {
6167 need_reset = true;
6168 }
6169 gk20a_writel(g, gr_intr_r(),
6170 gr_intr_illegal_class_reset_f());
6171 gr_intr &= ~gr_intr_illegal_class_pending_f();
6172 }
6173
6174 if (gr_intr & gr_intr_fecs_error_pending_f()) {
6175 if (g->ops.gr.handle_fecs_error(g, ch, &isr_data) != 0) {
6176 need_reset = true;
6177 }
6178 gk20a_writel(g, gr_intr_r(),
6179 gr_intr_fecs_error_reset_f());
6180 gr_intr &= ~gr_intr_fecs_error_pending_f();
6181 }
6182
6183 if (gr_intr & gr_intr_class_error_pending_f()) {
6184 if (gk20a_gr_handle_class_error(g, &isr_data) != 0) {
6185 need_reset = true;
6186 }
6187 gk20a_writel(g, gr_intr_r(),
6188 gr_intr_class_error_reset_f());
6189 gr_intr &= ~gr_intr_class_error_pending_f();
6190 }
6191
6192 /* this one happens if someone tries to hit a non-whitelisted
6193 * register using set_falcon[4] */
6194 if (gr_intr & gr_intr_firmware_method_pending_f()) {
6195 if (gk20a_gr_handle_firmware_method(g, &isr_data) != 0) {
6196 need_reset = true;
6197 }
6198 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
6199 gk20a_writel(g, gr_intr_r(),
6200 gr_intr_firmware_method_reset_f());
6201 gr_intr &= ~gr_intr_firmware_method_pending_f();
6202 }
6203
6204 if (gr_intr & gr_intr_exception_pending_f()) {
6205 u32 exception = gk20a_readl(g, gr_exception_r());
6206
6207 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
6208
6209 if (exception & gr_exception_fe_m()) {
6210 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
6211 u32 info = gk20a_readl(g, gr_fe_hww_esr_info_r());
6212
6213 nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
6214 fe, info);
6215 gk20a_writel(g, gr_fe_hww_esr_r(),
6216 gr_fe_hww_esr_reset_active_f());
6217 need_reset = true;
6218 }
6219
6220 if (exception & gr_exception_memfmt_m()) {
6221 u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r());
6222
6223 nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
6224 gk20a_writel(g, gr_memfmt_hww_esr_r(),
6225 gr_memfmt_hww_esr_reset_active_f());
6226 need_reset = true;
6227 }
6228
6229 if (exception & gr_exception_pd_m()) {
6230 u32 pd = gk20a_readl(g, gr_pd_hww_esr_r());
6231
6232 nvgpu_err(g, "pd exception: esr 0x%08x", pd);
6233 gk20a_writel(g, gr_pd_hww_esr_r(),
6234 gr_pd_hww_esr_reset_active_f());
6235 need_reset = true;
6236 }
6237
6238 if (exception & gr_exception_scc_m()) {
6239 u32 scc = gk20a_readl(g, gr_scc_hww_esr_r());
6240
6241 nvgpu_err(g, "scc exception: esr 0x%08x", scc);
6242 gk20a_writel(g, gr_scc_hww_esr_r(),
6243 gr_scc_hww_esr_reset_active_f());
6244 need_reset = true;
6245 }
6246
6247 if (exception & gr_exception_ds_m()) {
6248 u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
6249
6250 nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
6251 gk20a_writel(g, gr_ds_hww_esr_r(),
6252 gr_ds_hww_esr_reset_task_f());
6253 need_reset = true;
6254 }
6255
6256 if (exception & gr_exception_ssync_m()) {
6257 if (g->ops.gr.handle_ssync_hww) {
6258 if (g->ops.gr.handle_ssync_hww(g) != 0) {
6259 need_reset = true;
6260 }
6261 } else {
6262 nvgpu_err(g, "unhandled ssync exception");
6263 }
6264 }
6265
6266 if (exception & gr_exception_mme_m()) {
6267 u32 mme = gk20a_readl(g, gr_mme_hww_esr_r());
6268 u32 info = gk20a_readl(g, gr_mme_hww_esr_info_r());
6269
6270 nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
6271 mme, info);
6272 gk20a_writel(g, gr_mme_hww_esr_r(),
6273 gr_mme_hww_esr_reset_active_f());
6274 need_reset = true;
6275 }
6276
6277 if (exception & gr_exception_sked_m()) {
6278 u32 sked = gk20a_readl(g, gr_sked_hww_esr_r());
6279
6280 nvgpu_err(g, "sked exception: esr 0x%08x", sked);
6281 gk20a_writel(g, gr_sked_hww_esr_r(),
6282 gr_sked_hww_esr_reset_active_f());
6283 need_reset = true;
6284 }
6285
6286 /* check if a gpc exception has occurred */
6287 if (((exception & gr_exception_gpc_m()) != 0U) &&
6288 !need_reset) {
6289 bool post_event = false;
6290
6291 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6292 "GPC exception pending");
6293
6294 if (tsg != NULL) {
6295 fault_ch = isr_data.ch;
6296 }
6297
6298 /* fault_ch can be NULL */
6299 /* check if any gpc has an exception */
6300 if (gk20a_gr_handle_gpc_exception(g, &post_event,
6301 fault_ch, &global_esr) != 0) {
6302 need_reset = true;
6303 }
6304
6305 /* signal clients waiting on an event */
6306 if (g->ops.gr.sm_debugger_attached(g) &&
6307 post_event && (fault_ch != NULL)) {
6308 g->ops.debugger.post_events(fault_ch);
6309 }
6310 }
6311
6312 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
6313 gr_intr &= ~gr_intr_exception_pending_f();
6314
6315 if (need_reset) {
6316 nvgpu_err(g, "set gr exception notifier");
6317 gk20a_gr_set_error_notifier(g, &isr_data,
6318 NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
6319 }
6320 }
6321
6322 if (need_reset) {
6323 if (tsg != NULL) {
6324 gk20a_fifo_recover(g, gr_engine_id,
6325 tsgid, true, true, true,
6326 RC_TYPE_GR_FAULT);
6327 } else {
6328 if (ch != NULL) {
6329 nvgpu_err(g, "chid: %d referenceable but not "
6330 "bound to tsg", chid);
6331 }
6332 gk20a_fifo_recover(g, gr_engine_id,
6333 0, false, false, true,
6334 RC_TYPE_GR_FAULT);
6335 }
6336 }
6337
6338 if (gr_intr != 0U) {
6339 /* clear unhandled interrupts */
6340 if (ch == NULL) {
6341 /*
6342 * This is probably an interrupt during
6343 * gk20a_free_channel()
6344 */
6345 nvgpu_err(g, "unhandled gr intr 0x%08x for "
6346 "unreferenceable channel, clearing",
6347 gr_intr);
6348 } else {
6349 nvgpu_err(g, "unhandled gr intr 0x%08x for chid: %d",
6350 gr_intr, chid);
6351 }
6352 gk20a_writel(g, gr_intr_r(), gr_intr);
6353 }
6354
6355 gk20a_writel(g, gr_gpfifo_ctl_r(),
6356 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
6357 gr_gpfifo_ctl_semaphore_access_f(1));
6358
6359
6360 /* Posting of BPT events should be the last thing in this function */
6361 if ((global_esr != 0U) && (tsg != NULL)) {
6362 gk20a_gr_post_bpt_events(g, tsg, global_esr);
6363 }
6364
6365 if (ch) {
6366 gk20a_channel_put(ch);
6367 }
6368
6369 return 0;
6370}
6371
6372u32 gk20a_gr_nonstall_isr(struct gk20a *g)
6373{
6374 u32 ops = 0;
6375 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
6376
6377 nvgpu_log(g, gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
6378
6379 if ((gr_intr & gr_intr_nonstall_trap_pending_f()) != 0U) {
6380 /* Clear the interrupt */
6381 gk20a_writel(g, gr_intr_nonstall_r(),
6382 gr_intr_nonstall_trap_pending_f());
6383 ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE |
6384 GK20A_NONSTALL_OPS_POST_EVENTS);
6385 }
6386 return ops;
6387}
6388
6389int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
6390{
6391 BUG_ON(size == NULL);
6392 return gr_gk20a_submit_fecs_method_op(g,
6393 (struct fecs_method_op_gk20a) {
6394 .mailbox.id = 0,
6395 .mailbox.data = 0,
6396 .mailbox.clr = ~0,
6397 .method.data = 1,
6398 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
6399 .mailbox.ret = size,
6400 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
6401 .mailbox.ok = 0,
6402 .cond.fail = GR_IS_UCODE_OP_SKIP,
6403 .mailbox.fail = 0}, false);
6404}
6405
6406int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g,
6407 struct nvgpu_mem *inst_block)
6408{
6409 u32 data = fecs_current_ctx_data(g, inst_block);
6410
6411 return gr_gk20a_submit_fecs_method_op(g,
6412 (struct fecs_method_op_gk20a){
6413 .mailbox.id = 4,
6414 .mailbox.data = data,
6415 .mailbox.clr = ~0,
6416 .method.data = 1,
6417 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
6418 .mailbox.ret = NULL,
6419 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6420 .mailbox.ok = 1,
6421 .cond.fail = GR_IS_UCODE_OP_SKIP,
6422 .mailbox.fail = 0}, false);
6423}
6424
6425int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
6426{
6427 return gr_gk20a_submit_fecs_method_op(g,
6428 (struct fecs_method_op_gk20a) {
6429 .mailbox.id = 4,
6430 .mailbox.data = u64_lo32(pmu_va >> 8),
6431 .mailbox.clr = ~0,
6432 .method.data = 1,
6433 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
6434 .mailbox.ret = NULL,
6435 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6436 .mailbox.ok = 1,
6437 .cond.fail = GR_IS_UCODE_OP_SKIP,
6438 .mailbox.fail = 0}, false);
6439}
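/*
 * Sketch of how the three reglist helpers above are expected to be
 * sequenced by their caller (buffer/variable names here are
 * hypothetical, not taken from this file):
 *
 *   u32 size = 0;
 *
 *   err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *   ...allocate and map a buffer of `size` bytes for the PMU...
 *   err = gr_gk20a_fecs_set_reglist_bind_inst(g, pmu_inst_block);
 *   err = gr_gk20a_fecs_set_reglist_virtual_addr(g, reglist_gpu_va);
 *
 * i.e. discover the image size, bind the PMU instance block, then hand
 * FECS the GPU virtual address of the allocated reglist buffer.
 */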
6440
6441int gk20a_gr_suspend(struct gk20a *g)
6442{
6443 u32 ret = 0;
6444
6445 nvgpu_log_fn(g, " ");
6446
6447 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
6448 GR_IDLE_CHECK_DEFAULT);
6449 if (ret) {
6450 return ret;
6451 }
6452
6453 gk20a_writel(g, gr_gpfifo_ctl_r(),
6454 gr_gpfifo_ctl_access_disabled_f());
6455
6456 /* disable gr intr */
6457 gk20a_writel(g, gr_intr_r(), 0);
6458 gk20a_writel(g, gr_intr_en_r(), 0);
6459
6460 /* disable all exceptions */
6461 gk20a_writel(g, gr_exception_r(), 0);
6462 gk20a_writel(g, gr_exception_en_r(), 0);
6463 gk20a_writel(g, gr_exception1_r(), 0);
6464 gk20a_writel(g, gr_exception1_en_r(), 0);
6465 gk20a_writel(g, gr_exception2_r(), 0);
6466 gk20a_writel(g, gr_exception2_en_r(), 0);
6467
6468 gk20a_gr_flush_channel_tlb(&g->gr);
6469
6470 g->gr.initialized = false;
6471
6472 nvgpu_log_fn(g, "done");
6473 return ret;
6474}
6475
6476static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6477 u32 addr,
6478 bool is_quad, u32 quad,
6479 u32 *context_buffer,
6480 u32 context_buffer_size,
6481 u32 *priv_offset);
6482
6483static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
6484 u32 addr,
6485 u32 *priv_offset);
6486
6487/* This function will decode a priv address and return the partition type and numbers. */
6488int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
6489 enum ctxsw_addr_type *addr_type,
6490 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
6491 u32 *broadcast_flags)
6492{
6493 u32 gpc_addr;
6494
6495 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6496
6497 /* setup defaults */
6498 *addr_type = CTXSW_ADDR_TYPE_SYS;
6499 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
6500 *gpc_num = 0;
6501 *tpc_num = 0;
6502 *ppc_num = 0;
6503 *be_num = 0;
6504
6505 if (pri_is_gpc_addr(g, addr)) {
6506 *addr_type = CTXSW_ADDR_TYPE_GPC;
6507 gpc_addr = pri_gpccs_addr_mask(addr);
6508 if (pri_is_gpc_addr_shared(g, addr)) {
6509 *addr_type = CTXSW_ADDR_TYPE_GPC;
6510 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
6511 } else {
6512 *gpc_num = pri_get_gpc_num(g, addr);
6513 }
6514
6515 if (pri_is_ppc_addr(g, gpc_addr)) {
6516 *addr_type = CTXSW_ADDR_TYPE_PPC;
6517 if (pri_is_ppc_addr_shared(g, gpc_addr)) {
6518 *broadcast_flags |= PRI_BROADCAST_FLAGS_PPC;
6519 return 0;
6520 }
6521 }
6522 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
6523 *addr_type = CTXSW_ADDR_TYPE_TPC;
6524 if (pri_is_tpc_addr_shared(g, gpc_addr)) {
6525 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
6526 return 0;
6527 }
6528 *tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6529 }
6530 return 0;
6531 } else if (pri_is_be_addr(g, addr)) {
6532 *addr_type = CTXSW_ADDR_TYPE_BE;
6533 if (pri_is_be_addr_shared(g, addr)) {
6534 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
6535 return 0;
6536 }
6537 *be_num = pri_get_be_num(g, addr);
6538 return 0;
6539 } else if (g->ops.ltc.pri_is_ltc_addr(g, addr)) {
6540 *addr_type = CTXSW_ADDR_TYPE_LTCS;
6541 if (g->ops.ltc.is_ltcs_ltss_addr(g, addr)) {
6542 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTCS;
6543 } else if (g->ops.ltc.is_ltcn_ltss_addr(g, addr)) {
6544 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTSS;
6545 }
6546 return 0;
6547 } else if (pri_is_fbpa_addr(g, addr)) {
6548 *addr_type = CTXSW_ADDR_TYPE_FBPA;
6549 if (pri_is_fbpa_addr_shared(g, addr)) {
6550 *broadcast_flags |= PRI_BROADCAST_FLAGS_FBPA;
6551 return 0;
6552 }
6553 return 0;
6554 } else if ((g->ops.gr.is_egpc_addr != NULL) &&
6555 g->ops.gr.is_egpc_addr(g, addr)) {
6556 return g->ops.gr.decode_egpc_addr(g,
6557 addr, addr_type, gpc_num,
6558 tpc_num, broadcast_flags);
6559 } else {
6560 *addr_type = CTXSW_ADDR_TYPE_SYS;
6561 return 0;
6562 }
6563 /* PPC!?!?!?! */
6564
6565 /*NOTREACHED*/
6566 return -EINVAL;
6567}
6568
6569void gr_gk20a_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
6570 u32 num_fbpas,
6571 u32 *priv_addr_table, u32 *t)
6572{
6573 u32 fbpa_id;
6574
6575 for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) {
6576 priv_addr_table[(*t)++] = pri_fbpa_addr(g,
6577 pri_fbpa_addr_mask(g, addr), fbpa_id);
6578 }
6579}
6580
6581int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
6582 u32 gpc_num,
6583 u32 *priv_addr_table, u32 *t)
6584{
6585 u32 ppc_num;
6586
6587 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6588
6589 for (ppc_num = 0; ppc_num < g->gr.gpc_ppc_count[gpc_num]; ppc_num++) {
6590 priv_addr_table[(*t)++] = pri_ppc_addr(g, pri_ppccs_addr_mask(addr),
6591 gpc_num, ppc_num);
6592 }
6593
6594 return 0;
6595}
6596
6597/*
6598 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
6599 * unicast addresses. This function will convert a BE unicast address to a BE
6600 * broadcast address and split a GPC/TPC broadcast address into a table of
6601 * GPC/TPC addresses. The addresses generated by this function can be
6602 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
6603 */
6604int gr_gk20a_create_priv_addr_table(struct gk20a *g,
6605 u32 addr,
6606 u32 *priv_addr_table,
6607 u32 *num_registers)
6608{
6609 enum ctxsw_addr_type addr_type;
6610 u32 gpc_num, tpc_num, ppc_num, be_num;
6611 u32 priv_addr, gpc_addr;
6612 u32 broadcast_flags;
6613 u32 t;
6614 int err;
6615
6616 t = 0;
6617 *num_registers = 0;
6618
6619 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6620
6621 err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
6622 &gpc_num, &tpc_num, &ppc_num, &be_num,
6623 &broadcast_flags);
6624 nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
6625 if (err != 0) {
6626 return err;
6627 }
6628
6629 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6630 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6631 /* The BE broadcast registers are included in the compressed PRI
6632 * table. Convert a BE unicast address to a broadcast address
6633 * so that we can look up the offset. */
6634 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
6635 ((broadcast_flags & PRI_BROADCAST_FLAGS_BE) == 0U)) {
6636 priv_addr_table[t++] = pri_be_shared_addr(g, addr);
6637 } else {
6638 priv_addr_table[t++] = addr;
6639 }
6640
6641 *num_registers = t;
6642 return 0;
6643 }
6644
6645 /* The GPC/TPC unicast registers are included in the compressed PRI
6646 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
6647 * that we can look up the offsets. */
6648 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
6649 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
6650
6651 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) {
6652 for (tpc_num = 0;
6653 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6654 tpc_num++) {
6655 priv_addr_table[t++] =
6656 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6657 gpc_num, tpc_num);
6658 }
6659
6660 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6661 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
6662 priv_addr_table, &t);
6663 if (err != 0) {
6664 return err;
6665 }
6666 } else {
6667 priv_addr = pri_gpc_addr(g,
6668 pri_gpccs_addr_mask(addr),
6669 gpc_num);
6670
6671 gpc_addr = pri_gpccs_addr_mask(priv_addr);
6672 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6673 if (tpc_num >= g->gr.gpc_tpc_count[gpc_num]) {
6674 continue;
6675 }
6676
6677 priv_addr_table[t++] = priv_addr;
6678 }
6679 }
6680 } else if (((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
6681 (addr_type == CTXSW_ADDR_TYPE_ETPC)) &&
6682 (g->ops.gr.egpc_etpc_priv_addr_table != NULL)) {
6683 nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type : EGPC/ETPC");
6684 g->ops.gr.egpc_etpc_priv_addr_table(g, addr, gpc_num, tpc_num,
6685 broadcast_flags, priv_addr_table, &t);
6686 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTSS) {
6687 g->ops.ltc.split_lts_broadcast_addr(g, addr,
6688 priv_addr_table, &t);
6689 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTCS) {
6690 g->ops.ltc.split_ltc_broadcast_addr(g, addr,
6691 priv_addr_table, &t);
6692 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_FBPA) {
6693 g->ops.gr.split_fbpa_broadcast_addr(g, addr,
6694 nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS),
6695 priv_addr_table, &t);
6696 } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) == 0U) {
6697 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) {
6698 for (tpc_num = 0;
6699 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6700 tpc_num++) {
6701 priv_addr_table[t++] =
6702 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6703 gpc_num, tpc_num);
6704 }
6705 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6706 err = gr_gk20a_split_ppc_broadcast_addr(g,
6707 addr, gpc_num, priv_addr_table, &t);
6708 } else {
6709 priv_addr_table[t++] = addr;
6710 }
6711 }
6712
6713 *num_registers = t;
6714 return 0;
6715}
6716
6717int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
6718 u32 addr,
6719 u32 max_offsets,
6720 u32 *offsets, u32 *offset_addrs,
6721 u32 *num_offsets,
6722 bool is_quad, u32 quad)
6723{
6724 u32 i;
6725 u32 priv_offset = 0;
6726 u32 *priv_registers;
6727 u32 num_registers = 0;
6728 int err = 0;
6729 struct gr_gk20a *gr = &g->gr;
6730 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6731 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6732 sm_per_tpc;
6733
6734 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6735
6736	/* the implementation is crossed up if either of these happens */
6737 if (max_offsets > potential_offsets) {
6738 nvgpu_log_fn(g, "max_offsets > potential_offsets");
6739 return -EINVAL;
6740 }
6741
6742 if (!g->gr.ctx_vars.golden_image_initialized) {
6743 return -ENODEV;
6744 }
6745
6746 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6747 if (priv_registers == NULL) {
6748 nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
6749		err = -ENOMEM;
6750 goto cleanup;
6751 }
6752 memset(offsets, 0, sizeof(u32) * max_offsets);
6753 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6754 *num_offsets = 0;
6755
6756 g->ops.gr.create_priv_addr_table(g, addr, &priv_registers[0],
6757 &num_registers);
6758
6759 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6760 nvgpu_log_fn(g, "max_offsets = %d, num_registers = %d",
6761 max_offsets, num_registers);
6762 err = -EINVAL;
6763 goto cleanup;
6764 }
6765
6766 if ((max_offsets == 1) && (num_registers > 1)) {
6767 num_registers = 1;
6768 }
6769
6770 if (g->gr.ctx_vars.local_golden_image == NULL) {
6771 nvgpu_log_fn(g, "no context switch header info to work with");
6772 err = -EINVAL;
6773 goto cleanup;
6774 }
6775
6776 for (i = 0; i < num_registers; i++) {
6777 err = gr_gk20a_find_priv_offset_in_buffer(g,
6778 priv_registers[i],
6779 is_quad, quad,
6780 g->gr.ctx_vars.local_golden_image,
6781 g->gr.ctx_vars.golden_image_size,
6782 &priv_offset);
6783 if (err != 0) {
6784 nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
6785 addr); /*, grPriRegStr(addr)));*/
6786 goto cleanup;
6787 }
6788
6789 offsets[i] = priv_offset;
6790 offset_addrs[i] = priv_registers[i];
6791 }
6792
6793 *num_offsets = num_registers;
6794cleanup:
6795 if (!IS_ERR_OR_NULL(priv_registers)) {
6796 nvgpu_kfree(g, priv_registers);
6797 }
6798
6799 return err;
6800}
6801
6802int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g,
6803 u32 addr,
6804 u32 max_offsets,
6805 u32 *offsets, u32 *offset_addrs,
6806 u32 *num_offsets)
6807{
6808 u32 i;
6809 u32 priv_offset = 0;
6810 u32 *priv_registers;
6811 u32 num_registers = 0;
6812 int err = 0;
6813 struct gr_gk20a *gr = &g->gr;
6814 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6815 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6816 sm_per_tpc;
6817
6818 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6819
6820	/* the implementation is crossed up if either of these happens */
6821 if (max_offsets > potential_offsets) {
6822 return -EINVAL;
6823 }
6824
6825 if (!g->gr.ctx_vars.golden_image_initialized) {
6826 return -ENODEV;
6827 }
6828
6829 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6830 if (priv_registers == NULL) {
6831 nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
6832 return -ENOMEM;
6833 }
6834 memset(offsets, 0, sizeof(u32) * max_offsets);
6835 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6836 *num_offsets = 0;
6837
6838 g->ops.gr.create_priv_addr_table(g, addr, priv_registers,
6839 &num_registers);
6840
6841 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6842 err = -EINVAL;
6843 goto cleanup;
6844 }
6845
6846 if ((max_offsets == 1) && (num_registers > 1)) {
6847 num_registers = 1;
6848 }
6849
6850 if (g->gr.ctx_vars.local_golden_image == NULL) {
6851 nvgpu_log_fn(g, "no context switch header info to work with");
6852 err = -EINVAL;
6853 goto cleanup;
6854 }
6855
6856 for (i = 0; i < num_registers; i++) {
6857 err = gr_gk20a_find_priv_offset_in_pm_buffer(g,
6858 priv_registers[i],
6859 &priv_offset);
6860 if (err != 0) {
6861 nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
6862 addr); /*, grPriRegStr(addr)));*/
6863 goto cleanup;
6864 }
6865
6866 offsets[i] = priv_offset;
6867 offset_addrs[i] = priv_registers[i];
6868 }
6869
6870 *num_offsets = num_registers;
6871cleanup:
6872 nvgpu_kfree(g, priv_registers);
6873
6874 return err;
6875}
6876
6877/* Set up some register tables. This looks hacky; our
6878 * register/offset accessors are just that, functions,
6879 * so they can't be used as initializers... TBD: fix this to
6880 * generate constants, at least on an as-needed basis.
6881 */
6882static const u32 _num_ovr_perf_regs = 17;
6883static u32 _ovr_perf_regs[17] = { 0, };
6884/* Following are the blocks of registers that the ucode
6885 * stores in the extended region. */
6886
6887void gk20a_gr_init_ovr_sm_dsm_perf(void)
6888{
6889 if (_ovr_perf_regs[0] != 0) {
6890 return;
6891 }
6892
6893 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
6894 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
6895 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
6896 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
6897 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
6898 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
6899 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
6900 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
6901 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
6902 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
6903 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
6904 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
6905 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
6906 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
6907 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
6908 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
6909 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
6910
6911}
6912
6913/* TBD: we would like to handle this elsewhere, at a higher level.
6914 * These are currently constructed in a "test-then-write" style,
6915 * which makes it impossible to know externally whether a ctx
6916 * write will actually occur. Later we should put a lazy,
6917 * map-and-hold system in the patch write state. */
6918static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6919 struct channel_gk20a *ch,
6920 u32 addr, u32 data,
6921 struct nvgpu_mem *mem)
6922{
6923 u32 num_gpc = g->gr.gpc_count;
6924 u32 num_tpc;
6925 u32 tpc, gpc, reg;
6926 u32 chk_addr;
6927 u32 vaddr_lo;
6928 u32 vaddr_hi;
6929 u32 tmp;
6930 u32 num_ovr_perf_regs = 0;
6931 u32 *ovr_perf_regs = NULL;
6932 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6933 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6934 struct tsg_gk20a *tsg;
6935 struct nvgpu_gr_ctx *gr_ctx;
6936 struct nvgpu_mem *ctxheader = &ch->ctx_header;
6937
6938 tsg = tsg_gk20a_from_ch(ch);
6939 if (tsg == NULL) {
6940 return -EINVAL;
6941 }
6942
6943 gr_ctx = &tsg->gr_ctx;
6944 g->ops.gr.init_ovr_sm_dsm_perf();
6945 g->ops.gr.init_sm_dsm_reg_info();
6946 g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs);
6947
6948 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6949
6950 for (reg = 0; reg < num_ovr_perf_regs; reg++) {
6951 for (gpc = 0; gpc < num_gpc; gpc++) {
6952 num_tpc = g->gr.gpc_tpc_count[gpc];
6953 for (tpc = 0; tpc < num_tpc; tpc++) {
6954 chk_addr = ((gpc_stride * gpc) +
6955 (tpc_in_gpc_stride * tpc) +
6956 ovr_perf_regs[reg]);
6957 if (chk_addr != addr) {
6958 continue;
6959 }
6960				/* reset the patch count from previous
6961				   runs, if the ucode has already
6962				   processed it */
6963 tmp = nvgpu_mem_rd(g, mem,
6964 ctxsw_prog_main_image_patch_count_o());
6965
6966 if (tmp == 0U) {
6967 gr_ctx->patch_ctx.data_count = 0;
6968 }
6969
6970 gr_gk20a_ctx_patch_write(g, gr_ctx,
6971 addr, data, true);
6972
6973 vaddr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
6974 vaddr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
6975
6976 nvgpu_mem_wr(g, mem,
6977 ctxsw_prog_main_image_patch_count_o(),
6978 gr_ctx->patch_ctx.data_count);
6979 if (ctxheader->gpu_va) {
6980 nvgpu_mem_wr(g, ctxheader,
6981 ctxsw_prog_main_image_patch_adr_lo_o(),
6982 vaddr_lo);
6983 nvgpu_mem_wr(g, ctxheader,
6984 ctxsw_prog_main_image_patch_adr_hi_o(),
6985 vaddr_hi);
6986 } else {
6987 nvgpu_mem_wr(g, mem,
6988 ctxsw_prog_main_image_patch_adr_lo_o(),
6989 vaddr_lo);
6990 nvgpu_mem_wr(g, mem,
6991 ctxsw_prog_main_image_patch_adr_hi_o(),
6992 vaddr_hi);
6993 }
6994
6995				/* we're not caching these on the CPU side,
6996				   but watch out for that later */
6997 return 0;
6998 }
6999 }
7000 }
7001
7002 return 0;
7003}
7004
7005#define ILLEGAL_ID ((u32)~0)
7006
7007static inline bool check_main_image_header_magic(u8 *context)
7008{
7009 u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());
7010 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
7011}
7012static inline bool check_local_header_magic(u8 *context)
7013{
7014 u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());
7015 return magic == ctxsw_prog_local_magic_value_v_value_v();
7016
7017}
7018
7019/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
7020static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
7021{
7022 return 256;
7023}
7024
7025void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
7026 u32 **ovr_perf_regs)
7027{
7028 *num_ovr_perf_regs = _num_ovr_perf_regs;
7029 *ovr_perf_regs = _ovr_perf_regs;
7030}
7031
7032static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
7033 u32 addr,
7034 bool is_quad, u32 quad,
7035 u32 *context_buffer,
7036 u32 context_buffer_size,
7037 u32 *priv_offset)
7038{
7039 u32 i, data32;
7040 u32 gpc_num, tpc_num;
7041 u32 num_gpcs, num_tpcs;
7042 u32 chk_addr;
7043 u32 ext_priv_offset, ext_priv_size;
7044 u8 *context;
7045 u32 offset_to_segment, offset_to_segment_end;
7046 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
7047 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
7048 u32 num_ext_gpccs_ext_buffer_segments;
7049 u32 inter_seg_offset;
7050 u32 max_tpc_count;
7051 u32 *sm_dsm_perf_ctrl_regs = NULL;
7052 u32 num_sm_dsm_perf_ctrl_regs = 0;
7053 u32 *sm_dsm_perf_regs = NULL;
7054 u32 num_sm_dsm_perf_regs = 0;
7055 u32 buffer_segments_size = 0;
7056 u32 marker_size = 0;
7057 u32 control_register_stride = 0;
7058 u32 perf_register_stride = 0;
7059 struct gr_gk20a *gr = &g->gr;
7060 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7061 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7062 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7063 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7064 u32 tpc_gpc_mask = (tpc_in_gpc_stride - 1);
7065
7066	/* Only TPC registers live in the extended region, so if this is not
7067	   a TPC reg, return an error so the caller can look elsewhere. */
7068 if (pri_is_gpc_addr(g, addr)) {
7069 u32 gpc_addr = 0;
7070 gpc_num = pri_get_gpc_num(g, addr);
7071 gpc_addr = pri_gpccs_addr_mask(addr);
7072 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
7073 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
7074 } else {
7075 return -EINVAL;
7076 }
7077
7078 nvgpu_log_info(g, " gpc = %d tpc = %d",
7079 gpc_num, tpc_num);
7080 } else if ((g->ops.gr.is_etpc_addr != NULL) &&
7081 g->ops.gr.is_etpc_addr(g, addr)) {
7082 g->ops.gr.get_egpc_etpc_num(g, addr, &gpc_num, &tpc_num);
7083 gpc_base = g->ops.gr.get_egpc_base(g);
7084 } else {
7085 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7086 "does not exist in extended region");
7087 return -EINVAL;
7088 }
7089
7090 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
7091 /* note below is in words/num_registers */
7092 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
7093
7094 context = (u8 *)context_buffer;
7095 /* sanity check main header */
7096 if (!check_main_image_header_magic(context)) {
7097 nvgpu_err(g,
7098 "Invalid main header: magic value");
7099 return -EINVAL;
7100 }
7101 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7102 if (gpc_num >= num_gpcs) {
7103 nvgpu_err(g,
7104 "GPC 0x%08x is greater than total count 0x%08x!",
7105 gpc_num, num_gpcs);
7106 return -EINVAL;
7107 }
7108
7109 data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());
7110 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
7111 if (0 == ext_priv_size) {
7112 nvgpu_log_info(g, " No extended memory in context buffer");
7113 return -EINVAL;
7114 }
7115 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
7116
7117 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
7118 offset_to_segment_end = offset_to_segment +
7119 (ext_priv_size * buffer_segments_size);
7120
7121 /* check local header magic */
7122 context += ctxsw_prog_ucode_header_size_in_bytes();
7123 if (!check_local_header_magic(context)) {
7124 nvgpu_err(g,
7125 "Invalid local header: magic value");
7126 return -EINVAL;
7127 }
7128
7129 /*
7130 * See if the incoming register address is in the first table of
7131 * registers. We check this by decoding only the TPC addr portion.
7132 * If we get a hit on the TPC bit, we then double check the address
7133 * by computing it from the base gpc/tpc strides. Then make sure
7134 * it is a real match.
7135 */
7136 g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
7137 &sm_dsm_perf_regs,
7138 &perf_register_stride);
7139
7140 g->ops.gr.init_sm_dsm_reg_info();
7141
7142 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
7143 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
7144 sm_dsm_perf_reg_id = i;
7145
7146 nvgpu_log_info(g, "register match: 0x%08x",
7147 sm_dsm_perf_regs[i]);
7148
7149 chk_addr = (gpc_base + gpc_stride * gpc_num) +
7150 tpc_in_gpc_base +
7151 (tpc_in_gpc_stride * tpc_num) +
7152 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask);
7153
7154 if (chk_addr != addr) {
7155 nvgpu_err(g,
7156				   "Oops addr mismatch: 0x%08x != 0x%08x",
7157 addr, chk_addr);
7158 return -EINVAL;
7159 }
7160 break;
7161 }
7162 }
7163
7164	/* Didn't find the reg in supported group 1,
7165	 * so try the second group now */
7166 g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
7167 &sm_dsm_perf_ctrl_regs,
7168 &control_register_stride);
7169
7170 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
7171 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
7172 if ((addr & tpc_gpc_mask) ==
7173 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
7174 sm_dsm_perf_ctrl_reg_id = i;
7175
7176 nvgpu_log_info(g, "register match: 0x%08x",
7177 sm_dsm_perf_ctrl_regs[i]);
7178
7179 chk_addr = (gpc_base + gpc_stride * gpc_num) +
7180 tpc_in_gpc_base +
7181 tpc_in_gpc_stride * tpc_num +
7182 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
7183 tpc_gpc_mask);
7184
7185 if (chk_addr != addr) {
7186 nvgpu_err(g,
7187					"Oops addr mismatch: 0x%08x != 0x%08x",
7188 addr, chk_addr);
7189 return -EINVAL;
7190
7191 }
7192
7193 break;
7194 }
7195 }
7196 }
7197
7198 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
7199 (ILLEGAL_ID == sm_dsm_perf_reg_id)) {
7200 return -EINVAL;
7201 }
7202
7203 /* Skip the FECS extended header, nothing there for us now. */
7204 offset_to_segment += buffer_segments_size;
7205
7206	/* skip through the GPCCS extended headers until we get to the data for
7207	 * our GPC. The size of each GPC extended segment is enough to hold the
7208	 * max TPC count for the GPCs, in 256B chunks.
7209	 */
7210
7211 max_tpc_count = gr->max_tpc_per_gpc_count;
7212
7213 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
7214
7215 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
7216 buffer_segments_size * gpc_num);
7217
7218 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7219
7220 /* skip the head marker to start with */
7221 inter_seg_offset = marker_size;
7222
7223 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
7224		/* skip over the control regs of the TPCs before the one we
7225		 * want, then skip to the register in this TPC */
7226 inter_seg_offset = inter_seg_offset +
7227 (tpc_num * control_register_stride) +
7228 sm_dsm_perf_ctrl_reg_id;
7229 } else {
7230 /* skip all the control registers */
7231 inter_seg_offset = inter_seg_offset +
7232 (num_tpcs * control_register_stride);
7233
7234 /* skip the marker between control and counter segments */
7235 inter_seg_offset += marker_size;
7236
7237 /* skip over counter regs of TPCs before the one we want */
7238 inter_seg_offset = inter_seg_offset +
7239 (tpc_num * perf_register_stride) *
7240 ctxsw_prog_extended_num_smpc_quadrants_v();
7241
7242		/* skip over the registers for the quadrants we do not want,
7243		 * then skip to the register in this TPC */
7244 inter_seg_offset = inter_seg_offset +
7245 (perf_register_stride * quad) +
7246 sm_dsm_perf_reg_id;
7247 }
7248
7249 /* set the offset to the segment offset plus the inter segment offset to
7250 * our register */
7251 offset_to_segment += (inter_seg_offset * 4);
7252
7253 /* last sanity check: did we somehow compute an offset outside the
7254 * extended buffer? */
7255 if (offset_to_segment > offset_to_segment_end) {
7256 nvgpu_err(g,
7257 "Overflow ctxsw buffer! 0x%08x > 0x%08x",
7258 offset_to_segment, offset_to_segment_end);
7259 return -EINVAL;
7260 }
7261
7262 *priv_offset = offset_to_segment;
7263
7264 return 0;
7265}
7266
7267
7268static int
7269gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
7270 enum ctxsw_addr_type addr_type,
7271 u32 pri_addr,
7272 u32 gpc_num, u32 num_tpcs,
7273 u32 num_ppcs, u32 ppc_mask,
7274 u32 *priv_offset)
7275{
7276 u32 i;
7277 u32 address, base_address;
7278 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
7279 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
7280 struct aiv_gk20a *reg;
7281 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7282 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7283 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7284 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7285 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7286 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7287
7288 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
7289
7290 if (!g->gr.ctx_vars.valid) {
7291 return -EINVAL;
7292 }
7293
7294 /* Process the SYS/BE segment. */
7295 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7296 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7297 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
7298 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
7299 address = reg->addr;
7300 sys_offset = reg->index;
7301
7302 if (pri_addr == address) {
7303 *priv_offset = sys_offset;
7304 return 0;
7305 }
7306 }
7307 }
7308
7309 /* Process the TPC segment. */
7310 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7311 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7312 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
7313 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
7314 address = reg->addr;
7315 tpc_addr = pri_tpccs_addr_mask(address);
7316 base_address = gpc_base +
7317 (gpc_num * gpc_stride) +
7318 tpc_in_gpc_base +
7319 (tpc_num * tpc_in_gpc_stride);
7320 address = base_address + tpc_addr;
7321 /*
7322 * The data for the TPCs is interleaved in the context buffer.
7323 * Example with num_tpcs = 2
7324 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7325 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7326 */
7327 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7328
7329 if (pri_addr == address) {
7330 *priv_offset = tpc_offset;
7331 return 0;
7332 }
7333 }
7334 }
7335 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7336 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7337 if (g->ops.gr.get_egpc_base == NULL) {
7338 return -EINVAL;
7339 }
7340
7341 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7342 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.etpc.count; i++) {
7343 reg = &g->gr.ctx_vars.ctxsw_regs.etpc.l[i];
7344 address = reg->addr;
7345 tpc_addr = pri_tpccs_addr_mask(address);
7346 base_address = g->ops.gr.get_egpc_base(g) +
7347 (gpc_num * gpc_stride) +
7348 tpc_in_gpc_base +
7349 (tpc_num * tpc_in_gpc_stride);
7350 address = base_address + tpc_addr;
7351 /*
7352 * The data for the TPCs is interleaved in the context buffer.
7353 * Example with num_tpcs = 2
7354 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7355 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7356 */
7357 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7358
7359 if (pri_addr == address) {
7360 *priv_offset = tpc_offset;
7361 nvgpu_log(g,
7362 gpu_dbg_fn | gpu_dbg_gpu_dbg,
7363						"egpc/etpc priv_offset=%#08x",
7364 *priv_offset);
7365 return 0;
7366 }
7367 }
7368 }
7369 }
7370
7371
7372 /* Process the PPC segment. */
7373 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7374 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
7375 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
7376 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
7377 address = reg->addr;
7378 ppc_addr = pri_ppccs_addr_mask(address);
7379 base_address = gpc_base +
7380 (gpc_num * gpc_stride) +
7381 ppc_in_gpc_base +
7382 (ppc_num * ppc_in_gpc_stride);
7383 address = base_address + ppc_addr;
7384 /*
7385 * The data for the PPCs is interleaved in the context buffer.
7386 * Example with numPpcs = 2
7387 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7388 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7389 */
7390 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
7391
7392 if (pri_addr == address) {
7393 *priv_offset = ppc_offset;
7394 return 0;
7395 }
7396 }
7397 }
7398 }
7399
7400
7401 /* Process the GPC segment. */
7402 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7403 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
7404 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
7405
7406 address = reg->addr;
7407 gpc_addr = pri_gpccs_addr_mask(address);
7408 gpc_offset = reg->index;
7409
7410 base_address = gpc_base + (gpc_num * gpc_stride);
7411 address = base_address + gpc_addr;
7412
7413 if (pri_addr == address) {
7414 *priv_offset = gpc_offset;
7415 return 0;
7416 }
7417 }
7418 }
7419 return -EINVAL;
7420}
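/*
 * Worked example (illustrative values, not driver data) of the
 * interleaved offset computation used above, with reg->index taken as
 * the byte offset of the register within a single TPC's list:
 *
 *   num_tpcs = 2, reg->index = 8 (third register), tpc_num = 1
 *   tpc_offset = (8 * 2) + (1 * 4) = 20 bytes = word 5
 *
 * which is entry "1-2" (TPC1, register 2) in the interleave diagram in
 * the comments above. The PPC segment uses the same scheme with
 * num_ppcs in place of num_tpcs.
 */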
7421
7422static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7423 u8 *context,
7424 u32 *num_ppcs, u32 *ppc_mask,
7425 u32 *reg_ppc_count)
7426{
7427 u32 data32;
7428 u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
7429
7430 /*
7431 * if there is only 1 PES_PER_GPC, then we put the PES registers
7432 * in the GPC reglist, so we can't error out if ppc.count == 0
7433 */
7434 if ((!g->gr.ctx_vars.valid) ||
7435 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
7436 (num_pes_per_gpc > 1))) {
7437 return -EINVAL;
7438 }
7439
7440 data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());
7441
7442 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
7443 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
7444
7445 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
7446
7447 return 0;
7448}
7449
7450int gr_gk20a_get_offset_in_gpccs_segment(struct gk20a *g,
7451 enum ctxsw_addr_type addr_type,
7452 u32 num_tpcs,
7453 u32 num_ppcs,
7454 u32 reg_list_ppc_count,
7455 u32 *__offset_in_segment)
7456{
7457 u32 offset_in_segment = 0;
7458 struct gr_gk20a *gr = &g->gr;
7459
7460 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7461 /*
7462 * reg = gr->ctx_vars.ctxsw_regs.tpc.l;
7463 * offset_in_segment = 0;
7464 */
7465 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7466 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7467 offset_in_segment =
7468 ((gr->ctx_vars.ctxsw_regs.tpc.count *
7469 num_tpcs) << 2);
7470
7471 nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg,
7472 "egpc etpc offset_in_segment 0x%#08x",
7473 offset_in_segment);
7474 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7475 /*
7476 * The ucode stores TPC data before PPC data.
7477 * Advance offset past TPC data to PPC data.
7478 */
7479 offset_in_segment =
7480 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7481 gr->ctx_vars.ctxsw_regs.etpc.count) *
7482 num_tpcs) << 2);
7483 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7484 /*
7485 * The ucode stores TPC/PPC data before GPC data.
7486 * Advance offset past TPC/PPC data to GPC data.
7487 *
7488 * Note 1 PES_PER_GPC case
7489 */
7490 u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
7491 GPU_LIT_NUM_PES_PER_GPC);
7492 if (num_pes_per_gpc > 1) {
7493 offset_in_segment =
7494 ((((gr->ctx_vars.ctxsw_regs.tpc.count +
7495 gr->ctx_vars.ctxsw_regs.etpc.count) *
7496 num_tpcs) << 2) +
7497 ((reg_list_ppc_count * num_ppcs) << 2));
7498 } else {
7499 offset_in_segment =
7500 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7501 gr->ctx_vars.ctxsw_regs.etpc.count) *
7502 num_tpcs) << 2);
7503 }
7504 } else {
7505 nvgpu_log_fn(g, "Unknown address type.");
7506 return -EINVAL;
7507 }
7508
7509 *__offset_in_segment = offset_in_segment;
7510 return 0;
7511}
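
/*
 * Worked example, illustrative numbers only: with ctxsw_regs.tpc.count = 10,
 * ctxsw_regs.etpc.count = 0, reg_list_ppc_count = 4, num_tpcs = 2,
 * num_ppcs = 1 and num_pes_per_gpc > 1, the per-GPC segment resolves to
 *
 *   CTXSW_ADDR_TYPE_TPC : offset_in_segment = 0
 *   CTXSW_ADDR_TYPE_PPC : offset_in_segment = (10 + 0) * 2 * 4 = 80
 *   CTXSW_ADDR_TYPE_GPC : offset_in_segment = 80 + (4 * 1 * 4) = 96
 *
 * i.e. the TPC block is skipped to reach the PPC data, and both are
 * skipped to reach the GPC data, exactly as computed above.
 */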
7512
7513/*
7514 * This function will return the 32 bit offset for a priv register if it is
7515 * present in the context buffer. The context buffer is in CPU memory.
7516 */
7517static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7518 u32 addr,
7519 bool is_quad, u32 quad,
7520 u32 *context_buffer,
7521 u32 context_buffer_size,
7522 u32 *priv_offset)
7523{
7524 u32 i, data32;
7525 int err;
7526 enum ctxsw_addr_type addr_type;
7527 u32 broadcast_flags;
7528 u32 gpc_num, tpc_num, ppc_num, be_num;
7529 u32 num_gpcs, num_tpcs, num_ppcs;
7530 u32 offset;
7531 u32 sys_priv_offset, gpc_priv_offset;
7532 u32 ppc_mask, reg_list_ppc_count;
7533 u8 *context;
7534 u32 offset_to_segment, offset_in_segment = 0;
7535
7536 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
7537
7538 err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
7539 &gpc_num, &tpc_num, &ppc_num, &be_num,
7540 &broadcast_flags);
7541 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7542 "addr_type = %d, broadcast_flags: %08x",
7543 addr_type, broadcast_flags);
7544 if (err != 0) {
7545 return err;
7546 }
7547
7548 context = (u8 *)context_buffer;
7549 if (!check_main_image_header_magic(context)) {
7550 nvgpu_err(g,
7551 "Invalid main header: magic value");
7552 return -EINVAL;
7553 }
7554 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7555
7556 /* Parse the FECS local header. */
7557 context += ctxsw_prog_ucode_header_size_in_bytes();
7558 if (!check_local_header_magic(context)) {
7559 nvgpu_err(g,
7560 "Invalid FECS local header: magic value");
7561 return -EINVAL;
7562 }
7563 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7564 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7565 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset);
7566
7567 /* If found in Ext buffer, ok.
7568 * If it failed and we expected to find it there (quad offset)
7569 * then return the error. Otherwise continue on.
7570 */
7571 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
7572 addr, is_quad, quad, context_buffer,
7573 context_buffer_size, priv_offset);
7574 if ((err == 0) || ((err != 0) && is_quad)) {
7575 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7576 "err = %d, is_quad = %s",
7577 err, is_quad ? "true" : "false");
7578 return err;
7579 }
7580
7581 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7582 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7583 /* Find the offset in the FECS segment. */
7584 offset_to_segment = sys_priv_offset *
7585 ctxsw_prog_ucode_header_size_in_bytes();
7586
7587 err = gr_gk20a_process_context_buffer_priv_segment(g,
7588 addr_type, addr,
7589 0, 0, 0, 0,
7590 &offset);
7591 if (err != 0) {
7592 return err;
7593 }
7594
7595 *priv_offset = (offset_to_segment + offset);
7596 return 0;
7597 }
7598
7599 if ((gpc_num + 1) > num_gpcs) {
7600 nvgpu_err(g,
7601 "GPC %d not in this context buffer.",
7602 gpc_num);
7603 return -EINVAL;
7604 }
7605
7606 /* Parse the GPCCS local header(s).*/
7607 for (i = 0; i < num_gpcs; i++) {
7608 context += ctxsw_prog_ucode_header_size_in_bytes();
7609 if (!check_local_header_magic(context)) {
7610 nvgpu_err(g,
7611 "Invalid GPCCS local header: magic value");
7612 return -EINVAL;
7613
7614 }
7615 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7616 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7617
7618 err = gr_gk20a_determine_ppc_configuration(g, context,
7619 &num_ppcs, &ppc_mask,
7620 &reg_list_ppc_count);
7621 if (err != 0) {
7622 nvgpu_err(g, "determine ppc configuration failed");
7623 return err;
7624 }
7625
7626
7627 num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
7628
7629 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
7630 nvgpu_err(g,
7631 "GPC %d TPC %d not in this context buffer.",
7632 gpc_num, tpc_num);
7633 return -EINVAL;
7634 }
7635
7636 /* Find the offset in the GPCCS segment.*/
7637 if (i == gpc_num) {
7638 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7639 "gpc_priv_offset 0x%#08x",
7640 gpc_priv_offset);
7641 offset_to_segment = gpc_priv_offset *
7642 ctxsw_prog_ucode_header_size_in_bytes();
7643
7644 err = g->ops.gr.get_offset_in_gpccs_segment(g,
7645 addr_type,
7646 num_tpcs, num_ppcs, reg_list_ppc_count,
7647 &offset_in_segment);
7648 if (err != 0) {
7649 return -EINVAL;
7650 }
7651
7652 offset_to_segment += offset_in_segment;
7653 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7654 "offset_to_segment 0x%#08x",
7655 offset_to_segment);
7656
7657 err = gr_gk20a_process_context_buffer_priv_segment(g,
7658 addr_type, addr,
7659 i, num_tpcs,
7660 num_ppcs, ppc_mask,
7661 &offset);
7662 if (err != 0) {
7663 return -EINVAL;
7664 }
7665
7666 *priv_offset = offset_to_segment + offset;
7667 return 0;
7668 }
7669 }
7670
7671 return -EINVAL;
7672}
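
/*
 * Minimal sketch of the header walk above (illustrative helper, not used
 * by the driver): the context image begins with the FECS main header,
 * then the FECS local header, then one GPCCS local header per GPC, each
 * ctxsw_prog_ucode_header_size_in_bytes() long.  A local header's
 * priv_register_ctl offset is expressed in units of that header size, so
 * an offset_to_segment is just priv_register_ctl_offset * header size.
 */
static inline u32 ctxsw_gpccs_local_header_pos(u32 gpc_index)
{
	/* index 0: main header, index 1: FECS local header, then GPCCS */
	return (2U + gpc_index) * ctxsw_prog_ucode_header_size_in_bytes();
}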
7673
7674static int map_cmp(const void *a, const void *b)
7675{
7676 struct ctxsw_buf_offset_map_entry *e1 =
7677 (struct ctxsw_buf_offset_map_entry *)a;
7678 struct ctxsw_buf_offset_map_entry *e2 =
7679 (struct ctxsw_buf_offset_map_entry *)b;
7680
7681 if (e1->addr < e2->addr) {
7682 return -1;
7683 }
7684
7685 if (e1->addr > e2->addr) {
7686 return 1;
7687 }
7688 return 0;
7689}
7690
7691static int add_ctxsw_buffer_map_entries_pmsys(struct ctxsw_buf_offset_map_entry *map,
7692 struct aiv_list_gk20a *regs,
7693 u32 *count, u32 *offset,
7694 u32 max_cnt, u32 base, u32 mask)
7695{
7696 u32 idx;
7697 u32 cnt = *count;
7698 u32 off = *offset;
7699
7700 if ((cnt + regs->count) > max_cnt) {
7701 return -EINVAL;
7702 }
7703
7704 for (idx = 0; idx < regs->count; idx++) {
7705 if ((base + (regs->l[idx].addr & mask)) < 0xFFF) {
7706 map[cnt].addr = base + (regs->l[idx].addr & mask)
7707 + NV_PCFG_BASE;
7708 } else {
7709 map[cnt].addr = base + (regs->l[idx].addr & mask);
7710 }
7711 map[cnt++].offset = off;
7712 off += 4;
7713 }
7714 *count = cnt;
7715 *offset = off;
7716 return 0;
7717}
7718
7719static int add_ctxsw_buffer_map_entries_pmgpc(struct gk20a *g,
7720 struct ctxsw_buf_offset_map_entry *map,
7721 struct aiv_list_gk20a *regs,
7722 u32 *count, u32 *offset,
7723 u32 max_cnt, u32 base, u32 mask)
7724{
7725 u32 idx;
7726 u32 cnt = *count;
7727 u32 off = *offset;
7728
7729 if ((cnt + regs->count) > max_cnt) {
7730 return -EINVAL;
7731 }
7732
7733 /* NOTE: The PPC offsets get added to the pm_gpc list if numPpc <= 1
7734 * To handle the case of PPC registers getting added into GPC, the below
7735 * code specifically checks for any PPC offsets and adds them using
7736 * proper mask
7737 */
7738 for (idx = 0; idx < regs->count; idx++) {
7739 /* Check if the address is PPC address */
7740 if (pri_is_ppc_addr_shared(g, regs->l[idx].addr & mask)) {
7741 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g,
7742 GPU_LIT_PPC_IN_GPC_BASE);
7743 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
7744 GPU_LIT_PPC_IN_GPC_STRIDE);
7745 /* Use PPC mask instead of the GPC mask provided */
7746 u32 ppcmask = ppc_in_gpc_stride - 1;
7747
7748 map[cnt].addr = base + ppc_in_gpc_base
7749 + (regs->l[idx].addr & ppcmask);
7750 } else {
7751 map[cnt].addr = base + (regs->l[idx].addr & mask);
7752 }
7753 map[cnt++].offset = off;
7754 off += 4;
7755 }
7756 *count = cnt;
7757 *offset = off;
7758 return 0;
7759}
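
/*
 * Illustrative example, hypothetical litter values: assuming
 * PPC_IN_GPC_BASE = 0x3000 and PPC_IN_GPC_STRIDE = 0x200, a pm_gpc list
 * entry whose masked address is recognized by pri_is_ppc_addr_shared()
 * is remapped as
 *
 *   map.addr = base + 0x3000 + (addr & 0x1ff)
 *
 * rather than base + (addr & gpc_mask), so PPC registers folded into the
 * GPC list (the single-PES case noted above) still resolve to the real
 * PPC priv space.
 */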
7760
7761static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map,
7762 struct aiv_list_gk20a *regs,
7763 u32 *count, u32 *offset,
7764 u32 max_cnt, u32 base, u32 mask)
7765{
7766 u32 idx;
7767 u32 cnt = *count;
7768 u32 off = *offset;
7769
7770 if ((cnt + regs->count) > max_cnt) {
7771 return -EINVAL;
7772 }
7773
7774 for (idx = 0; idx < regs->count; idx++) {
7775 map[cnt].addr = base + (regs->l[idx].addr & mask);
7776 map[cnt++].offset = off;
7777 off += 4;
7778 }
7779 *count = cnt;
7780 *offset = off;
7781 return 0;
7782}
7783
7784/* Helper function to add register entries to the register map for all
7785 * subunits
7786 */
7787static int add_ctxsw_buffer_map_entries_subunits(
7788 struct ctxsw_buf_offset_map_entry *map,
7789 struct aiv_list_gk20a *regs,
7790 u32 *count, u32 *offset,
7791 u32 max_cnt, u32 base,
7792 u32 num_units, u32 stride, u32 mask)
7793{
7794 u32 unit;
7795 u32 idx;
7796 u32 cnt = *count;
7797 u32 off = *offset;
7798
7799 if ((cnt + (regs->count * num_units)) > max_cnt) {
7800 return -EINVAL;
7801 }
7802
7803 /* Data is interleaved for units in ctxsw buffer */
7804 for (idx = 0; idx < regs->count; idx++) {
7805 for (unit = 0; unit < num_units; unit++) {
7806 map[cnt].addr = base + (regs->l[idx].addr & mask) +
7807 (unit * stride);
7808 map[cnt++].offset = off;
7809 off += 4;
7810 }
7811 }
7812 *count = cnt;
7813 *offset = off;
7814 return 0;
7815}
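
/*
 * Illustrative example: for a two-entry register list and num_units = 3,
 * the entries produced above are register-major and unit-minor:
 *
 *   reg0 unit0 -> off + 0x00    reg1 unit0 -> off + 0x0c
 *   reg0 unit1 -> off + 0x04    reg1 unit1 -> off + 0x10
 *   reg0 unit2 -> off + 0x08    reg1 unit2 -> off + 0x14
 *
 * mirroring how the ucode interleaves per-unit data in the ctxsw buffer.
 */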
7816
7817int gr_gk20a_add_ctxsw_reg_pm_fbpa(struct gk20a *g,
7818 struct ctxsw_buf_offset_map_entry *map,
7819 struct aiv_list_gk20a *regs,
7820 u32 *count, u32 *offset,
7821 u32 max_cnt, u32 base,
7822 u32 num_fbpas, u32 stride, u32 mask)
7823{
7824 return add_ctxsw_buffer_map_entries_subunits(map, regs, count, offset,
7825 max_cnt, base, num_fbpas, stride, mask);
7826}
7827
7828static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g,
7829 struct ctxsw_buf_offset_map_entry *map,
7830 u32 *count, u32 *offset, u32 max_cnt)
7831{
7832 u32 num_gpcs = g->gr.gpc_count;
7833 u32 num_ppcs, num_tpcs, gpc_num, base;
7834 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7835 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7836 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7837 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7838 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7839 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7840
7841 for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) {
7842 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7843 base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base;
7844 if (add_ctxsw_buffer_map_entries_subunits(map,
7845 &g->gr.ctx_vars.ctxsw_regs.pm_tpc,
7846 count, offset, max_cnt, base, num_tpcs,
7847 tpc_in_gpc_stride,
7848 (tpc_in_gpc_stride - 1))) {
7849 return -EINVAL;
7850 }
7851
7852 num_ppcs = g->gr.gpc_ppc_count[gpc_num];
7853 base = gpc_base + (gpc_stride * gpc_num) + ppc_in_gpc_base;
7854 if (add_ctxsw_buffer_map_entries_subunits(map,
7855 &g->gr.ctx_vars.ctxsw_regs.pm_ppc,
7856 count, offset, max_cnt, base, num_ppcs,
7857 ppc_in_gpc_stride,
7858 (ppc_in_gpc_stride - 1))) {
7859 return -EINVAL;
7860 }
7861
7862 base = gpc_base + (gpc_stride * gpc_num);
7863 if (add_ctxsw_buffer_map_entries_pmgpc(g, map,
7864 &g->gr.ctx_vars.ctxsw_regs.pm_gpc,
7865 count, offset, max_cnt, base,
7866 (gpc_stride - 1))) {
7867 return -EINVAL;
7868 }
7869
7870 base = NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE * gpc_num;
7871 if (add_ctxsw_buffer_map_entries(map,
7872 &g->gr.ctx_vars.ctxsw_regs.pm_ucgpc,
7873 count, offset, max_cnt, base, ~0)) {
7874 return -EINVAL;
7875 }
7876
7877 base = (g->ops.gr.get_pmm_per_chiplet_offset() * gpc_num);
7878 if (add_ctxsw_buffer_map_entries(map,
7879 &g->gr.ctx_vars.ctxsw_regs.perf_gpc,
7880 count, offset, max_cnt, base, ~0)) {
7881 return -EINVAL;
7882 }
7883
7884 base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num);
7885 if (add_ctxsw_buffer_map_entries(map,
7886 &g->gr.ctx_vars.ctxsw_regs.gpc_router,
7887 count, offset, max_cnt, base, ~0)) {
7888 return -EINVAL;
7889 }
7890
7891 /* Counter Aggregation Unit, if available */
7892 if (g->gr.ctx_vars.ctxsw_regs.pm_cau.count) {
7893 base = gpc_base + (gpc_stride * gpc_num)
7894 + tpc_in_gpc_base;
7895 if (add_ctxsw_buffer_map_entries_subunits(map,
7896 &g->gr.ctx_vars.ctxsw_regs.pm_cau,
7897 count, offset, max_cnt, base, num_tpcs,
7898 tpc_in_gpc_stride,
7899 (tpc_in_gpc_stride - 1))) {
7900 return -EINVAL;
7901 }
7902 }
7903
7904 *offset = ALIGN(*offset, 256);
7905 }
7906 return 0;
7907}
7908
7909int gr_gk20a_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map,
7910 struct aiv_list_gk20a *regs,
7911 u32 *count, u32 *offset,
7912 u32 max_cnt, u32 base, u32 mask)
7913{
7914 return add_ctxsw_buffer_map_entries(map, regs,
7915 count, offset, max_cnt, base, mask);
7916}
7917
7918/*
7919 * PM CTXSW BUFFER LAYOUT :
7920 *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE
7921 *| |
7922 *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words
7923 *|---------------------------------------------|
7924 *| |
7925 *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words
7926 *|---------------------------------------------|
7927 *| |
7928 *| LIST_compressed_nv_perf_ctx_reg_sysrouter|Space allocated: numRegs words
7929 *|---------------------------------------------|
7930 *| |
7931 *| LIST_compressed_nv_perf_ctx_reg_PMA |Space allocated: numRegs words
7932 *|---------------------------------------------|
7933 *| PADDING for 256 byte alignment |
7934 *|---------------------------------------------|<----256 byte aligned
7935 *| LIST_compressed_nv_perf_fbp_ctx_regs |
7936 *| |Space allocated: numRegs * n words (for n FB units)
7937 *|---------------------------------------------|
7938 *| LIST_compressed_nv_perf_fbprouter_ctx_regs |
7939 *| |Space allocated: numRegs * n words (for n FB units)
7940 *|---------------------------------------------|
7941 *| LIST_compressed_pm_fbpa_ctx_regs |
7942 *| |Space allocated: numRegs * n words (for n FB units)
7943 *|---------------------------------------------|
7944 *| LIST_compressed_pm_rop_ctx_regs |
7945 *|---------------------------------------------|
7946 *| LIST_compressed_pm_ltc_ctx_regs |
7947 *| LTC0 LTS0 |
7948 *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units)
7949 *| LTCn LTS0 |
7950 *| LTC0 LTS1 |
7951 *| LTC1 LTS1 |
7952 *| LTCn LTS1 |
7953 *| LTC0 LTSn |
7954 *| LTC1 LTSn |
7955 *| LTCn LTSn |
7956 *|---------------------------------------------|
7957 *| PADDING for 256 byte alignment |
7958 *|---------------------------------------------|<----256 byte aligned
7959 *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate
7960 *| REG0 TPC1 | all the GPC/TPC register lists
7961 *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned
7962 *| LIST_pm_ctx_reg_TPC REG1 TPC0 |
7963 *| * numTpcs REG1 TPC1 |
7964 *| LIST_pm_ctx_reg_PPC REG1 TPCn |
7965 *| * numPpcs REGn TPC0 |
7966 *| LIST_pm_ctx_reg_GPC REGn TPC1 |
7967 *| List_pm_ctx_reg_uc_GPC REGn TPCn |
7968 *| LIST_nv_perf_ctx_reg_GPC |
7969 *| LIST_nv_perf_gpcrouter_ctx_reg |
7970 *| LIST_nv_perf_ctx_reg_CAU |
7971 *| ---- |--
7972 *| GPC1 . |
7973 *| . |<----
7974 *|---------------------------------------------|
7975 *= =
7976 *| GPCn |
7977 *= =
7978 *|---------------------------------------------|
7979 */
7980
7981static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g)
7982{
7983 u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size;
7984 u32 hwpm_ctxsw_reg_count_max;
7985 u32 map_size;
7986 u32 i, count = 0;
7987 u32 offset = 0;
7988 struct ctxsw_buf_offset_map_entry *map;
7989 u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
7990 u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
7991 u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
7992 u32 num_ltc = g->ops.gr.get_max_ltc_per_fbp(g) * g->gr.num_fbps;
7993
7994 if (hwpm_ctxsw_buffer_size == 0) {
7995 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7996 "no PM Ctxsw buffer memory in context buffer");
7997 return -EINVAL;
7998 }
7999
8000 hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2;
8001 map_size = hwpm_ctxsw_reg_count_max * sizeof(*map);
8002
8003 map = nvgpu_big_zalloc(g, map_size);
8004 if (map == NULL) {
8005 return -ENOMEM;
8006 }
8007
8008 /* Add entries from _LIST_pm_ctx_reg_SYS */
8009 if (add_ctxsw_buffer_map_entries_pmsys(map, &g->gr.ctx_vars.ctxsw_regs.pm_sys,
8010 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
8011 goto cleanup;
8012 }
8013
8014 /* Add entries from _LIST_nv_perf_ctx_reg_SYS */
8015 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys,
8016 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
8017 goto cleanup;
8018 }
8019
8020 /* Add entries from _LIST_nv_perf_sysrouter_ctx_reg*/
8021 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys_router,
8022 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
8023 goto cleanup;
8024 }
8025
8026 /* Add entries from _LIST_nv_perf_pma_ctx_reg*/
8027 if (g->ops.gr.add_ctxsw_reg_perf_pma(map, &g->gr.ctx_vars.ctxsw_regs.perf_pma,
8028 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
8029 goto cleanup;
8030 }
8031
8032 offset = ALIGN(offset, 256);
8033
8034 /* Add entries from _LIST_nv_perf_fbp_ctx_regs */
8035 if (add_ctxsw_buffer_map_entries_subunits(map,
8036 &g->gr.ctx_vars.ctxsw_regs.fbp,
8037 &count, &offset,
8038 hwpm_ctxsw_reg_count_max, 0,
8039 g->gr.num_fbps,
8040 g->ops.gr.get_pmm_per_chiplet_offset(),
8041 ~0)) {
8042 goto cleanup;
8043 }
8044
8045 /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */
8046 if (add_ctxsw_buffer_map_entries_subunits(map,
8047 &g->gr.ctx_vars.ctxsw_regs.fbp_router,
8048 &count, &offset,
8049 hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps,
8050 NV_PERF_PMM_FBP_ROUTER_STRIDE, ~0)) {
8051 goto cleanup;
8052 }
8053
8054 /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */
8055 if (g->ops.gr.add_ctxsw_reg_pm_fbpa(g, map,
8056 &g->gr.ctx_vars.ctxsw_regs.pm_fbpa,
8057 &count, &offset,
8058 hwpm_ctxsw_reg_count_max, 0,
8059 num_fbpas, fbpa_stride, ~0)) {
8060 goto cleanup;
8061 }
8062
8063 /* Add entries from _LIST_nv_pm_rop_ctx_regs */
8064 if (add_ctxsw_buffer_map_entries(map,
8065 &g->gr.ctx_vars.ctxsw_regs.pm_rop,
8066 &count, &offset,
8067 hwpm_ctxsw_reg_count_max, 0, ~0)) {
8068 goto cleanup;
8069 }
8070
8071 /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */
8072 if (add_ctxsw_buffer_map_entries_subunits(map,
8073 &g->gr.ctx_vars.ctxsw_regs.pm_ltc,
8074 &count, &offset,
8075 hwpm_ctxsw_reg_count_max, 0,
8076 num_ltc, ltc_stride, ~0)) {
8077 goto cleanup;
8078 }
8079
8080 offset = ALIGN(offset, 256);
8081
8082 /* Add GPC entries */
8083 if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset,
8084 hwpm_ctxsw_reg_count_max)) {
8085 goto cleanup;
8086 }
8087
8088 if (offset > hwpm_ctxsw_buffer_size) {
8089 nvgpu_err(g, "offset > buffer size");
8090 goto cleanup;
8091 }
8092
8093 sort(map, count, sizeof(*map), map_cmp, NULL);
8094
8095 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map;
8096 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count;
8097
8098 nvgpu_log_info(g, "Reg Addr => HWPM Ctxt switch buffer offset");
8099
8100 for (i = 0; i < count; i++) {
8101 nvgpu_log_info(g, "%08x => %08x", map[i].addr, map[i].offset);
8102 }
8103
8104 return 0;
8105cleanup:
8106 nvgpu_err(g, "Failed to create HWPM buffer offset map");
8107 nvgpu_big_free(g, map);
8108 return -EINVAL;
8109}
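
/*
 * Worked example of the layout above, illustrative sizes only: if the
 * SYS + perf-SYS + sysrouter + PMA lists occupy 0x2c4 bytes, the FBP
 * sections start at ALIGN(0x2c4, 256) = 0x300; if the FBP/FBPA/ROP/LTC
 * sections then end at 0x7f8, the first GPC region starts at
 * ALIGN(0x7f8, 256) = 0x800, with each further GPC region rounded up to
 * the next 256-byte boundary by the per-GPC ALIGN() in
 * add_ctxsw_buffer_map_entries_gpcs().
 */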
8110
8111/*
8112 * This function will return the 32 bit offset for a priv register if it is
8113 * present in the PM context buffer.
8114 */
8115static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
8116 u32 addr,
8117 u32 *priv_offset)
8118{
8119 struct gr_gk20a *gr = &g->gr;
8120 int err = 0;
8121 u32 count;
8122 struct ctxsw_buf_offset_map_entry *map, *result, map_key;
8123
8124 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
8125
8126 /* Create map of pri address and pm offset if necessary */
8127 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) {
8128 err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g);
8129 if (err != 0) {
8130 return err;
8131 }
8132 }
8133
8134 *priv_offset = 0;
8135
8136 map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map;
8137 count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count;
8138
8139 map_key.addr = addr;
8140 result = bsearch(&map_key, map, count, sizeof(*map), map_cmp);
8141
8142 if (result) {
8143 *priv_offset = result->offset;
8144 } else {
8145 nvgpu_err(g, "Lookup failed for address 0x%x", addr);
8146 err = -EINVAL;
8147 }
8148 return err;
8149}
8150
8151bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch)
8152{
8153 int curr_gr_ctx;
8154 u32 curr_gr_tsgid;
8155 struct gk20a *g = ch->g;
8156 struct channel_gk20a *curr_ch;
8157 bool ret = false;
8158 struct tsg_gk20a *tsg;
8159
8160 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
8161
8162 /* when contexts are unloaded from GR, the valid bit is reset
8163 * but the instance pointer information remains intact. So the
8164 * valid bit must be checked to be absolutely certain that a
8165 * valid context is currently resident.
8166 */
8167 if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) {
8168 return false;
8169 }
8170
8171 curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx,
8172 &curr_gr_tsgid);
8173
8174 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8175 "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
8176 " ch->chid=%d",
8177 (curr_ch != NULL) ? curr_ch->chid : U32_MAX,
8178 curr_gr_tsgid,
8179 ch->tsgid,
8180 ch->chid);
8181
8182 if (curr_ch == NULL) {
8183 return false;
8184 }
8185
8186 if (ch->chid == curr_ch->chid) {
8187 ret = true;
8188 }
8189
8190 tsg = tsg_gk20a_from_ch(ch);
8191 if ((tsg != NULL) && (tsg->tsgid == curr_gr_tsgid)) {
8192 ret = true;
8193 }
8194
8195 gk20a_channel_put(curr_ch);
8196 return ret;
8197}
8198
8199int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
8200 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
8201 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
8202 bool ch_is_curr_ctx)
8203{
8204 struct gk20a *g = ch->g;
8205 struct tsg_gk20a *tsg;
8206 struct nvgpu_gr_ctx *gr_ctx;
8207 bool gr_ctx_ready = false;
8208 bool pm_ctx_ready = false;
8209 struct nvgpu_mem *current_mem = NULL;
8210 u32 i, j, offset, v;
8211 struct gr_gk20a *gr = &g->gr;
8212 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8213 u32 max_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
8214 sm_per_tpc;
8215 u32 *offsets = NULL;
8216 u32 *offset_addrs = NULL;
8217 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
8218 int err = 0, pass;
8219
8220 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
8221 num_ctx_wr_ops, num_ctx_rd_ops);
8222
8223 tsg = tsg_gk20a_from_ch(ch);
8224 if (tsg == NULL) {
8225 return -EINVAL;
8226 }
8227
8228 gr_ctx = &tsg->gr_ctx;
8229
8230 if (ch_is_curr_ctx) {
8231 for (pass = 0; pass < 2; pass++) {
8232 ctx_op_nr = 0;
8233 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
8234 /* only do ctx ops and only on the right pass */
8235 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
8236 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
8237 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
8238 continue;
8239 }
8240
8241 /* if this is a quad access, setup for special access*/
8242 if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD))
8243 && (g->ops.gr.access_smpc_reg != NULL)) {
8244 g->ops.gr.access_smpc_reg(g,
8245 ctx_ops[i].quad,
8246 ctx_ops[i].offset);
8247 }
8248 offset = ctx_ops[i].offset;
8249
8250 if (pass == 0) { /* write pass */
8251 v = gk20a_readl(g, offset);
8252 v &= ~ctx_ops[i].and_n_mask_lo;
8253 v |= ctx_ops[i].value_lo;
8254 gk20a_writel(g, offset, v);
8255
8256 nvgpu_log(g, gpu_dbg_gpu_dbg,
8257 "direct wr: offset=0x%x v=0x%x",
8258 offset, v);
8259
8260 if (ctx_ops[i].op == REGOP(WRITE_64)) {
8261 v = gk20a_readl(g, offset + 4);
8262 v &= ~ctx_ops[i].and_n_mask_hi;
8263 v |= ctx_ops[i].value_hi;
8264 gk20a_writel(g, offset + 4, v);
8265
8266 nvgpu_log(g, gpu_dbg_gpu_dbg,
8267 "direct wr: offset=0x%x v=0x%x",
8268 offset + 4, v);
8269 }
8270
8271 } else { /* read pass */
8272 ctx_ops[i].value_lo =
8273 gk20a_readl(g, offset);
8274
8275 nvgpu_log(g, gpu_dbg_gpu_dbg,
8276 "direct rd: offset=0x%x v=0x%x",
8277 offset, ctx_ops[i].value_lo);
8278
8279 if (ctx_ops[i].op == REGOP(READ_64)) {
8280 ctx_ops[i].value_hi =
8281 gk20a_readl(g, offset + 4);
8282
8283 nvgpu_log(g, gpu_dbg_gpu_dbg,
8284 "direct rd: offset=0x%x v=0x%x",
8285 offset + 4, ctx_ops[i].value_hi);
8286 } else {
8287 ctx_ops[i].value_hi = 0;
8288 }
8289 }
8290 ctx_op_nr++;
8291 }
8292 }
8293 goto cleanup;
8294 }
8295
8296 /* they're the same size, so just use one alloc for both */
8297 offsets = nvgpu_kzalloc(g, 2 * sizeof(u32) * max_offsets);
8298 if (offsets == NULL) {
8299 err = -ENOMEM;
8300 goto cleanup;
8301 }
8302 offset_addrs = offsets + max_offsets;
8303
8304 err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);
8305 if (err != 0) {
8306 goto cleanup;
8307 }
8308
8309 g->ops.mm.l2_flush(g, true);
8310
8311 /* write to appropriate place in context image,
8312 * first have to figure out where that really is */
8313
8314 /* first pass is writes, second reads */
8315 for (pass = 0; pass < 2; pass++) {
8316 ctx_op_nr = 0;
8317 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
8318 u32 num_offsets;
8319
8320 /* only do ctx ops and only on the right pass */
8321 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
8322 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
8323 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
8324 continue;
8325 }
8326
8327 err = gr_gk20a_get_ctx_buffer_offsets(g,
8328 ctx_ops[i].offset,
8329 max_offsets,
8330 offsets, offset_addrs,
8331 &num_offsets,
8332 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
8333 ctx_ops[i].quad);
8334 if (err == 0) {
8335 if (!gr_ctx_ready) {
8336 gr_ctx_ready = true;
8337 }
8338 current_mem = &gr_ctx->mem;
8339 } else {
8340 err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
8341 ctx_ops[i].offset,
8342 max_offsets,
8343 offsets, offset_addrs,
8344 &num_offsets);
8345 if (err != 0) {
8346 nvgpu_log(g, gpu_dbg_gpu_dbg,
8347 "ctx op invalid offset: offset=0x%x",
8348 ctx_ops[i].offset);
8349 ctx_ops[i].status =
8350 REGOP(STATUS_INVALID_OFFSET);
8351 continue;
8352 }
8353 if (!pm_ctx_ready) {
8354 /* Make sure ctx buffer was initialized */
8355 if (!nvgpu_mem_is_valid(&gr_ctx->pm_ctx.mem)) {
8356 nvgpu_err(g,
8357 "Invalid ctx buffer");
8358 err = -EINVAL;
8359 goto cleanup;
8360 }
8361 pm_ctx_ready = true;
8362 }
8363 current_mem = &gr_ctx->pm_ctx.mem;
8364 }
8365
8366 /* if this is a quad access, setup for special access*/
8367 if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)) &&
8368 (g->ops.gr.access_smpc_reg != NULL)) {
8369 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
8370 ctx_ops[i].offset);
8371 }
8372
8373 for (j = 0; j < num_offsets; j++) {
8374 /* sanity check gr ctxt offsets,
8375 * don't write outside, worst case
8376 */
8377 if ((current_mem == &gr_ctx->mem) &&
8378 (offsets[j] >= g->gr.ctx_vars.golden_image_size)) {
8379 continue;
8380 }
8381 if (pass == 0) { /* write pass */
8382 v = nvgpu_mem_rd(g, current_mem, offsets[j]);
8383 v &= ~ctx_ops[i].and_n_mask_lo;
8384 v |= ctx_ops[i].value_lo;
8385 nvgpu_mem_wr(g, current_mem, offsets[j], v);
8386
8387 nvgpu_log(g, gpu_dbg_gpu_dbg,
8388 "context wr: offset=0x%x v=0x%x",
8389 offsets[j], v);
8390
8391 if (ctx_ops[i].op == REGOP(WRITE_64)) {
8392 v = nvgpu_mem_rd(g, current_mem, offsets[j] + 4);
8393 v &= ~ctx_ops[i].and_n_mask_hi;
8394 v |= ctx_ops[i].value_hi;
8395 nvgpu_mem_wr(g, current_mem, offsets[j] + 4, v);
8396
8397 nvgpu_log(g, gpu_dbg_gpu_dbg,
8398 "context wr: offset=0x%x v=0x%x",
8399 offsets[j] + 4, v);
8400 }
8401
8402 /* check to see if we need to add a special WAR
8403 for some of the SMPC perf regs */
8404 gr_gk20a_ctx_patch_smpc(g, ch, offset_addrs[j],
8405 v, current_mem);
8406
8407 } else { /* read pass */
8408 ctx_ops[i].value_lo =
8409 nvgpu_mem_rd(g, current_mem, offsets[0]);
8410
8411 nvgpu_log(g, gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
8412 offsets[0], ctx_ops[i].value_lo);
8413
8414 if (ctx_ops[i].op == REGOP(READ_64)) {
8415 ctx_ops[i].value_hi =
8416 nvgpu_mem_rd(g, current_mem, offsets[0] + 4);
8417
8418 nvgpu_log(g, gpu_dbg_gpu_dbg,
8419 "context rd: offset=0x%x v=0x%x",
8420 offsets[0] + 4, ctx_ops[i].value_hi);
8421 } else {
8422 ctx_ops[i].value_hi = 0;
8423 }
8424 }
8425 }
8426 ctx_op_nr++;
8427 }
8428 }
8429
8430 cleanup:
8431 if (offsets) {
8432 nvgpu_kfree(g, offsets);
8433 }
8434
8435 if (gr_ctx->patch_ctx.mem.cpu_va) {
8436 gr_gk20a_ctx_patch_write_end(g, gr_ctx, gr_ctx_ready);
8437 }
8438
8439 return err;
8440}
8441
8442int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
8443 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
8444 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
8445 bool *is_curr_ctx)
8446{
8447 struct gk20a *g = ch->g;
8448 int err, tmp_err;
8449 bool ch_is_curr_ctx;
8450
8451 /* disable channel switching.
8452 * at that point the hardware state can be inspected to
8453 * determine if the context we're interested in is current.
8454 */
8455 err = gr_gk20a_disable_ctxsw(g);
8456 if (err != 0) {
8457 nvgpu_err(g, "unable to stop gr ctxsw");
8458 /* this should probably be ctx-fatal... */
8459 return err;
8460 }
8461
8462 ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
8463 if (is_curr_ctx != NULL) {
8464 *is_curr_ctx = ch_is_curr_ctx;
8465 }
8466 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
8467 ch_is_curr_ctx);
8468
8469 err = __gr_gk20a_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
8470 num_ctx_rd_ops, ch_is_curr_ctx);
8471
8472 tmp_err = gr_gk20a_enable_ctxsw(g);
8473 if (tmp_err) {
8474 nvgpu_err(g, "unable to restart ctxsw!");
8475 err = tmp_err;
8476 }
8477
8478 return err;
8479}
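
/*
 * Minimal usage sketch, illustrative only (the helper name is made up and
 * error handling is elided): read a single context register through the
 * interface above.  This assumes a REGOP(READ_32) opcode alongside the
 * READ_64/WRITE_32 ones already used in this file.
 */
static inline int example_read_ctx_reg(struct channel_gk20a *ch, u32 offset,
				       u32 *value)
{
	struct nvgpu_dbg_reg_op op = {
		.op = REGOP(READ_32),
		.type = REGOP(TYPE_GR_CTX),
		.offset = offset,
	};
	int err;

	/* one op total: zero context writes, one context read */
	err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1, NULL);
	if (err == 0) {
		*value = op.value_lo;
	}
	return err;
}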
8480
8481void gr_gk20a_commit_global_pagepool(struct gk20a *g,
8482 struct nvgpu_gr_ctx *gr_ctx,
8483 u64 addr, u32 size, bool patch)
8484{
8485 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_base_r(),
8486 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
8487
8488 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_r(),
8489 gr_scc_pagepool_total_pages_f(size) |
8490 gr_scc_pagepool_valid_true_f(), patch);
8491
8492 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_base_r(),
8493 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
8494
8495 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_r(),
8496 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
8497
8498 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_pagepool_r(),
8499 gr_pd_pagepool_total_pages_f(size) |
8500 gr_pd_pagepool_valid_true_f(), patch);
8501}
8502
8503void gk20a_init_gr(struct gk20a *g)
8504{
8505 nvgpu_cond_init(&g->gr.init_wq);
8506}
8507
8508int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
8509 u32 global_esr_mask, bool check_errors)
8510{
8511 bool locked_down;
8512 bool no_error_pending;
8513 u32 delay = GR_IDLE_CHECK_DEFAULT;
8514 bool mmu_debug_mode_enabled = g->ops.fb.is_debug_mode_enabled(g);
8515 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8516 u32 dbgr_status0 = 0, dbgr_control0 = 0;
8517 u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
8518 struct nvgpu_timeout timeout;
8519 u32 warp_esr;
8520
8521 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
8522 "GPC%d TPC%d SM%d: locking down SM", gpc, tpc, sm);
8523
8524 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
8525 NVGPU_TIMER_CPU_TIMER);
8526
8527 /* wait for the sm to lock down */
8528 do {
8529 u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8530 gpc, tpc, sm);
8531 dbgr_status0 = gk20a_readl(g,
8532 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
8533
8534 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
8535
8536 locked_down =
8537 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
8538 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
8539 no_error_pending =
8540 check_errors &&
8541 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
8542 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
8543 ((global_esr & ~global_esr_mask) == 0);
8544
8545 if (locked_down || no_error_pending) {
8546 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
8547 "GPC%d TPC%d SM%d: locked down SM",
8548 gpc, tpc, sm);
8549 return 0;
8550 }
8551
8552 /* if an mmu fault is pending and mmu debug mode is not
8553 * enabled, the sm will never lock down. */
8554 if (!mmu_debug_mode_enabled &&
8555 (g->ops.mm.mmu_fault_pending(g))) {
8556 nvgpu_err(g,
8557 "GPC%d TPC%d: mmu fault pending,"
8558 " SM%d will never lock down!", gpc, tpc, sm);
8559 return -EFAULT;
8560 }
8561
8562 nvgpu_usleep_range(delay, delay * 2);
8563 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
8564 } while (nvgpu_timeout_expired(&timeout) == 0);
8565
8566 dbgr_control0 = gk20a_readl(g,
8567 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8568
8569 /* 64 bit read */
8570 warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32;
8571 warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset);
8572
8573 /* 64 bit read */
8574 warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32;
8575 warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset);
8576
8577 /* 64 bit read */
8578 warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32;
8579 warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset);
8580
8581 nvgpu_err(g,
8582 "GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc);
8583 nvgpu_err(g,
8584 "STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx",
8585 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0,
8586 warps_valid, warps_paused, warps_trapped);
8587
8588 return -ETIMEDOUT;
8589}
8590
8591void gk20a_gr_suspend_single_sm(struct gk20a *g,
8592 u32 gpc, u32 tpc, u32 sm,
8593 u32 global_esr_mask, bool check_errors)
8594{
8595 int err;
8596 u32 dbgr_control0;
8597 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8598
8599 /* if an SM debugger isn't attached, skip suspend */
8600 if (!g->ops.gr.sm_debugger_attached(g)) {
8601 nvgpu_err(g,
8602 "SM debugger not attached, skipping suspend!");
8603 return;
8604 }
8605
8606 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8607 "suspending gpc:%d, tpc:%d, sm%d", gpc, tpc, sm);
8608
8609 /* assert stop trigger. */
8610 dbgr_control0 = gk20a_readl(g,
8611 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8612 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8613 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset,
8614 dbgr_control0);
8615
8616 err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm,
8617 global_esr_mask, check_errors);
8618 if (err != 0) {
8619 nvgpu_err(g,
8620 "SuspendSm failed");
8621 return;
8622 }
8623}
8624
8625void gk20a_gr_suspend_all_sms(struct gk20a *g,
8626 u32 global_esr_mask, bool check_errors)
8627{
8628 struct gr_gk20a *gr = &g->gr;
8629 u32 gpc, tpc, sm;
8630 int err;
8631 u32 dbgr_control0;
8632 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8633
8634 /* if an SM debugger isn't attached, skip suspend */
8635 if (!g->ops.gr.sm_debugger_attached(g)) {
8636 nvgpu_err(g,
8637 "SM debugger not attached, skipping suspend!");
8638 return;
8639 }
8640
8641 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending all sms");
8642 /* assert stop trigger. uniformity assumption: all SMs will have
8643 * the same state in dbg_control0.
8644 */
8645 dbgr_control0 =
8646 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8647 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8648
8649 /* broadcast write */
8650 gk20a_writel(g,
8651 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8652
8653 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8654 for (tpc = 0; tpc < gr_gk20a_get_tpc_count(gr, gpc); tpc++) {
8655 for (sm = 0; sm < sm_per_tpc; sm++) {
8656 err = g->ops.gr.wait_for_sm_lock_down(g,
8657 gpc, tpc, sm,
8658 global_esr_mask, check_errors);
8659 if (err != 0) {
8660 nvgpu_err(g, "SuspendAllSms failed");
8661 return;
8662 }
8663 }
8664 }
8665 }
8666}
8667
8668void gk20a_gr_resume_single_sm(struct gk20a *g,
8669 u32 gpc, u32 tpc, u32 sm)
8670{
8671 u32 dbgr_control0;
8672 u32 offset;
8673 /*
8674 * The following requires some clarification. Despite the fact that both
8675 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8676 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8677 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8678 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8679 * (_DISABLE) as well.
8680 *
8681 * Advice from the arch group: Disable the stop trigger first, as a
8682 * separate operation, in order to ensure that the trigger has taken
8683 * effect, before enabling the run trigger.
8684 */
8685
8686 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8687
8688 /* De-assert stop trigger */
8689 dbgr_control0 =
8690 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8691 dbgr_control0 = set_field(dbgr_control0,
8692 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(),
8693 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f());
8694 gk20a_writel(g,
8695 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8696
8697 /* Run trigger */
8698 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8699 gk20a_writel(g,
8700 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8701}
8702
8703void gk20a_gr_resume_all_sms(struct gk20a *g)
8704{
8705 u32 dbgr_control0;
8706 /*
8707 * The following requires some clarification. Despite the fact that both
8708 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8709 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8710 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8711 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8712 * (_DISABLE) as well.
8713 *
8714 * Advice from the arch group: Disable the stop trigger first, as a
8715 * separate operation, in order to ensure that the trigger has taken
8716 * effect, before enabling the run trigger.
8717 */
8718
8719 /* De-assert stop trigger */
8720 dbgr_control0 =
8721 gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r());
8722 dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8723 gk20a_writel(g,
8724 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8725
8726 /* Run trigger */
8727 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8728 gk20a_writel(g,
8729 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8730}
8731
8732int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
8733 struct channel_gk20a *ch, u64 sms, bool enable)
8734{
8735 struct nvgpu_dbg_reg_op *ops;
8736 unsigned int i = 0, sm_id;
8737 int err;
8738 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8739 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8740
8741 ops = nvgpu_kcalloc(g, g->gr.no_of_sm, sizeof(*ops));
8742 if (ops == NULL) {
8743 return -ENOMEM;
8744 }
8745 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
8746 int gpc, tpc;
8747 u32 tpc_offset, gpc_offset, reg_offset, reg_mask, reg_val;
8748
8749 if ((sms & BIT64(sm_id)) == 0ULL) {
8750 continue;
8751 }
8752
8753 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8754 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8755
8756 tpc_offset = tpc_in_gpc_stride * tpc;
8757 gpc_offset = gpc_stride * gpc;
8758 reg_offset = tpc_offset + gpc_offset;
8759
8760 ops[i].op = REGOP(WRITE_32);
8761 ops[i].type = REGOP(TYPE_GR_CTX);
8762 ops[i].offset = gr_gpc0_tpc0_sm_dbgr_control0_r() + reg_offset;
8763
8764 reg_mask = 0;
8765 reg_val = 0;
8766 if (enable) {
8767 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8768 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f();
8769 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m();
8770 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f();
8771 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m();
8772 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f();
8773 } else {
8774 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8775 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f();
8776 }
8777
8778 ops[i].and_n_mask_lo = reg_mask;
8779 ops[i].value_lo = reg_val;
8780 i++;
8781 }
8782
8783 err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
8784 if (err != 0) {
8785 nvgpu_err(g, "Failed to access register");
8786 }
8787 nvgpu_kfree(g, ops);
8788 return err;
8789}
8790
8791/*
8792 * gr_gk20a_suspend_context()
8793 * This API should be called with dbg_session lock held
8794 * and ctxsw disabled
8795 * Returns bool value indicating if context was resident
8796 * or not
8797 */
8798bool gr_gk20a_suspend_context(struct channel_gk20a *ch)
8799{
8800 struct gk20a *g = ch->g;
8801 bool ctx_resident = false;
8802
8803 if (gk20a_is_channel_ctx_resident(ch)) {
8804 g->ops.gr.suspend_all_sms(g, 0, false);
8805 ctx_resident = true;
8806 } else {
8807 gk20a_disable_channel_tsg(g, ch);
8808 }
8809
8810 return ctx_resident;
8811}
8812
8813bool gr_gk20a_resume_context(struct channel_gk20a *ch)
8814{
8815 struct gk20a *g = ch->g;
8816 bool ctx_resident = false;
8817
8818 if (gk20a_is_channel_ctx_resident(ch)) {
8819 g->ops.gr.resume_all_sms(g);
8820 ctx_resident = true;
8821 } else {
8822 gk20a_enable_channel_tsg(g, ch);
8823 }
8824
8825 return ctx_resident;
8826}
8827
8828int gr_gk20a_suspend_contexts(struct gk20a *g,
8829 struct dbg_session_gk20a *dbg_s,
8830 int *ctx_resident_ch_fd)
8831{
8832 int local_ctx_resident_ch_fd = -1;
8833 bool ctx_resident;
8834 struct channel_gk20a *ch;
8835 struct dbg_session_channel_data *ch_data;
8836 int err = 0;
8837
8838 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8839
8840 err = gr_gk20a_disable_ctxsw(g);
8841 if (err != 0) {
8842 nvgpu_err(g, "unable to stop gr ctxsw");
8843 goto clean_up;
8844 }
8845
8846 nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
8847
8848 nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
8849 dbg_session_channel_data, ch_entry) {
8850 ch = g->fifo.channel + ch_data->chid;
8851
8852 ctx_resident = gr_gk20a_suspend_context(ch);
8853 if (ctx_resident) {
8854 local_ctx_resident_ch_fd = ch_data->channel_fd;
8855 }
8856 }
8857
8858 nvgpu_mutex_release(&dbg_s->ch_list_lock);
8859
8860 err = gr_gk20a_enable_ctxsw(g);
8861 if (err != 0) {
8862 nvgpu_err(g, "unable to restart ctxsw!");
8863 }
8864
8865 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8866
8867clean_up:
8868 nvgpu_mutex_release(&g->dbg_sessions_lock);
8869
8870 return err;
8871}
8872
8873int gr_gk20a_resume_contexts(struct gk20a *g,
8874 struct dbg_session_gk20a *dbg_s,
8875 int *ctx_resident_ch_fd)
8876{
8877 int local_ctx_resident_ch_fd = -1;
8878 bool ctx_resident;
8879 struct channel_gk20a *ch;
8880 int err = 0;
8881 struct dbg_session_channel_data *ch_data;
8882
8883 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8884
8885 err = gr_gk20a_disable_ctxsw(g);
8886 if (err != 0) {
8887 nvgpu_err(g, "unable to stop gr ctxsw");
8888 goto clean_up;
8889 }
8890
8891 nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
8892 dbg_session_channel_data, ch_entry) {
8893 ch = g->fifo.channel + ch_data->chid;
8894
8895 ctx_resident = gr_gk20a_resume_context(ch);
8896 if (ctx_resident) {
8897 local_ctx_resident_ch_fd = ch_data->channel_fd;
8898 }
8899 }
8900
8901 err = gr_gk20a_enable_ctxsw(g);
8902 if (err != 0) {
8903 nvgpu_err(g, "unable to restart ctxsw!");
8904 }
8905
8906 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8907
8908clean_up:
8909 nvgpu_mutex_release(&g->dbg_sessions_lock);
8910
8911 return err;
8912}
8913
8914int gr_gk20a_trigger_suspend(struct gk20a *g)
8915{
8916 int err = 0;
8917 u32 dbgr_control0;
8918
8919 /* assert stop trigger. uniformity assumption: all SMs will have
8920 * the same state in dbg_control0. */
8921 dbgr_control0 =
8922 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8923 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8924
8925 /* broadcast write */
8926 gk20a_writel(g,
8927 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8928
8929 return err;
8930}
8931
8932int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state)
8933{
8934 int err = 0;
8935 struct gr_gk20a *gr = &g->gr;
8936 u32 gpc, tpc, sm, sm_id;
8937 u32 global_mask;
8938
8939 if (!g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask ||
8940 !g->ops.gr.lock_down_sm || !g->ops.gr.bpt_reg_info)
8941 return -EINVAL;
8942
8943 /* Wait for the SMs to reach full stop. This condition is:
8944 * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE)
8945 * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp
8946 * masks.
8947 */
8948 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
8949
8950 /* Lock down all SMs */
8951 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8952
8953 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8954 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8955 sm = g->gr.sm_to_cluster[sm_id].sm_index;
8956
8957 err = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
8958 global_mask, false);
8959 if (err != 0) {
8960 nvgpu_err(g, "sm did not lock down!");
8961 return err;
8962 }
8963 }
8964
8965 /* Read the warp status */
8966 g->ops.gr.bpt_reg_info(g, w_state);
8967
8968 return 0;
8969}
8970
8971int gr_gk20a_resume_from_pause(struct gk20a *g)
8972{
8973 int err = 0;
8974 u32 reg_val;
8975
8976 /* Clear the pause mask to tell the GPU we want to resume everyone */
8977 gk20a_writel(g,
8978 gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(), 0);
8979
8980 /* explicitly re-enable forwarding of SM interrupts upon any resume */
8981 reg_val = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
8982 reg_val |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
8983 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), reg_val);
8984
8985 /* Now resume all sms, write a 0 to the stop trigger
8986 * then a 1 to the run trigger */
8987 g->ops.gr.resume_all_sms(g);
8988
8989 return err;
8990}
8991
8992int gr_gk20a_clear_sm_errors(struct gk20a *g)
8993{
8994 int ret = 0;
8995 u32 gpc, tpc, sm;
8996 struct gr_gk20a *gr = &g->gr;
8997 u32 global_esr;
8998 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8999
9000 if (!g->ops.gr.get_sm_hww_global_esr || !g->ops.gr.clear_sm_hww)
9001 return -EINVAL;
9002
9003 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
9004
9005 /* check if any tpc has an exception */
9006 for (tpc = 0; tpc < gr->gpc_tpc_count[gpc]; tpc++) {
9007
9008 for (sm = 0; sm < sm_per_tpc; sm++) {
9009 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
9010 gpc, tpc, sm);
9011
9012 /* clearing hwws, also causes tpc and gpc
9013 * exceptions to be cleared
9014 */
9015 g->ops.gr.clear_sm_hww(g,
9016 gpc, tpc, sm, global_esr);
9017 }
9018 }
9019 }
9020
9021 return ret;
9022}
9023
9024u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g)
9025{
9026 struct gr_gk20a *gr = &g->gr;
9027 u32 sm_id, tpc_exception_en = 0;
9028 u32 offset, regval, tpc_offset, gpc_offset;
9029 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
9030 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
9031
9032 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
9033
9034 tpc_offset = tpc_in_gpc_stride * g->gr.sm_to_cluster[sm_id].tpc_index;
9035 gpc_offset = gpc_stride * g->gr.sm_to_cluster[sm_id].gpc_index;
9036 offset = tpc_offset + gpc_offset;
9037
9038 regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
9039 offset);
9040 /* Each bit represents corresponding enablement state, bit 0 corresponds to SM0 */
9041 tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id;
9042 }
9043
9044 return tpc_exception_en;
9045}
9046
9047u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
9048{
9049 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
9050 u32 hww_warp_esr = gk20a_readl(g,
9051 gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
9052 return hww_warp_esr;
9053}
9054
9055u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
9056{
9057 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
9058
9059 u32 hww_global_esr = gk20a_readl(g,
9060 gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
9061
9062 return hww_global_esr;
9063}
9064
9065u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g)
9066{
9067 /*
9068 * These three interrupts don't require locking down the SM. They can
9069 * be handled by usermode clients as they aren't fatal. Additionally,
9070 * usermode clients may wish to allow some warps to execute while others
9071 * are at breakpoints, as opposed to fatal errors where all warps should
9072 * halt.
9073 */
9074 u32 global_esr_mask =
9075 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
9076 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
9077 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
9078
9079 return global_esr_mask;
9080}
9081
9082/* invalidate channel lookup tlb */
9083void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr)
9084{
9085 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
9086 memset(gr->chid_tlb, 0,
9087 sizeof(struct gr_channel_map_tlb_entry) *
9088 GR_CHANNEL_MAP_TLB_SIZE);
9089 nvgpu_spinlock_release(&gr->ch_tlb_lock);
9090}