From 37155b65f1dd6039bdef92f513d86640956bc12c Mon Sep 17 00:00:00 2001 From: Peter Daifuku Date: Wed, 9 Mar 2016 19:10:20 -0800 Subject: gpu: nvgpu: support for hwpm context switching Add support for hwpm context switching Bug 1648200 Change-Id: I482899bf165cd2ef24bb8617be16df01218e462f Signed-off-by: Peter Daifuku Reviewed-on: http://git-master/r/1120450 Reviewed-by: Terje Bergstrom Tested-by: Terje Bergstrom --- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 1 + drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c | 47 +- drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c | 65 ++- drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h | 14 +- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 692 ++++++++++++++++++++++++-- drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 23 +- drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h | 4 + drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h | 10 +- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 7 +- drivers/gpu/nvgpu/gk20a/regops_gk20a.c | 27 +- drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h | 10 +- 11 files changed, 836 insertions(+), 64 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 29c39160..d8951b94 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -54,6 +54,7 @@ struct channel_ctx_gk20a { struct gr_ctx_desc *gr_ctx; struct patch_desc patch_ctx; struct zcull_ctx_desc zcull_ctx; + struct pm_ctx_desc pm_ctx; u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA]; u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA]; bool global_ctx_buffer_mapped; diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c index 1ee0189b..d087d89e 100644 --- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c @@ -457,6 +457,9 @@ static int nvgpu_ioctl_powergate_gk20a(struct dbg_session_gk20a *dbg_s, static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, struct nvgpu_dbg_gpu_smpc_ctxsw_mode_args *args); +static int nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(struct dbg_session_gk20a *dbg_s, + struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args *args); + static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm( struct dbg_session_gk20a *dbg_s, struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args); @@ -582,6 +585,11 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, (struct nvgpu_dbg_gpu_smpc_ctxsw_mode_args *)buf); break; + case NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE: + err = nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(dbg_s, + (struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args *)buf); + break; + case NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS: err = nvgpu_dbg_gpu_ioctl_suspend_resume_sm(dbg_s, (struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf); @@ -880,7 +888,7 @@ static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, ch_gk20a = dbg_s->ch; if (!ch_gk20a) { - gk20a_err(dev_from_gk20a(dbg_s->g), + gk20a_err(dev_from_gk20a(g), "no bound channel for smpc ctxsw mode update\n"); err = -EINVAL; goto clean_up; @@ -889,13 +897,48 @@ static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, err = gr_gk20a_update_smpc_ctxsw_mode(g, ch_gk20a, args->mode == NVGPU_DBG_GPU_SMPC_CTXSW_MODE_CTXSW); if (err) { - gk20a_err(dev_from_gk20a(dbg_s->g), + gk20a_err(dev_from_gk20a(g), "error (%d) during smpc ctxsw mode update\n", err); goto clean_up; } err = g->ops.regops.apply_smpc_war(dbg_s); + clean_up: + mutex_unlock(&g->dbg_sessions_lock); + return err; +} + +static int nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(struct dbg_session_gk20a 
*dbg_s, + struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args *args) +{ + int err; + struct gk20a *g = get_gk20a(dbg_s->pdev); + struct channel_gk20a *ch_gk20a; + gk20a_dbg_fn("%s pm ctxsw mode = %d", + dev_name(dbg_s->dev), args->mode); + + /* Take the global lock, since we'll be doing global regops */ + mutex_lock(&g->dbg_sessions_lock); + + ch_gk20a = dbg_s->ch; + + if (!ch_gk20a) { + gk20a_err(dev_from_gk20a(g), + "no bound channel for pm ctxsw mode update\n"); + err = -EINVAL; + goto clean_up; + } + + err = gr_gk20a_update_hwpm_ctxsw_mode(g, ch_gk20a, + args->mode == NVGPU_DBG_GPU_HWPM_CTXSW_MODE_CTXSW); + if (err) + gk20a_err(dev_from_gk20a(g), + "error (%d) during pm ctxsw mode update\n", err); + + /* gk20a would require a WAR to set the core PM_ENABLE bit, not + * added here with gk20a being deprecated + */ clean_up: mutex_unlock(&g->dbg_sessions_lock); return err; diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c index 94dba7b6..64d6542b 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c @@ -3,7 +3,7 @@ * * GK20A Graphics Context * - * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -281,7 +281,60 @@ static int gr_gk20a_init_ctx_vars_fw(struct gk20a *g, struct gr_gk20a *gr) netlist_num); break; case NETLIST_REGIONID_CTXREG_PMPPC: - gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMPPC skipped"); + gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMPPC"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ppc); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_NVPERF_CTXREG_SYS: + gk20a_dbg_info("NETLIST_REGIONID_NVPERF_CTXREG_SYS"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.perf_sys); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_NVPERF_FBP_CTXREGS: + gk20a_dbg_info("NETLIST_REGIONID_NVPERF_FBP_CTXREGS"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.fbp); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_NVPERF_CTXREG_GPC: + gk20a_dbg_info("NETLIST_REGIONID_NVPERF_CTXREG_GPC"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.perf_gpc); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_NVPERF_FBP_ROUTER: + gk20a_dbg_info("NETLIST_REGIONID_NVPERF_FBP_ROUTER"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.fbp_router); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_NVPERF_GPC_ROUTER: + gk20a_dbg_info("NETLIST_REGIONID_NVPERF_GPC_ROUTER"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.gpc_router); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_CTXREG_PMLTC: + gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMLTC"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ltc); + if (err) + goto clean_up; + break; + case NETLIST_REGIONID_CTXREG_PMFBPA: + gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMFBPA"); + err = gr_gk20a_alloc_load_netlist_aiv( + src, size, &g->gr.ctx_vars.ctxsw_regs.pm_fbpa); + if (err) + goto clean_up; break; default: gk20a_dbg_info("unrecognized region %d skipped", i); @@ -319,6 +372,14 @@ clean_up: kfree(g->gr.ctx_vars.ctxsw_regs.pm_sys.l); 
kfree(g->gr.ctx_vars.ctxsw_regs.pm_gpc.l); kfree(g->gr.ctx_vars.ctxsw_regs.pm_tpc.l); + kfree(g->gr.ctx_vars.ctxsw_regs.pm_ppc.l); + kfree(g->gr.ctx_vars.ctxsw_regs.perf_sys.l); + kfree(g->gr.ctx_vars.ctxsw_regs.fbp.l); + kfree(g->gr.ctx_vars.ctxsw_regs.perf_gpc.l); + kfree(g->gr.ctx_vars.ctxsw_regs.fbp_router.l); + kfree(g->gr.ctx_vars.ctxsw_regs.gpc_router.l); + kfree(g->gr.ctx_vars.ctxsw_regs.pm_ltc.l); + kfree(g->gr.ctx_vars.ctxsw_regs.pm_fbpa.l); release_firmware(netlist_fw); err = -ENOENT; } diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h index 6844ee69..d413942a 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h @@ -1,7 +1,7 @@ /* * GK20A Graphics Context * - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -75,6 +75,13 @@ union __max_name { #define NETLIST_REGIONID_NETLIST_NUM 18 #define NETLIST_REGIONID_CTXREG_PPC 19 #define NETLIST_REGIONID_CTXREG_PMPPC 20 +#define NETLIST_REGIONID_NVPERF_CTXREG_SYS 21 +#define NETLIST_REGIONID_NVPERF_FBP_CTXREGS 22 +#define NETLIST_REGIONID_NVPERF_CTXREG_GPC 23 +#define NETLIST_REGIONID_NVPERF_FBP_ROUTER 24 +#define NETLIST_REGIONID_NVPERF_GPC_ROUTER 25 +#define NETLIST_REGIONID_CTXREG_PMLTC 26 +#define NETLIST_REGIONID_CTXREG_PMFBPA 27 struct netlist_region { u32 region_id; @@ -114,6 +121,11 @@ struct u32_list_gk20a { u32 count; }; +struct ctxsw_buf_offset_map_entry { + u32 addr; /* Register address */ + u32 offset; /* Offset in ctxt switch buffer */ +}; + static inline struct av_gk20a *alloc_av_list_gk20a(struct av_list_gk20a *avl) { diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 116fd88f..a8addc7b 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include "gk20a.h" @@ -59,6 +61,10 @@ #include "ctxsw_trace_gk20a.h" #define BLK_SIZE (256) +#define NV_PMM_FBP_STRIDE 0x1000 +#define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200 +#define NV_PERF_PMMGPC_CHIPLET_OFFSET 0x1000 +#define NV_PERF_PMMGPCROUTER_STRIDE 0x0200 static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g); static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va); @@ -1591,9 +1597,17 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, u32 data; int ret; + gk20a_dbg_fn(""); + + if (!ch_ctx->gr_ctx) { + gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); + return -EFAULT; + } + c->g->ops.fifo.disable_channel(c); ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); if (ret) { + c->g->ops.fifo.enable_channel(c); gk20a_err(dev_from_gk20a(g), "failed to preempt channel\n"); return ret; @@ -1603,11 +1617,18 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); + if (!ch_ctx->gr_ctx) { + gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); + return -EFAULT; + } + ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (!ctx_ptr) { + c->g->ops.fifo.enable_channel(c); return -ENOMEM; + } data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); @@ -1620,11 +1641,135 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, vunmap(ctx_ptr); /* enable channel */ - gk20a_writel(c->g, ccsr_channel_r(c->hw_chid), - gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) | - ccsr_channel_enable_set_true_f()); + c->g->ops.fifo.enable_channel(c); + + return 0; +} + +int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, + struct channel_gk20a *c, + bool enable_hwpm_ctxsw) +{ + struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; + void *ctx_ptr = NULL; + void *pm_ctx_ptr; + u32 data, virt_addr; + int ret; + + gk20a_dbg_fn(""); + + if (!ch_ctx->gr_ctx) { + gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); + return -EFAULT; + } + + if (enable_hwpm_ctxsw) { + if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) + return 0; + } else { + if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) + return 0; + } + + c->g->ops.fifo.disable_channel(c); + ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); + if (ret) { + c->g->ops.fifo.enable_channel(c); + gk20a_err(dev_from_gk20a(g), + "failed to preempt channel\n"); + return ret; + } + + /* Channel gr_ctx buffer is gpu cacheable. + Flush and invalidate before cpu update. */ + g->ops.mm.l2_flush(g, true); + + if (enable_hwpm_ctxsw) { + /* Allocate buffer if necessary */ + if (pm_ctx->mem.gpu_va == 0) { + ret = gk20a_gmmu_alloc_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, + g->gr.ctx_vars.pm_ctxsw_image_size, + &pm_ctx->mem); + if (ret) { + c->g->ops.fifo.enable_channel(c); + gk20a_err(dev_from_gk20a(g), + "failed to allocate pm ctxt buffer"); + return ret; + } + + pm_ctx->mem.gpu_va = gk20a_gmmu_map(c->vm, + &pm_ctx->mem.sgt, + pm_ctx->mem.size, + NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, + gk20a_mem_flag_none, true); + if (!pm_ctx->mem.gpu_va) { + gk20a_err(dev_from_gk20a(g), + "failed to map pm ctxt buffer"); + gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, + &pm_ctx->mem); + c->g->ops.fifo.enable_channel(c); + return -ENOMEM; + } + } + + /* Now clear the buffer */ + pm_ctx_ptr = vmap(pm_ctx->mem.pages, + PAGE_ALIGN(pm_ctx->mem.size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + + if (!pm_ctx_ptr) { + ret = -ENOMEM; + goto cleanup_pm_buf; + } + + memset(pm_ctx_ptr, 0, pm_ctx->mem.size); + + vunmap(pm_ctx_ptr); + } + + ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, + PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + if (!ctx_ptr) { + ret = -ENOMEM; + goto cleanup_pm_buf; + } + + data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = data & ~ctxsw_prog_main_image_pm_mode_m(); + + if (enable_hwpm_ctxsw) { + pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f(); + + /* pack upper 32 bits of virtual address into a 32 bit number + * (256 byte boundary) + */ + virt_addr = (u32)(pm_ctx->mem.gpu_va >> 8); + } else { + pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); + virt_addr = 0; + } + + data |= pm_ctx->pm_mode; + + gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, 
data); + gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + + vunmap(ctx_ptr); + + /* enable channel */ + c->g->ops.fifo.enable_channel(c); return 0; +cleanup_pm_buf: + gk20a_gmmu_unmap(c->vm, pm_ctx->mem.gpu_va, pm_ctx->mem.size, + gk20a_mem_flag_none); + gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &pm_ctx->mem); + memset(&pm_ctx->mem, 0, sizeof(struct mem_desc)); + + c->g->ops.fifo.enable_channel(c); + return ret; } /* load saved fresh copy of gloden image into channel gr_ctx */ @@ -1635,6 +1780,7 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; u32 virt_addr_lo; u32 virt_addr_hi; + u32 virt_addr = 0; u32 i, v, data; int ret = 0; void *ctx_ptr = NULL; @@ -1663,15 +1809,6 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); - /* no user for client managed performance counter ctx */ - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); - data = data & ~ctxsw_prog_main_image_pm_mode_m(); - data |= ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, - data); - - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0); - /* set priv access map */ virt_addr_lo = u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); @@ -1708,6 +1845,32 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, virt_addr_hi); + /* Update main header region of the context buffer with the info needed + * for PM context switching, including mode and possibly a pointer to + * the PM backing store. + */ + if (ch_ctx->pm_ctx.pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) { + if (ch_ctx->pm_ctx.mem.gpu_va == 0) { + gk20a_err(dev_from_gk20a(g), + "context switched pm with no pm buffer!"); + vunmap(ctx_ptr); + return -EFAULT; + } + + /* pack upper 32 bits of virtual address into a 32 bit number + * (256 byte boundary) + */ + virt_addr = (u32)(ch_ctx->pm_ctx.mem.gpu_va >> 8); + } else + virt_addr = 0; + + data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = data & ~ctxsw_prog_main_image_pm_mode_m(); + data |= ch_ctx->pm_ctx.pm_mode; + + gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); + gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + vunmap(ctx_ptr); if (tegra_platform_is_linsim()) { @@ -2205,7 +2368,6 @@ static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g) int gr_gk20a_init_ctx_state(struct gk20a *g) { - u32 pm_ctx_image_size; u32 ret; struct fecs_method_op_gk20a op = { .mailbox = { .id = 0, .data = 0, @@ -2237,7 +2399,7 @@ int gr_gk20a_init_ctx_state(struct gk20a *g) } op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v(); - op.mailbox.ret = &pm_ctx_image_size; + op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size; ret = gr_gk20a_submit_fecs_method_op(g, op, false); if (ret) { gk20a_err(dev_from_gk20a(g), @@ -2641,14 +2803,30 @@ static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c) patch_ctx->data_count = 0; } +static void gr_gk20a_free_channel_pm_ctx(struct channel_gk20a *c) +{ + struct pm_ctx_desc *pm_ctx = &c->ch_ctx.pm_ctx; + struct gk20a *g = c->g; + + gk20a_dbg_fn(""); + + if (pm_ctx->mem.gpu_va) { + gk20a_gmmu_unmap(c->vm, pm_ctx->mem.gpu_va, + pm_ctx->mem.size, gk20a_mem_flag_none); + + gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, 
&pm_ctx->mem); + } +} + void gk20a_free_channel_ctx(struct channel_gk20a *c) { gr_gk20a_unmap_global_ctx_buffers(c); gr_gk20a_free_channel_patch_ctx(c); + gr_gk20a_free_channel_pm_ctx(c); if (!gk20a_is_channel_marked_as_tsg(c)) gr_gk20a_free_channel_gr_ctx(c); - /* zcull_ctx, pm_ctx */ + /* zcull_ctx */ memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a)); @@ -2743,6 +2921,9 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, ch_ctx->gr_ctx = tsg->tsg_gr_ctx; } + /* PM ctxt switch is off by default */ + ch_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); + /* commit gr ctx buffer */ err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); if (err) { @@ -2983,6 +3164,10 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr) kfree(gr->ctx_vars.local_golden_image); gr->ctx_vars.local_golden_image = NULL; + if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) + nvgpu_free(gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); + gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL; + gk20a_comptag_allocator_destroy(&gr->comp_tags); } @@ -5828,6 +6013,10 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 context_buffer_size, u32 *priv_offset); +static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g, + u32 addr, + u32 *priv_offset); + /* This function will decode a priv address and return the partition type and numbers. */ static int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr, int *addr_type, /* enum ctxsw_addr_type */ @@ -6056,14 +6245,81 @@ int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, offset_addrs[i] = priv_registers[i]; } - *num_offsets = num_registers; + *num_offsets = num_registers; +cleanup: + if (!IS_ERR_OR_NULL(priv_registers)) + kfree(priv_registers); - cleanup: + return err; +} + +int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g, + u32 addr, + u32 max_offsets, + u32 *offsets, u32 *offset_addrs, + u32 *num_offsets) +{ + u32 i; + u32 priv_offset = 0; + u32 *priv_registers; + u32 num_registers = 0; + int err = 0; + struct gr_gk20a *gr = &g->gr; + u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count; + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); + + /* implementation is crossed-up if either of these happen */ + if (max_offsets > potential_offsets) + return -EINVAL; + + if (!g->gr.ctx_vars.golden_image_initialized) + return -ENODEV; + + priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(priv_registers)) { + gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets); + return -ENOMEM; + } + memset(offsets, 0, sizeof(u32) * max_offsets); + memset(offset_addrs, 0, sizeof(u32) * max_offsets); + *num_offsets = 0; + + gr_gk20a_create_priv_addr_table(g, addr, priv_registers, &num_registers); + + if ((max_offsets > 1) && (num_registers > max_offsets)) { + err = -EINVAL; + goto cleanup; + } - if (!IS_ERR_OR_NULL(priv_registers)) - kfree(priv_registers); + if ((max_offsets == 1) && (num_registers > 1)) + num_registers = 1; + + if (!g->gr.ctx_vars.local_golden_image) { + gk20a_dbg_fn("no context switch header info to work with"); + err = -EINVAL; + goto cleanup; + } - return err; + for (i = 0; i < num_registers; i++) { + err = gr_gk20a_find_priv_offset_in_pm_buffer(g, + priv_registers[i], + &priv_offset); + if (err) { + gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x", + addr); /*, grPriRegStr(addr)));*/ + goto cleanup; + } + + offsets[i] = priv_offset; + offset_addrs[i] = priv_registers[i]; + } + + *num_offsets = num_registers; +cleanup: + 
kfree(priv_registers); + + return err; } /* Setup some register tables. This looks hacky; our @@ -6638,8 +6894,6 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, return 0; } - - /* * This function will return the 32 bit offset for a priv register if it is * present in the context buffer. @@ -6801,6 +7055,314 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, return -EINVAL; } +static int map_cmp(const void *a, const void *b) +{ + struct ctxsw_buf_offset_map_entry *e1 = + (struct ctxsw_buf_offset_map_entry *)a; + struct ctxsw_buf_offset_map_entry *e2 = + (struct ctxsw_buf_offset_map_entry *)b; + + if (e1->addr < e2->addr) + return -1; + + if (e1->addr > e2->addr) + return 1; + return 0; +} + +static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map, + struct aiv_list_gk20a *regs, + u32 *count, u32 *offset, + u32 max_cnt, u32 base, u32 mask) +{ + u32 idx; + u32 cnt = *count; + u32 off = *offset; + + if ((cnt + regs->count) > max_cnt) + return -EINVAL; + + for (idx = 0; idx < regs->count; idx++) { + map[cnt].addr = base + (regs->l[idx].addr & mask); + map[cnt++].offset = off; + off += 4; + } + *count = cnt; + *offset = off; + return 0; +} + +/* Helper function to add register entries to the register map for all + * subunits + */ +static int add_ctxsw_buffer_map_entries_subunits( + struct ctxsw_buf_offset_map_entry *map, + struct aiv_list_gk20a *regs, + u32 *count, u32 *offset, + u32 max_cnt, u32 base, + u32 num_units, u32 stride, u32 mask) +{ + u32 unit; + u32 idx; + u32 cnt = *count; + u32 off = *offset; + + if ((cnt + (regs->count * num_units)) > max_cnt) + return -EINVAL; + + /* Data is interleaved for units in ctxsw buffer */ + for (idx = 0; idx < regs->count; idx++) { + for (unit = 0; unit < num_units; unit++) { + map[cnt].addr = base + (regs->l[idx].addr & mask) + + (unit * stride); + map[cnt++].offset = off; + off += 4; + } + } + *count = cnt; + *offset = off; + return 0; +} + +static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g, + struct ctxsw_buf_offset_map_entry *map, + u32 *count, u32 *offset, u32 max_cnt) +{ + u32 num_gpcs = g->gr.gpc_count; + u32 num_ppcs, num_tpcs, gpc_num, base; + + for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) { + num_tpcs = g->gr.gpc_tpc_count[gpc_num]; + base = proj_gpc_base_v() + + (proj_gpc_stride_v() * gpc_num) + proj_tpc_in_gpc_base_v(); + if (add_ctxsw_buffer_map_entries_subunits(map, + &g->gr.ctx_vars.ctxsw_regs.pm_tpc, + count, offset, max_cnt, base, num_tpcs, + proj_tpc_in_gpc_stride_v(), + (proj_tpc_in_gpc_stride_v() - 1))) + return -EINVAL; + + num_ppcs = g->gr.gpc_ppc_count[gpc_num]; + base = proj_gpc_base_v() + (proj_gpc_stride_v() * gpc_num) + + proj_ppc_in_gpc_base_v(); + if (add_ctxsw_buffer_map_entries_subunits(map, + &g->gr.ctx_vars.ctxsw_regs.pm_ppc, + count, offset, max_cnt, base, num_ppcs, + proj_ppc_in_gpc_stride_v(), + (proj_ppc_in_gpc_stride_v() - 1))) + return -EINVAL; + + base = proj_gpc_base_v() + (proj_gpc_stride_v() * gpc_num); + if (add_ctxsw_buffer_map_entries(map, + &g->gr.ctx_vars.ctxsw_regs.pm_gpc, + count, offset, max_cnt, base, + (proj_gpc_stride_v() - 1))) + return -EINVAL; + + base = (NV_PERF_PMMGPC_CHIPLET_OFFSET * gpc_num); + if (add_ctxsw_buffer_map_entries(map, + &g->gr.ctx_vars.ctxsw_regs.perf_gpc, + count, offset, max_cnt, base, ~0)) + return -EINVAL; + + base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num); + if (add_ctxsw_buffer_map_entries(map, + &g->gr.ctx_vars.ctxsw_regs.gpc_router, + count, offset, max_cnt, base, ~0)) + return -EINVAL; + + *offset 
= ALIGN(*offset, 256); + } + return 0; +} + +/* + * PM CTXSW BUFFER LAYOUT : + *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE + *| | + *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words + *|---------------------------------------------| + *| | + *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words + *|---------------------------------------------| + *| PADDING for 256 byte alignment | + *|---------------------------------------------|<----256 byte aligned + *| LIST_compressed_nv_perf_fbp_ctx_regs | + *| |Space allocated: numRegs * n words (for n FB units) + *|---------------------------------------------| + *| LIST_compressed_nv_perf_fbprouter_ctx_regs | + *| |Space allocated: numRegs * n words (for n FB units) + *|---------------------------------------------| + *| LIST_compressed_pm_fbpa_ctx_regs | + *| |Space allocated: numRegs * n words (for n FB units) + *|---------------------------------------------| + *| LIST_compressed_pm_ltc_ctx_regs | + *| LTC0 LTS0 | + *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units) + *| LTCn LTS0 | + *| LTC0 LTS1 | + *| LTC1 LTS1 | + *| LTCn LTS1 | + *| LTC0 LTSn | + *| LTC1 LTSn | + *| LTCn LTSn | + *|---------------------------------------------| + *| PADDING for 256 byte alignment | + *|---------------------------------------------|<----256 byte aligned + *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate + *| REG0 TPC1 | all the GPC/TPC register lists + *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned + *| LIST_pm_ctx_reg_TPC REG1 TPC0 | + *| * numTpcs REG1 TPC1 | + *| LIST_pm_ctx_reg_PPC REG1 TPCn | + *| * numPpcs REGn TPC0 | + *| LIST_pm_ctx_reg_GPC REGn TPC1 | + *| LIST_nv_perf_ctx_reg_GPC REGn TPCn | + *| ---- |-- + *| GPC1 . | + *| . 
|<---- + *|---------------------------------------------| + *= = + *| GPCn | + *= = + *|---------------------------------------------| + */ + +static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) +{ + u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size; + u32 hwpm_ctxsw_reg_count_max; + u32 map_size; + u32 i, count = 0; + u32 offset = 0; + struct ctxsw_buf_offset_map_entry *map; + + if (hwpm_ctxsw_buffer_size == 0) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, + "no PM Ctxsw buffer memory in context buffer"); + return -EINVAL; + } + + hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2; + map_size = hwpm_ctxsw_reg_count_max * sizeof(*map); + + map = nvgpu_alloc(map_size, true); + if (!map) + return -ENOMEM; + + /* Add entries from _LIST_pm_ctx_reg_SYS */ + if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.pm_sys, + &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) + goto cleanup; + + /* Add entries from _LIST_nv_perf_ctx_reg_SYS */ + if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys, + &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) + goto cleanup; + + offset = ALIGN(offset, 256); + + /* Add entries from _LIST_nv_perf_fbp_ctx_regs */ + if (add_ctxsw_buffer_map_entries_subunits(map, + &g->gr.ctx_vars.ctxsw_regs.fbp, + &count, &offset, + hwpm_ctxsw_reg_count_max, 0, + g->gr.num_fbps, NV_PMM_FBP_STRIDE, ~0)) + goto cleanup; + + /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */ + if (add_ctxsw_buffer_map_entries_subunits(map, + &g->gr.ctx_vars.ctxsw_regs.fbp_router, + &count, &offset, + hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps, + NV_PERF_PMM_FBP_ROUTER_STRIDE, ~0)) + goto cleanup; + + /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */ + if (add_ctxsw_buffer_map_entries_subunits(map, + &g->gr.ctx_vars.ctxsw_regs.pm_fbpa, + &count, &offset, + hwpm_ctxsw_reg_count_max, 0, + proj_scal_litter_num_fbpas_v(), + proj_fbpa_stride_v(), ~0)) + goto cleanup; + + /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */ + if (add_ctxsw_buffer_map_entries_subunits(map, + &g->gr.ctx_vars.ctxsw_regs.pm_ltc, + &count, &offset, + hwpm_ctxsw_reg_count_max, 0, + g->ltc_count, proj_ltc_stride_v(), ~0)) + goto cleanup; + + offset = ALIGN(offset, 256); + + /* Add GPC entries */ + if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset, + hwpm_ctxsw_reg_count_max)) + goto cleanup; + + if (offset > hwpm_ctxsw_buffer_size) { + gk20a_err(dev_from_gk20a(g), "offset > buffer size"); + goto cleanup; + } + + sort(map, count, sizeof(*map), map_cmp, NULL); + + g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map; + g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count; + + gk20a_dbg_info("Reg Addr => HWPM Ctxt switch buffer offset"); + + for (i = 0; i < count; i++) + gk20a_dbg_info("%08x => %08x", map[i].addr, map[i].offset); + + return 0; +cleanup: + gk20a_err(dev_from_gk20a(g), "Failed to create HWPM buffer offset map"); + nvgpu_free(map); + return -EINVAL; +} + +/* + * This function will return the 32 bit offset for a priv register if it is + * present in the PM context buffer. 
+ */ +static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g, + u32 addr, + u32 *priv_offset) +{ + struct gr_gk20a *gr = &g->gr; + int err = 0; + u32 count; + struct ctxsw_buf_offset_map_entry *map, *result, map_key; + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); + + /* Create map of pri address and pm offset if necessary */ + if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) { + err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g); + if (err) + return err; + } + + *priv_offset = 0; + + map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map; + count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count; + + map_key.addr = addr; + result = bsearch(&map_key, map, count, sizeof(*map), map_cmp); + + if (result) + *priv_offset = result->offset; + else { + gk20a_err(dev_from_gk20a(g), "Lookup failed for address 0x%x", addr); + err = -EINVAL; + } + return err; +} + bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) { int curr_gr_ctx, curr_gr_tsgid; @@ -6840,6 +7402,8 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, struct gk20a *g = ch->g; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; void *ctx_ptr = NULL; + void *pm_ctx_ptr = NULL; + void *base_ptr = NULL; bool ch_is_curr_ctx, restart_gr_ctxsw = false; u32 i, j, offset, v; struct gr_gk20a *gr = &g->gr; @@ -6940,15 +7504,6 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, } offset_addrs = offsets + max_offsets; - /* would have been a variant of gr_gk20a_apply_instmem_overrides */ - /* recoded in-place instead.*/ - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { - err = -ENOMEM; - goto cleanup; - } err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); if (err) goto cleanup; @@ -6977,13 +7532,52 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, &num_offsets, ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), ctx_ops[i].quad); - if (err) { - gk20a_dbg(gpu_dbg_gpu_dbg, + if (!err) { + if (!ctx_ptr) { + /* would have been a variant of + * gr_gk20a_apply_instmem_overrides, + * recoded in-place instead. 
+ */ + ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, + PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + if (!ctx_ptr) { + err = -ENOMEM; + goto cleanup; + } + } + base_ptr = ctx_ptr; + } else { + err = gr_gk20a_get_pm_ctx_buffer_offsets(g, + ctx_ops[i].offset, + max_offsets, + offsets, offset_addrs, + &num_offsets); + if (err) { + gk20a_dbg(gpu_dbg_gpu_dbg, "ctx op invalid offset: offset=0x%x", ctx_ops[i].offset); - ctx_ops[i].status = - NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; - continue; + ctx_ops[i].status = + NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; + continue; + } + if (!pm_ctx_ptr) { + /* Make sure ctx buffer was initialized */ + if (!ch_ctx->pm_ctx.mem.pages) { + gk20a_err(dev_from_gk20a(g), + "Invalid ctx buffer"); + err = -EINVAL; + goto cleanup; + } + pm_ctx_ptr = vmap(ch_ctx->pm_ctx.mem.pages, + PAGE_ALIGN(ch_ctx->pm_ctx.mem.size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + if (!pm_ctx_ptr) { + err = -ENOMEM; + goto cleanup; + } + } + base_ptr = pm_ctx_ptr; } /* if this is a quad access, setup for special access*/ @@ -6993,24 +7587,27 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, ctx_ops[i].offset); for (j = 0; j < num_offsets; j++) { - /* sanity check, don't write outside, worst case */ - if (offsets[j] >= g->gr.ctx_vars.golden_image_size) + /* sanity check gr ctxt offsets, + * don't write outside, worst case + */ + if ((base_ptr == ctx_ptr) && + (offsets[j] >= g->gr.ctx_vars.golden_image_size)) continue; if (pass == 0) { /* write pass */ - v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0); + v = gk20a_mem_rd32(base_ptr + offsets[j], 0); v &= ~ctx_ops[i].and_n_mask_lo; v |= ctx_ops[i].value_lo; - gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v); + gk20a_mem_wr32(base_ptr + offsets[j], 0, v); gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", offsets[j], v); if (ctx_ops[i].op == REGOP(WRITE_64)) { - v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0); + v = gk20a_mem_rd32(base_ptr + offsets[j] + 4, 0); v &= ~ctx_ops[i].and_n_mask_hi; v |= ctx_ops[i].value_hi; - gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v); + gk20a_mem_wr32(base_ptr + offsets[j] + 4, 0, v); gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", @@ -7020,18 +7617,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* check to see if we need to add a special WAR for some of the SMPC perf regs */ gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], - v, ctx_ptr); + v, base_ptr); } else { /* read pass */ ctx_ops[i].value_lo = - gk20a_mem_rd32(ctx_ptr + offsets[0], 0); + gk20a_mem_rd32(base_ptr + offsets[0], 0); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", offsets[0], ctx_ops[i].value_lo); if (ctx_ops[i].op == REGOP(READ_64)) { ctx_ops[i].value_hi = - gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0); + gk20a_mem_rd32(base_ptr + offsets[0] + 4, 0); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", @@ -7062,6 +7659,9 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, if (ctx_ptr) vunmap(ctx_ptr); + if (pm_ctx_ptr) + vunmap(pm_ctx_ptr); + if (restart_gr_ctxsw) { int tmp_err = gr_gk20a_enable_ctxsw(g); if (tmp_err) { diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 2c575534..c82cf75c 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -198,8 +198,13 @@ struct gr_gk20a { u32 golden_image_size; u32 *local_golden_image; + u32 hwpm_ctxsw_buffer_offset_map_count; + struct ctxsw_buf_offset_map_entry *hwpm_ctxsw_buffer_offset_map; + u32 
zcull_ctxsw_image_size; + u32 pm_ctxsw_image_size; + u32 buffer_header_size; u32 priv_access_map_size; @@ -219,6 +224,14 @@ struct gr_gk20a { struct aiv_list_gk20a pm_sys; struct aiv_list_gk20a pm_gpc; struct aiv_list_gk20a pm_tpc; + struct aiv_list_gk20a pm_ppc; + struct aiv_list_gk20a perf_sys; + struct aiv_list_gk20a perf_gpc; + struct aiv_list_gk20a fbp; + struct aiv_list_gk20a fbp_router; + struct aiv_list_gk20a gpc_router; + struct aiv_list_gk20a pm_ltc; + struct aiv_list_gk20a pm_fbpa; } ctxsw_regs; int regs_base_index; bool valid; @@ -484,9 +497,17 @@ int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, u32 *offsets, u32 *offset_addrs, u32 *num_offsets, bool is_quad, u32 quad); +int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g, + u32 addr, + u32 max_offsets, + u32 *offsets, u32 *offset_addrs, + u32 *num_offsets); int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, - struct channel_gk20a *c, + struct channel_gk20a *c, bool enable_smpc_ctxsw); +int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, + struct channel_gk20a *c, + bool enable_hwpm_ctxsw); struct channel_ctx_gk20a; int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h index da555f7c..08834557 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h @@ -94,6 +94,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void) { return 0x7 << 0; } +static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void) +{ + return 0x1; +} static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void) { return 0x0; diff --git a/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h index 93c55c30..ce10db35 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -70,6 +70,10 @@ static inline u32 proj_lts_stride_v(void) { return 0x00000400; } +static inline u32 proj_fbpa_stride_v(void) +{ + return 0x00001000; +} static inline u32 proj_ppc_in_gpc_base_v(void) { return 0x00003000; @@ -114,6 +118,10 @@ static inline u32 proj_scal_litter_num_fbps_v(void) { return 0x00000001; } +static inline u32 proj_scal_litter_num_fbpas_v(void) +{ + return 0x00000001; +} static inline u32 proj_scal_litter_num_gpcs_v(void) { return 0x00000001; diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 368b32d3..833d896d 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -1,7 +1,7 @@ /* * GK20A memory management * - * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -73,6 +73,11 @@ struct zcull_ctx_desc { u32 ctx_sw_mode; }; +struct pm_ctx_desc { + struct mem_desc mem; + u32 pm_mode; +}; + struct gk20a; struct gr_ctx_buffer_desc { void (*destroy)(struct gk20a *, struct gr_ctx_buffer_desc *); diff --git a/drivers/gpu/nvgpu/gk20a/regops_gk20a.c b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c index 1696f759..e6162af2 100644 --- a/drivers/gpu/nvgpu/gk20a/regops_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c @@ -1,7 +1,7 @@ /* * Tegra GK20A GPU Debugger Driver Register Ops * - * Copyright (c) 2013-2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2013-2016, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -644,22 +644,31 @@ static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s, valid = check_whitelists(dbg_s, op, offset + 4); if (valid && (op->type != REGOP(TYPE_GLOBAL))) { - err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g, + err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g, + op->offset, + 1, + &buf_offset_lo, + &buf_offset_addr, + &num_offsets, + op->type == REGOP(TYPE_GR_CTX_QUAD), + op->quad); + if (err) { + err = gr_gk20a_get_pm_ctx_buffer_offsets(dbg_s->g, op->offset, 1, &buf_offset_lo, &buf_offset_addr, - &num_offsets, - op->type == REGOP(TYPE_GR_CTX_QUAD), - op->quad); + &num_offsets); + if (err) { op->status |= REGOP(STATUS_INVALID_OFFSET); return -EINVAL; } - if (!buf_offset_lo) { - op->status |= REGOP(STATUS_INVALID_OFFSET); - return -EINVAL; - } + } + if (!buf_offset_lo) { + op->status |= REGOP(STATUS_INVALID_OFFSET); + return -EINVAL; + } } if (!valid) { diff --git a/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h index f9531ae1..b837918c 100644 --- a/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -70,6 +70,10 @@ static inline u32 proj_lts_stride_v(void) { return 0x00000200; } +static inline u32 proj_fbpa_stride_v(void) +{ + return 0x00001000; +} static inline u32 proj_ppc_in_gpc_base_v(void) { return 0x00003000; @@ -114,6 +118,10 @@ static inline u32 proj_scal_litter_num_fbps_v(void) { return 0x00000001; } +static inline u32 proj_scal_litter_num_fbpas_v(void) +{ + return 0x00000001; +} static inline u32 proj_scal_litter_num_gpcs_v(void) { return 0x00000001; -- cgit v1.2.2
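For reference, a minimal user-space sketch of driving the new HWPM context-switch mode added by this patch. The ioctl name NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE, struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args and the NVGPU_DBG_GPU_HWPM_CTXSW_MODE_CTXSW value all appear in the kernel side above; the uapi header location (<linux/nvgpu.h>), the /dev/nvhost-dbg-gpu node name and the prior channel bind are assumptions, since the uapi half of the change sits outside drivers/ and is not shown in this diff.

/*
 * Hypothetical user-space sketch: enable HWPM context switching on an
 * nvgpu debug session.  Header path and device node are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvgpu.h>	/* assumed location of the nvgpu uapi header */

static int enable_hwpm_ctxsw(int dbg_fd)
{
	struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args args;

	memset(&args, 0, sizeof(args));
	args.mode = NVGPU_DBG_GPU_HWPM_CTXSW_MODE_CTXSW;

	/* Kernel side: gr_gk20a_update_hwpm_ctxsw_mode() preempts the channel,
	 * allocates and maps the PM context buffer on first use, and flips
	 * ctxsw_prog_main_image_pm_mode in the gr context header. */
	if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE, &args) < 0) {
		perror("NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = open("/dev/nvhost-dbg-gpu", O_RDWR);	/* assumed node name */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* A channel must already be bound to this debug session
	 * (NVGPU_DBG_GPU_IOCTL_BIND_CHANNEL); otherwise the kernel returns
	 * -EINVAL: "no bound channel for pm ctxsw mode update". */

	if (enable_hwpm_ctxsw(fd)) {
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}

Disabling the mode later does not free the PM backing buffer; it stays mapped until the rest of the channel context is torn down in gr_gk20a_free_channel_pm_ctx().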
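The lookup added by gr_gk20a_find_priv_offset_in_pm_buffer() reduces to a table of (priv address, PM buffer offset) pairs that is built once from the netlist lists in the layout documented above, sorted by address, and then binary-searched per register operation. A self-contained sketch of that pattern follows; the addresses and offsets below are made-up examples, not real netlist data.

/*
 * Self-contained illustration of the address -> ctxsw-buffer-offset map
 * used by the HWPM lookup path.  Entries are illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct map_entry {
	uint32_t addr;		/* priv register address */
	uint32_t offset;	/* byte offset in the PM ctxsw buffer */
};

/* Same ordering rule as map_cmp() in the patch: compare by address. */
static int map_cmp(const void *a, const void *b)
{
	const struct map_entry *e1 = a;
	const struct map_entry *e2 = b;

	if (e1->addr < e2->addr)
		return -1;
	if (e1->addr > e2->addr)
		return 1;
	return 0;
}

int main(void)
{
	/* Built once per GPU in the order the lists appear in the buffer:
	 * each register takes 4 bytes, per-chiplet data is interleaved, and
	 * the running offset is re-aligned to 256 bytes between sections. */
	struct map_entry map[] = {
		{ 0x00504518, 0x1400 },	/* example GPC0 TPC PM register */
		{ 0x001b8010, 0x0300 },	/* example FBP perf register */
		{ 0x00100200, 0x0000 },	/* example SYS PM register */
	};
	size_t count = sizeof(map) / sizeof(map[0]);
	struct map_entry key = { .addr = 0x001b8010 }, *hit;

	qsort(map, count, sizeof(map[0]), map_cmp);

	hit = bsearch(&key, map, count, sizeof(map[0]), map_cmp);
	if (hit)
		printf("addr 0x%08x -> buffer offset 0x%x\n",
		       (unsigned)hit->addr, (unsigned)hit->offset);
	else
		printf("addr 0x%08x not context-switched\n",
		       (unsigned)key.addr);
	return 0;
}

Sorting once and using bsearch() keeps each regop lookup logarithmic even though the real map can hold up to pm_ctxsw_image_size / 4 entries, which is how the patch sizes hwpm_ctxsw_reg_count_max.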