diff options
author | Peter Daifuku <pdaifuku@nvidia.com> | 2016-03-09 22:10:20 -0500 |
---|---|---|
committer | Terje Bergstrom <tbergstrom@nvidia.com> | 2016-04-07 14:05:49 -0400 |
commit | 37155b65f1dd6039bdef92f513d86640956bc12c (patch) | |
tree | 1deb57523c3acc445996c642da6ac96e1cf7c355 | |
parent | 6675c03603669c667c6ffec34567eaf101a2d09d (diff) |
gpu: nvgpu: support for hwpm context switching
Add support for hwpm context switching
Bug 1648200
Change-Id: I482899bf165cd2ef24bb8617be16df01218e462f
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: http://git-master/r/1120450
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 1 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c | 47 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c | 65 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h | 14 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 692 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 23 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h | 10 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 7 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/regops_gk20a.c | 27 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h | 10 | ||||
-rw-r--r-- | include/uapi/linux/nvgpu.h | 14 |
12 files changed, 849 insertions, 65 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 29c39160..d8951b94 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |||
@@ -54,6 +54,7 @@ struct channel_ctx_gk20a { | |||
54 | struct gr_ctx_desc *gr_ctx; | 54 | struct gr_ctx_desc *gr_ctx; |
55 | struct patch_desc patch_ctx; | 55 | struct patch_desc patch_ctx; |
56 | struct zcull_ctx_desc zcull_ctx; | 56 | struct zcull_ctx_desc zcull_ctx; |
57 | struct pm_ctx_desc pm_ctx; | ||
57 | u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA]; | 58 | u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA]; |
58 | u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA]; | 59 | u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA]; |
59 | bool global_ctx_buffer_mapped; | 60 | bool global_ctx_buffer_mapped; |
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c index 1ee0189b..d087d89e 100644 --- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c | |||
@@ -457,6 +457,9 @@ static int nvgpu_ioctl_powergate_gk20a(struct dbg_session_gk20a *dbg_s, | |||
457 | static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, | 457 | static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, |
458 | struct nvgpu_dbg_gpu_smpc_ctxsw_mode_args *args); | 458 | struct nvgpu_dbg_gpu_smpc_ctxsw_mode_args *args); |
459 | 459 | ||
460 | static int nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(struct dbg_session_gk20a *dbg_s, | ||
461 | struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args *args); | ||
462 | |||
460 | static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm( | 463 | static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm( |
461 | struct dbg_session_gk20a *dbg_s, | 464 | struct dbg_session_gk20a *dbg_s, |
462 | struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args); | 465 | struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args); |
@@ -582,6 +585,11 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, | |||
582 | (struct nvgpu_dbg_gpu_smpc_ctxsw_mode_args *)buf); | 585 | (struct nvgpu_dbg_gpu_smpc_ctxsw_mode_args *)buf); |
583 | break; | 586 | break; |
584 | 587 | ||
588 | case NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE: | ||
589 | err = nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(dbg_s, | ||
590 | (struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args *)buf); | ||
591 | break; | ||
592 | |||
585 | case NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS: | 593 | case NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS: |
586 | err = nvgpu_dbg_gpu_ioctl_suspend_resume_sm(dbg_s, | 594 | err = nvgpu_dbg_gpu_ioctl_suspend_resume_sm(dbg_s, |
587 | (struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf); | 595 | (struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf); |
@@ -880,7 +888,7 @@ static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, | |||
880 | ch_gk20a = dbg_s->ch; | 888 | ch_gk20a = dbg_s->ch; |
881 | 889 | ||
882 | if (!ch_gk20a) { | 890 | if (!ch_gk20a) { |
883 | gk20a_err(dev_from_gk20a(dbg_s->g), | 891 | gk20a_err(dev_from_gk20a(g), |
884 | "no bound channel for smpc ctxsw mode update\n"); | 892 | "no bound channel for smpc ctxsw mode update\n"); |
885 | err = -EINVAL; | 893 | err = -EINVAL; |
886 | goto clean_up; | 894 | goto clean_up; |
@@ -889,13 +897,48 @@ static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, | |||
889 | err = gr_gk20a_update_smpc_ctxsw_mode(g, ch_gk20a, | 897 | err = gr_gk20a_update_smpc_ctxsw_mode(g, ch_gk20a, |
890 | args->mode == NVGPU_DBG_GPU_SMPC_CTXSW_MODE_CTXSW); | 898 | args->mode == NVGPU_DBG_GPU_SMPC_CTXSW_MODE_CTXSW); |
891 | if (err) { | 899 | if (err) { |
892 | gk20a_err(dev_from_gk20a(dbg_s->g), | 900 | gk20a_err(dev_from_gk20a(g), |
893 | "error (%d) during smpc ctxsw mode update\n", err); | 901 | "error (%d) during smpc ctxsw mode update\n", err); |
894 | goto clean_up; | 902 | goto clean_up; |
895 | } | 903 | } |
896 | 904 | ||
897 | err = g->ops.regops.apply_smpc_war(dbg_s); | 905 | err = g->ops.regops.apply_smpc_war(dbg_s); |
906 | clean_up: | ||
907 | mutex_unlock(&g->dbg_sessions_lock); | ||
908 | return err; | ||
909 | } | ||
910 | |||
911 | static int nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(struct dbg_session_gk20a *dbg_s, | ||
912 | struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args *args) | ||
913 | { | ||
914 | int err; | ||
915 | struct gk20a *g = get_gk20a(dbg_s->pdev); | ||
916 | struct channel_gk20a *ch_gk20a; | ||
898 | 917 | ||
918 | gk20a_dbg_fn("%s pm ctxsw mode = %d", | ||
919 | dev_name(dbg_s->dev), args->mode); | ||
920 | |||
921 | /* Take the global lock, since we'll be doing global regops */ | ||
922 | mutex_lock(&g->dbg_sessions_lock); | ||
923 | |||
924 | ch_gk20a = dbg_s->ch; | ||
925 | |||
926 | if (!ch_gk20a) { | ||
927 | gk20a_err(dev_from_gk20a(g), | ||
928 | "no bound channel for pm ctxsw mode update\n"); | ||
929 | err = -EINVAL; | ||
930 | goto clean_up; | ||
931 | } | ||
932 | |||
933 | err = gr_gk20a_update_hwpm_ctxsw_mode(g, ch_gk20a, | ||
934 | args->mode == NVGPU_DBG_GPU_HWPM_CTXSW_MODE_CTXSW); | ||
935 | if (err) | ||
936 | gk20a_err(dev_from_gk20a(g), | ||
937 | "error (%d) during pm ctxsw mode update\n", err); | ||
938 | |||
939 | /* gk20a would require a WAR to set the core PM_ENABLE bit, not | ||
940 | * added here with gk20a being deprecated | ||
941 | */ | ||
899 | clean_up: | 942 | clean_up: |
900 | mutex_unlock(&g->dbg_sessions_lock); | 943 | mutex_unlock(&g->dbg_sessions_lock); |
901 | return err; | 944 | return err; |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c index 94dba7b6..64d6542b 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * GK20A Graphics Context | 4 | * GK20A Graphics Context |
5 | * | 5 | * |
6 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. | 6 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. |
7 | * | 7 | * |
8 | * This program is free software; you can redistribute it and/or modify it | 8 | * This program is free software; you can redistribute it and/or modify it |
9 | * under the terms and conditions of the GNU General Public License, | 9 | * under the terms and conditions of the GNU General Public License, |
@@ -281,7 +281,60 @@ static int gr_gk20a_init_ctx_vars_fw(struct gk20a *g, struct gr_gk20a *gr) | |||
281 | netlist_num); | 281 | netlist_num); |
282 | break; | 282 | break; |
283 | case NETLIST_REGIONID_CTXREG_PMPPC: | 283 | case NETLIST_REGIONID_CTXREG_PMPPC: |
284 | gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMPPC skipped"); | 284 | gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMPPC"); |
285 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
286 | src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ppc); | ||
287 | if (err) | ||
288 | goto clean_up; | ||
289 | break; | ||
290 | case NETLIST_REGIONID_NVPERF_CTXREG_SYS: | ||
291 | gk20a_dbg_info("NETLIST_REGIONID_NVPERF_CTXREG_SYS"); | ||
292 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
293 | src, size, &g->gr.ctx_vars.ctxsw_regs.perf_sys); | ||
294 | if (err) | ||
295 | goto clean_up; | ||
296 | break; | ||
297 | case NETLIST_REGIONID_NVPERF_FBP_CTXREGS: | ||
298 | gk20a_dbg_info("NETLIST_REGIONID_NVPERF_FBP_CTXREGS"); | ||
299 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
300 | src, size, &g->gr.ctx_vars.ctxsw_regs.fbp); | ||
301 | if (err) | ||
302 | goto clean_up; | ||
303 | break; | ||
304 | case NETLIST_REGIONID_NVPERF_CTXREG_GPC: | ||
305 | gk20a_dbg_info("NETLIST_REGIONID_NVPERF_CTXREG_GPC"); | ||
306 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
307 | src, size, &g->gr.ctx_vars.ctxsw_regs.perf_gpc); | ||
308 | if (err) | ||
309 | goto clean_up; | ||
310 | break; | ||
311 | case NETLIST_REGIONID_NVPERF_FBP_ROUTER: | ||
312 | gk20a_dbg_info("NETLIST_REGIONID_NVPERF_FBP_ROUTER"); | ||
313 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
314 | src, size, &g->gr.ctx_vars.ctxsw_regs.fbp_router); | ||
315 | if (err) | ||
316 | goto clean_up; | ||
317 | break; | ||
318 | case NETLIST_REGIONID_NVPERF_GPC_ROUTER: | ||
319 | gk20a_dbg_info("NETLIST_REGIONID_NVPERF_GPC_ROUTER"); | ||
320 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
321 | src, size, &g->gr.ctx_vars.ctxsw_regs.gpc_router); | ||
322 | if (err) | ||
323 | goto clean_up; | ||
324 | break; | ||
325 | case NETLIST_REGIONID_CTXREG_PMLTC: | ||
326 | gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMLTC"); | ||
327 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
328 | src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ltc); | ||
329 | if (err) | ||
330 | goto clean_up; | ||
331 | break; | ||
332 | case NETLIST_REGIONID_CTXREG_PMFBPA: | ||
333 | gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMFBPA"); | ||
334 | err = gr_gk20a_alloc_load_netlist_aiv( | ||
335 | src, size, &g->gr.ctx_vars.ctxsw_regs.pm_fbpa); | ||
336 | if (err) | ||
337 | goto clean_up; | ||
285 | break; | 338 | break; |
286 | default: | 339 | default: |
287 | gk20a_dbg_info("unrecognized region %d skipped", i); | 340 | gk20a_dbg_info("unrecognized region %d skipped", i); |
@@ -319,6 +372,14 @@ clean_up: | |||
319 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_sys.l); | 372 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_sys.l); |
320 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_gpc.l); | 373 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_gpc.l); |
321 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_tpc.l); | 374 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_tpc.l); |
375 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_ppc.l); | ||
376 | kfree(g->gr.ctx_vars.ctxsw_regs.perf_sys.l); | ||
377 | kfree(g->gr.ctx_vars.ctxsw_regs.fbp.l); | ||
378 | kfree(g->gr.ctx_vars.ctxsw_regs.perf_gpc.l); | ||
379 | kfree(g->gr.ctx_vars.ctxsw_regs.fbp_router.l); | ||
380 | kfree(g->gr.ctx_vars.ctxsw_regs.gpc_router.l); | ||
381 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_ltc.l); | ||
382 | kfree(g->gr.ctx_vars.ctxsw_regs.pm_fbpa.l); | ||
322 | release_firmware(netlist_fw); | 383 | release_firmware(netlist_fw); |
323 | err = -ENOENT; | 384 | err = -ENOENT; |
324 | } | 385 | } |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h index 6844ee69..d413942a 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GK20A Graphics Context | 2 | * GK20A Graphics Context |
3 | * | 3 | * |
4 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -75,6 +75,13 @@ union __max_name { | |||
75 | #define NETLIST_REGIONID_NETLIST_NUM 18 | 75 | #define NETLIST_REGIONID_NETLIST_NUM 18 |
76 | #define NETLIST_REGIONID_CTXREG_PPC 19 | 76 | #define NETLIST_REGIONID_CTXREG_PPC 19 |
77 | #define NETLIST_REGIONID_CTXREG_PMPPC 20 | 77 | #define NETLIST_REGIONID_CTXREG_PMPPC 20 |
78 | #define NETLIST_REGIONID_NVPERF_CTXREG_SYS 21 | ||
79 | #define NETLIST_REGIONID_NVPERF_FBP_CTXREGS 22 | ||
80 | #define NETLIST_REGIONID_NVPERF_CTXREG_GPC 23 | ||
81 | #define NETLIST_REGIONID_NVPERF_FBP_ROUTER 24 | ||
82 | #define NETLIST_REGIONID_NVPERF_GPC_ROUTER 25 | ||
83 | #define NETLIST_REGIONID_CTXREG_PMLTC 26 | ||
84 | #define NETLIST_REGIONID_CTXREG_PMFBPA 27 | ||
78 | 85 | ||
79 | struct netlist_region { | 86 | struct netlist_region { |
80 | u32 region_id; | 87 | u32 region_id; |
@@ -114,6 +121,11 @@ struct u32_list_gk20a { | |||
114 | u32 count; | 121 | u32 count; |
115 | }; | 122 | }; |
116 | 123 | ||
124 | struct ctxsw_buf_offset_map_entry { | ||
125 | u32 addr; /* Register address */ | ||
126 | u32 offset; /* Offset in ctxt switch buffer */ | ||
127 | }; | ||
128 | |||
117 | static inline | 129 | static inline |
118 | struct av_gk20a *alloc_av_list_gk20a(struct av_list_gk20a *avl) | 130 | struct av_gk20a *alloc_av_list_gk20a(struct av_list_gk20a *avl) |
119 | { | 131 | { |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 116fd88f..a8addc7b 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c | |||
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/dma-mapping.h> | 27 | #include <linux/dma-mapping.h> |
28 | #include <linux/firmware.h> | 28 | #include <linux/firmware.h> |
29 | #include <linux/nvhost.h> | 29 | #include <linux/nvhost.h> |
30 | #include <linux/sort.h> | ||
31 | #include <linux/bsearch.h> | ||
30 | #include <trace/events/gk20a.h> | 32 | #include <trace/events/gk20a.h> |
31 | 33 | ||
32 | #include "gk20a.h" | 34 | #include "gk20a.h" |
@@ -59,6 +61,10 @@ | |||
59 | #include "ctxsw_trace_gk20a.h" | 61 | #include "ctxsw_trace_gk20a.h" |
60 | 62 | ||
61 | #define BLK_SIZE (256) | 63 | #define BLK_SIZE (256) |
64 | #define NV_PMM_FBP_STRIDE 0x1000 | ||
65 | #define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200 | ||
66 | #define NV_PERF_PMMGPC_CHIPLET_OFFSET 0x1000 | ||
67 | #define NV_PERF_PMMGPCROUTER_STRIDE 0x0200 | ||
62 | 68 | ||
63 | static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g); | 69 | static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g); |
64 | static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va); | 70 | static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va); |
@@ -1591,9 +1597,17 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, | |||
1591 | u32 data; | 1597 | u32 data; |
1592 | int ret; | 1598 | int ret; |
1593 | 1599 | ||
1600 | gk20a_dbg_fn(""); | ||
1601 | |||
1602 | if (!ch_ctx->gr_ctx) { | ||
1603 | gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); | ||
1604 | return -EFAULT; | ||
1605 | } | ||
1606 | |||
1594 | c->g->ops.fifo.disable_channel(c); | 1607 | c->g->ops.fifo.disable_channel(c); |
1595 | ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); | 1608 | ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); |
1596 | if (ret) { | 1609 | if (ret) { |
1610 | c->g->ops.fifo.enable_channel(c); | ||
1597 | gk20a_err(dev_from_gk20a(g), | 1611 | gk20a_err(dev_from_gk20a(g), |
1598 | "failed to preempt channel\n"); | 1612 | "failed to preempt channel\n"); |
1599 | return ret; | 1613 | return ret; |
@@ -1603,11 +1617,18 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, | |||
1603 | Flush and invalidate before cpu update. */ | 1617 | Flush and invalidate before cpu update. */ |
1604 | g->ops.mm.l2_flush(g, true); | 1618 | g->ops.mm.l2_flush(g, true); |
1605 | 1619 | ||
1620 | if (!ch_ctx->gr_ctx) { | ||
1621 | gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); | ||
1622 | return -EFAULT; | ||
1623 | } | ||
1624 | |||
1606 | ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, | 1625 | ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, |
1607 | PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, | 1626 | PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, |
1608 | 0, pgprot_writecombine(PAGE_KERNEL)); | 1627 | 0, pgprot_writecombine(PAGE_KERNEL)); |
1609 | if (!ctx_ptr) | 1628 | if (!ctx_ptr) { |
1629 | c->g->ops.fifo.enable_channel(c); | ||
1610 | return -ENOMEM; | 1630 | return -ENOMEM; |
1631 | } | ||
1611 | 1632 | ||
1612 | data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); | 1633 | data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); |
1613 | data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); | 1634 | data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); |
@@ -1620,11 +1641,135 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, | |||
1620 | vunmap(ctx_ptr); | 1641 | vunmap(ctx_ptr); |
1621 | 1642 | ||
1622 | /* enable channel */ | 1643 | /* enable channel */ |
1623 | gk20a_writel(c->g, ccsr_channel_r(c->hw_chid), | 1644 | c->g->ops.fifo.enable_channel(c); |
1624 | gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) | | 1645 | |
1625 | ccsr_channel_enable_set_true_f()); | 1646 | return 0; |
1647 | } | ||
1648 | |||
1649 | int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, | ||
1650 | struct channel_gk20a *c, | ||
1651 | bool enable_hwpm_ctxsw) | ||
1652 | { | ||
1653 | struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; | ||
1654 | struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; | ||
1655 | void *ctx_ptr = NULL; | ||
1656 | void *pm_ctx_ptr; | ||
1657 | u32 data, virt_addr; | ||
1658 | int ret; | ||
1659 | |||
1660 | gk20a_dbg_fn(""); | ||
1661 | |||
1662 | if (!ch_ctx->gr_ctx) { | ||
1663 | gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); | ||
1664 | return -EFAULT; | ||
1665 | } | ||
1666 | |||
1667 | if (enable_hwpm_ctxsw) { | ||
1668 | if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) | ||
1669 | return 0; | ||
1670 | } else { | ||
1671 | if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) | ||
1672 | return 0; | ||
1673 | } | ||
1674 | |||
1675 | c->g->ops.fifo.disable_channel(c); | ||
1676 | ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); | ||
1677 | if (ret) { | ||
1678 | c->g->ops.fifo.enable_channel(c); | ||
1679 | gk20a_err(dev_from_gk20a(g), | ||
1680 | "failed to preempt channel\n"); | ||
1681 | return ret; | ||
1682 | } | ||
1683 | |||
1684 | /* Channel gr_ctx buffer is gpu cacheable. | ||
1685 | Flush and invalidate before cpu update. */ | ||
1686 | g->ops.mm.l2_flush(g, true); | ||
1687 | |||
1688 | if (enable_hwpm_ctxsw) { | ||
1689 | /* Allocate buffer if necessary */ | ||
1690 | if (pm_ctx->mem.gpu_va == 0) { | ||
1691 | ret = gk20a_gmmu_alloc_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, | ||
1692 | g->gr.ctx_vars.pm_ctxsw_image_size, | ||
1693 | &pm_ctx->mem); | ||
1694 | if (ret) { | ||
1695 | c->g->ops.fifo.enable_channel(c); | ||
1696 | gk20a_err(dev_from_gk20a(g), | ||
1697 | "failed to allocate pm ctxt buffer"); | ||
1698 | return ret; | ||
1699 | } | ||
1700 | |||
1701 | pm_ctx->mem.gpu_va = gk20a_gmmu_map(c->vm, | ||
1702 | &pm_ctx->mem.sgt, | ||
1703 | pm_ctx->mem.size, | ||
1704 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
1705 | gk20a_mem_flag_none, true); | ||
1706 | if (!pm_ctx->mem.gpu_va) { | ||
1707 | gk20a_err(dev_from_gk20a(g), | ||
1708 | "failed to map pm ctxt buffer"); | ||
1709 | gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, | ||
1710 | &pm_ctx->mem); | ||
1711 | c->g->ops.fifo.enable_channel(c); | ||
1712 | return -ENOMEM; | ||
1713 | } | ||
1714 | } | ||
1715 | |||
1716 | /* Now clear the buffer */ | ||
1717 | pm_ctx_ptr = vmap(pm_ctx->mem.pages, | ||
1718 | PAGE_ALIGN(pm_ctx->mem.size) >> PAGE_SHIFT, | ||
1719 | 0, pgprot_writecombine(PAGE_KERNEL)); | ||
1720 | |||
1721 | if (!pm_ctx_ptr) { | ||
1722 | ret = -ENOMEM; | ||
1723 | goto cleanup_pm_buf; | ||
1724 | } | ||
1725 | |||
1726 | memset(pm_ctx_ptr, 0, pm_ctx->mem.size); | ||
1727 | |||
1728 | vunmap(pm_ctx_ptr); | ||
1729 | } | ||
1730 | |||
1731 | ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, | ||
1732 | PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, | ||
1733 | 0, pgprot_writecombine(PAGE_KERNEL)); | ||
1734 | if (!ctx_ptr) { | ||
1735 | ret = -ENOMEM; | ||
1736 | goto cleanup_pm_buf; | ||
1737 | } | ||
1738 | |||
1739 | data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); | ||
1740 | data = data & ~ctxsw_prog_main_image_pm_mode_m(); | ||
1741 | |||
1742 | if (enable_hwpm_ctxsw) { | ||
1743 | pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f(); | ||
1744 | |||
1745 | /* pack upper 32 bits of virtual address into a 32 bit number | ||
1746 | * (256 byte boundary) | ||
1747 | */ | ||
1748 | virt_addr = (u32)(pm_ctx->mem.gpu_va >> 8); | ||
1749 | } else { | ||
1750 | pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); | ||
1751 | virt_addr = 0; | ||
1752 | } | ||
1753 | |||
1754 | data |= pm_ctx->pm_mode; | ||
1755 | |||
1756 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); | ||
1757 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); | ||
1758 | |||
1759 | vunmap(ctx_ptr); | ||
1760 | |||
1761 | /* enable channel */ | ||
1762 | c->g->ops.fifo.enable_channel(c); | ||
1626 | 1763 | ||
1627 | return 0; | 1764 | return 0; |
1765 | cleanup_pm_buf: | ||
1766 | gk20a_gmmu_unmap(c->vm, pm_ctx->mem.gpu_va, pm_ctx->mem.size, | ||
1767 | gk20a_mem_flag_none); | ||
1768 | gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &pm_ctx->mem); | ||
1769 | memset(&pm_ctx->mem, 0, sizeof(struct mem_desc)); | ||
1770 | |||
1771 | c->g->ops.fifo.enable_channel(c); | ||
1772 | return ret; | ||
1628 | } | 1773 | } |
1629 | 1774 | ||
1630 | /* load saved fresh copy of golden image into channel gr_ctx */ | 1775 |
@@ -1635,6 +1780,7 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, | |||
1635 | struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; | 1780 | struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; |
1636 | u32 virt_addr_lo; | 1781 | u32 virt_addr_lo; |
1637 | u32 virt_addr_hi; | 1782 | u32 virt_addr_hi; |
1783 | u32 virt_addr = 0; | ||
1638 | u32 i, v, data; | 1784 | u32 i, v, data; |
1639 | int ret = 0; | 1785 | int ret = 0; |
1640 | void *ctx_ptr = NULL; | 1786 | void *ctx_ptr = NULL; |
@@ -1663,15 +1809,6 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, | |||
1663 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); | 1809 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); |
1664 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); | 1810 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); |
1665 | 1811 | ||
1666 | /* no user for client managed performance counter ctx */ | ||
1667 | data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); | ||
1668 | data = data & ~ctxsw_prog_main_image_pm_mode_m(); | ||
1669 | data |= ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); | ||
1670 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, | ||
1671 | data); | ||
1672 | |||
1673 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0); | ||
1674 | |||
1675 | /* set priv access map */ | 1812 | /* set priv access map */ |
1676 | virt_addr_lo = | 1813 | virt_addr_lo = |
1677 | u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); | 1814 | u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); |
@@ -1708,6 +1845,32 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, | |||
1708 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, | 1845 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, |
1709 | virt_addr_hi); | 1846 | virt_addr_hi); |
1710 | 1847 | ||
1848 | /* Update main header region of the context buffer with the info needed | ||
1849 | * for PM context switching, including mode and possibly a pointer to | ||
1850 | * the PM backing store. | ||
1851 | */ | ||
1852 | if (ch_ctx->pm_ctx.pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) { | ||
1853 | if (ch_ctx->pm_ctx.mem.gpu_va == 0) { | ||
1854 | gk20a_err(dev_from_gk20a(g), | ||
1855 | "context switched pm with no pm buffer!"); | ||
1856 | vunmap(ctx_ptr); | ||
1857 | return -EFAULT; | ||
1858 | } | ||
1859 | |||
1860 | /* pack upper 32 bits of virtual address into a 32 bit number | ||
1861 | * (256 byte boundary) | ||
1862 | */ | ||
1863 | virt_addr = (u32)(ch_ctx->pm_ctx.mem.gpu_va >> 8); | ||
1864 | } else | ||
1865 | virt_addr = 0; | ||
1866 | |||
1867 | data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); | ||
1868 | data = data & ~ctxsw_prog_main_image_pm_mode_m(); | ||
1869 | data |= ch_ctx->pm_ctx.pm_mode; | ||
1870 | |||
1871 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); | ||
1872 | gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); | ||
1873 | |||
1711 | vunmap(ctx_ptr); | 1874 | vunmap(ctx_ptr); |
1712 | 1875 | ||
1713 | if (tegra_platform_is_linsim()) { | 1876 | if (tegra_platform_is_linsim()) { |
@@ -2205,7 +2368,6 @@ static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g) | |||
2205 | 2368 | ||
2206 | int gr_gk20a_init_ctx_state(struct gk20a *g) | 2369 | int gr_gk20a_init_ctx_state(struct gk20a *g) |
2207 | { | 2370 | { |
2208 | u32 pm_ctx_image_size; | ||
2209 | u32 ret; | 2371 | u32 ret; |
2210 | struct fecs_method_op_gk20a op = { | 2372 | struct fecs_method_op_gk20a op = { |
2211 | .mailbox = { .id = 0, .data = 0, | 2373 | .mailbox = { .id = 0, .data = 0, |
@@ -2237,7 +2399,7 @@ int gr_gk20a_init_ctx_state(struct gk20a *g) | |||
2237 | } | 2399 | } |
2238 | op.method.addr = | 2400 | op.method.addr = |
2239 | gr_fecs_method_push_adr_discover_pm_image_size_v(); | 2401 | gr_fecs_method_push_adr_discover_pm_image_size_v(); |
2240 | op.mailbox.ret = &pm_ctx_image_size; | 2402 | op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size; |
2241 | ret = gr_gk20a_submit_fecs_method_op(g, op, false); | 2403 | ret = gr_gk20a_submit_fecs_method_op(g, op, false); |
2242 | if (ret) { | 2404 | if (ret) { |
2243 | gk20a_err(dev_from_gk20a(g), | 2405 | gk20a_err(dev_from_gk20a(g), |
@@ -2641,14 +2803,30 @@ static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c) | |||
2641 | patch_ctx->data_count = 0; | 2803 | patch_ctx->data_count = 0; |
2642 | } | 2804 | } |
2643 | 2805 | ||
2806 | static void gr_gk20a_free_channel_pm_ctx(struct channel_gk20a *c) | ||
2807 | { | ||
2808 | struct pm_ctx_desc *pm_ctx = &c->ch_ctx.pm_ctx; | ||
2809 | struct gk20a *g = c->g; | ||
2810 | |||
2811 | gk20a_dbg_fn(""); | ||
2812 | |||
2813 | if (pm_ctx->mem.gpu_va) { | ||
2814 | gk20a_gmmu_unmap(c->vm, pm_ctx->mem.gpu_va, | ||
2815 | pm_ctx->mem.size, gk20a_mem_flag_none); | ||
2816 | |||
2817 | gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &pm_ctx->mem); | ||
2818 | } | ||
2819 | } | ||
2820 | |||
2644 | void gk20a_free_channel_ctx(struct channel_gk20a *c) | 2821 | void gk20a_free_channel_ctx(struct channel_gk20a *c) |
2645 | { | 2822 | { |
2646 | gr_gk20a_unmap_global_ctx_buffers(c); | 2823 | gr_gk20a_unmap_global_ctx_buffers(c); |
2647 | gr_gk20a_free_channel_patch_ctx(c); | 2824 | gr_gk20a_free_channel_patch_ctx(c); |
2825 | gr_gk20a_free_channel_pm_ctx(c); | ||
2648 | if (!gk20a_is_channel_marked_as_tsg(c)) | 2826 | if (!gk20a_is_channel_marked_as_tsg(c)) |
2649 | gr_gk20a_free_channel_gr_ctx(c); | 2827 | gr_gk20a_free_channel_gr_ctx(c); |
2650 | 2828 | ||
2651 | /* zcull_ctx, pm_ctx */ | 2829 | /* zcull_ctx */ |
2652 | 2830 | ||
2653 | memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a)); | 2831 | memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a)); |
2654 | 2832 | ||
@@ -2743,6 +2921,9 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, | |||
2743 | ch_ctx->gr_ctx = tsg->tsg_gr_ctx; | 2921 | ch_ctx->gr_ctx = tsg->tsg_gr_ctx; |
2744 | } | 2922 | } |
2745 | 2923 | ||
2924 | /* PM ctxt switch is off by default */ | ||
2925 | ch_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); | ||
2926 | |||
2746 | /* commit gr ctx buffer */ | 2927 | /* commit gr ctx buffer */ |
2747 | err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); | 2928 | err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); |
2748 | if (err) { | 2929 | if (err) { |
@@ -2983,6 +3164,10 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr) | |||
2983 | kfree(gr->ctx_vars.local_golden_image); | 3164 | kfree(gr->ctx_vars.local_golden_image); |
2984 | gr->ctx_vars.local_golden_image = NULL; | 3165 | gr->ctx_vars.local_golden_image = NULL; |
2985 | 3166 | ||
3167 | if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) | ||
3168 | nvgpu_free(gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); | ||
3169 | gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL; | ||
3170 | |||
2986 | gk20a_comptag_allocator_destroy(&gr->comp_tags); | 3171 | gk20a_comptag_allocator_destroy(&gr->comp_tags); |
2987 | } | 3172 | } |
2988 | 3173 | ||
@@ -5828,6 +6013,10 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, | |||
5828 | u32 context_buffer_size, | 6013 | u32 context_buffer_size, |
5829 | u32 *priv_offset); | 6014 | u32 *priv_offset); |
5830 | 6015 | ||
6016 | static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g, | ||
6017 | u32 addr, | ||
6018 | u32 *priv_offset); | ||
6019 | |||
5831 | /* This function will decode a priv address and return the partition type and numbers. */ | 6020 | /* This function will decode a priv address and return the partition type and numbers. */ |
5832 | static int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr, | 6021 | static int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr, |
5833 | int *addr_type, /* enum ctxsw_addr_type */ | 6022 | int *addr_type, /* enum ctxsw_addr_type */ |
@@ -6056,14 +6245,81 @@ int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, | |||
6056 | offset_addrs[i] = priv_registers[i]; | 6245 | offset_addrs[i] = priv_registers[i]; |
6057 | } | 6246 | } |
6058 | 6247 | ||
6059 | *num_offsets = num_registers; | 6248 | *num_offsets = num_registers; |
6249 | cleanup: | ||
6250 | if (!IS_ERR_OR_NULL(priv_registers)) | ||
6251 | kfree(priv_registers); | ||
6060 | 6252 | ||
6061 | cleanup: | 6253 | return err; |
6254 | } | ||
6255 | |||
6256 | int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g, | ||
6257 | u32 addr, | ||
6258 | u32 max_offsets, | ||
6259 | u32 *offsets, u32 *offset_addrs, | ||
6260 | u32 *num_offsets) | ||
6261 | { | ||
6262 | u32 i; | ||
6263 | u32 priv_offset = 0; | ||
6264 | u32 *priv_registers; | ||
6265 | u32 num_registers = 0; | ||
6266 | int err = 0; | ||
6267 | struct gr_gk20a *gr = &g->gr; | ||
6268 | u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count; | ||
6269 | |||
6270 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); | ||
6271 | |||
6272 | /* implementation is crossed-up if either of these happen */ | ||
6273 | if (max_offsets > potential_offsets) | ||
6274 | return -EINVAL; | ||
6275 | |||
6276 | if (!g->gr.ctx_vars.golden_image_initialized) | ||
6277 | return -ENODEV; | ||
6278 | |||
6279 | priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL); | ||
6280 | if (ZERO_OR_NULL_PTR(priv_registers)) { | ||
6281 | gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets); | ||
6282 | return -ENOMEM; | ||
6283 | } | ||
6284 | memset(offsets, 0, sizeof(u32) * max_offsets); | ||
6285 | memset(offset_addrs, 0, sizeof(u32) * max_offsets); | ||
6286 | *num_offsets = 0; | ||
6287 | |||
6288 | gr_gk20a_create_priv_addr_table(g, addr, priv_registers, &num_registers); | ||
6289 | |||
6290 | if ((max_offsets > 1) && (num_registers > max_offsets)) { | ||
6291 | err = -EINVAL; | ||
6292 | goto cleanup; | ||
6293 | } | ||
6062 | 6294 | ||
6063 | if (!IS_ERR_OR_NULL(priv_registers)) | 6295 | if ((max_offsets == 1) && (num_registers > 1)) |
6064 | kfree(priv_registers); | 6296 | num_registers = 1; |
6297 | |||
6298 | if (!g->gr.ctx_vars.local_golden_image) { | ||
6299 | gk20a_dbg_fn("no context switch header info to work with"); | ||
6300 | err = -EINVAL; | ||
6301 | goto cleanup; | ||
6302 | } | ||
6065 | 6303 | ||
6066 | return err; | 6304 | for (i = 0; i < num_registers; i++) { |
6305 | err = gr_gk20a_find_priv_offset_in_pm_buffer(g, | ||
6306 | priv_registers[i], | ||
6307 | &priv_offset); | ||
6308 | if (err) { | ||
6309 | gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x", | ||
6310 | addr); /*, grPriRegStr(addr)));*/ | ||
6311 | goto cleanup; | ||
6312 | } | ||
6313 | |||
6314 | offsets[i] = priv_offset; | ||
6315 | offset_addrs[i] = priv_registers[i]; | ||
6316 | } | ||
6317 | |||
6318 | *num_offsets = num_registers; | ||
6319 | cleanup: | ||
6320 | kfree(priv_registers); | ||
6321 | |||
6322 | return err; | ||
6067 | } | 6323 | } |
6068 | 6324 | ||
6069 | /* Setup some register tables. This looks hacky; our | 6325 | /* Setup some register tables. This looks hacky; our |
@@ -6638,8 +6894,6 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, | |||
6638 | return 0; | 6894 | return 0; |
6639 | } | 6895 | } |
6640 | 6896 | ||
6641 | |||
6642 | |||
6643 | /* | 6897 | /* |
6644 | * This function will return the 32 bit offset for a priv register if it is | 6898 | * This function will return the 32 bit offset for a priv register if it is |
6645 | * present in the context buffer. | 6899 | * present in the context buffer. |
@@ -6801,6 +7055,314 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, | |||
6801 | return -EINVAL; | 7055 | return -EINVAL; |
6802 | } | 7056 | } |
6803 | 7057 | ||
7058 | static int map_cmp(const void *a, const void *b) | ||
7059 | { | ||
7060 | struct ctxsw_buf_offset_map_entry *e1 = | ||
7061 | (struct ctxsw_buf_offset_map_entry *)a; | ||
7062 | struct ctxsw_buf_offset_map_entry *e2 = | ||
7063 | (struct ctxsw_buf_offset_map_entry *)b; | ||
7064 | |||
7065 | if (e1->addr < e2->addr) | ||
7066 | return -1; | ||
7067 | |||
7068 | if (e1->addr > e2->addr) | ||
7069 | return 1; | ||
7070 | return 0; | ||
7071 | } | ||
7072 | |||
7073 | static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map, | ||
7074 | struct aiv_list_gk20a *regs, | ||
7075 | u32 *count, u32 *offset, | ||
7076 | u32 max_cnt, u32 base, u32 mask) | ||
7077 | { | ||
7078 | u32 idx; | ||
7079 | u32 cnt = *count; | ||
7080 | u32 off = *offset; | ||
7081 | |||
7082 | if ((cnt + regs->count) > max_cnt) | ||
7083 | return -EINVAL; | ||
7084 | |||
7085 | for (idx = 0; idx < regs->count; idx++) { | ||
7086 | map[cnt].addr = base + (regs->l[idx].addr & mask); | ||
7087 | map[cnt++].offset = off; | ||
7088 | off += 4; | ||
7089 | } | ||
7090 | *count = cnt; | ||
7091 | *offset = off; | ||
7092 | return 0; | ||
7093 | } | ||
7094 | |||
7095 | /* Helper function to add register entries to the register map for all | ||
7096 | * subunits | ||
7097 | */ | ||
7098 | static int add_ctxsw_buffer_map_entries_subunits( | ||
7099 | struct ctxsw_buf_offset_map_entry *map, | ||
7100 | struct aiv_list_gk20a *regs, | ||
7101 | u32 *count, u32 *offset, | ||
7102 | u32 max_cnt, u32 base, | ||
7103 | u32 num_units, u32 stride, u32 mask) | ||
7104 | { | ||
7105 | u32 unit; | ||
7106 | u32 idx; | ||
7107 | u32 cnt = *count; | ||
7108 | u32 off = *offset; | ||
7109 | |||
7110 | if ((cnt + (regs->count * num_units)) > max_cnt) | ||
7111 | return -EINVAL; | ||
7112 | |||
7113 | /* Data is interleaved for units in ctxsw buffer */ | ||
7114 | for (idx = 0; idx < regs->count; idx++) { | ||
7115 | for (unit = 0; unit < num_units; unit++) { | ||
7116 | map[cnt].addr = base + (regs->l[idx].addr & mask) + | ||
7117 | (unit * stride); | ||
7118 | map[cnt++].offset = off; | ||
7119 | off += 4; | ||
7120 | } | ||
7121 | } | ||
7122 | *count = cnt; | ||
7123 | *offset = off; | ||
7124 | return 0; | ||
7125 | } | ||
7126 | |||
7127 | static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g, | ||
7128 | struct ctxsw_buf_offset_map_entry *map, | ||
7129 | u32 *count, u32 *offset, u32 max_cnt) | ||
7130 | { | ||
7131 | u32 num_gpcs = g->gr.gpc_count; | ||
7132 | u32 num_ppcs, num_tpcs, gpc_num, base; | ||
7133 | |||
7134 | for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) { | ||
7135 | num_tpcs = g->gr.gpc_tpc_count[gpc_num]; | ||
7136 | base = proj_gpc_base_v() + | ||
7137 | (proj_gpc_stride_v() * gpc_num) + proj_tpc_in_gpc_base_v(); | ||
7138 | if (add_ctxsw_buffer_map_entries_subunits(map, | ||
7139 | &g->gr.ctx_vars.ctxsw_regs.pm_tpc, | ||
7140 | count, offset, max_cnt, base, num_tpcs, | ||
7141 | proj_tpc_in_gpc_stride_v(), | ||
7142 | (proj_tpc_in_gpc_stride_v() - 1))) | ||
7143 | return -EINVAL; | ||
7144 | |||
7145 | num_ppcs = g->gr.gpc_ppc_count[gpc_num]; | ||
7146 | base = proj_gpc_base_v() + (proj_gpc_stride_v() * gpc_num) + | ||
7147 | proj_ppc_in_gpc_base_v(); | ||
7148 | if (add_ctxsw_buffer_map_entries_subunits(map, | ||
7149 | &g->gr.ctx_vars.ctxsw_regs.pm_ppc, | ||
7150 | count, offset, max_cnt, base, num_ppcs, | ||
7151 | proj_ppc_in_gpc_stride_v(), | ||
7152 | (proj_ppc_in_gpc_stride_v() - 1))) | ||
7153 | return -EINVAL; | ||
7154 | |||
7155 | base = proj_gpc_base_v() + (proj_gpc_stride_v() * gpc_num); | ||
7156 | if (add_ctxsw_buffer_map_entries(map, | ||
7157 | &g->gr.ctx_vars.ctxsw_regs.pm_gpc, | ||
7158 | count, offset, max_cnt, base, | ||
7159 | (proj_gpc_stride_v() - 1))) | ||
7160 | return -EINVAL; | ||
7161 | |||
7162 | base = (NV_PERF_PMMGPC_CHIPLET_OFFSET * gpc_num); | ||
7163 | if (add_ctxsw_buffer_map_entries(map, | ||
7164 | &g->gr.ctx_vars.ctxsw_regs.perf_gpc, | ||
7165 | count, offset, max_cnt, base, ~0)) | ||
7166 | return -EINVAL; | ||
7167 | |||
7168 | base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num); | ||
7169 | if (add_ctxsw_buffer_map_entries(map, | ||
7170 | &g->gr.ctx_vars.ctxsw_regs.gpc_router, | ||
7171 | count, offset, max_cnt, base, ~0)) | ||
7172 | return -EINVAL; | ||
7173 | |||
7174 | *offset = ALIGN(*offset, 256); | ||
7175 | } | ||
7176 | return 0; | ||
7177 | } | ||
7178 | |||
7179 | /* | ||
7180 | * PM CTXSW BUFFER LAYOUT : | ||
7181 | *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE | ||
7182 | *| | | ||
7183 | *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words | ||
7184 | *|---------------------------------------------| | ||
7185 | *| | | ||
7186 | *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words | ||
7187 | *|---------------------------------------------| | ||
7188 | *| PADDING for 256 byte alignment | | ||
7189 | *|---------------------------------------------|<----256 byte aligned | ||
7190 | *| LIST_compressed_nv_perf_fbp_ctx_regs | | ||
7191 | *| |Space allocated: numRegs * n words (for n FB units) | ||
7192 | *|---------------------------------------------| | ||
7193 | *| LIST_compressed_nv_perf_fbprouter_ctx_regs | | ||
7194 | *| |Space allocated: numRegs * n words (for n FB units) | ||
7195 | *|---------------------------------------------| | ||
7196 | *| LIST_compressed_pm_fbpa_ctx_regs | | ||
7197 | *| |Space allocated: numRegs * n words (for n FB units) | ||
7198 | *|---------------------------------------------| | ||
7199 | *| LIST_compressed_pm_ltc_ctx_regs | | ||
7200 | *| LTC0 LTS0 | | ||
7201 | *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units) | ||
7202 | *| LTCn LTS0 | | ||
7203 | *| LTC0 LTS1 | | ||
7204 | *| LTC1 LTS1 | | ||
7205 | *| LTCn LTS1 | | ||
7206 | *| LTC0 LTSn | | ||
7207 | *| LTC1 LTSn | | ||
7208 | *| LTCn LTSn | | ||
7209 | *|---------------------------------------------| | ||
7210 | *| PADDING for 256 byte alignment | | ||
7211 | *|---------------------------------------------|<----256 byte aligned | ||
7212 | *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate | ||
7213 | *| REG0 TPC1 | all the GPC/TPC register lists | ||
7214 | *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned | ||
7215 | *| LIST_pm_ctx_reg_TPC REG1 TPC0 | | ||
7216 | *| * numTpcs REG1 TPC1 | | ||
7217 | *| LIST_pm_ctx_reg_PPC REG1 TPCn | | ||
7218 | *| * numPpcs REGn TPC0 | | ||
7219 | *| LIST_pm_ctx_reg_GPC REGn TPC1 | | ||
7220 | *| LIST_nv_perf_ctx_reg_GPC REGn TPCn | | ||
7221 | *| ---- |-- | ||
7222 | *| GPC1 . | | ||
7223 | *| . |<---- | ||
7224 | *|---------------------------------------------| | ||
7225 | *= = | ||
7226 | *| GPCn | | ||
7227 | *= = | ||
7228 | *|---------------------------------------------| | ||
7229 | */ | ||
7230 | |||
7231 | static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) | ||
7232 | { | ||
7233 | u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size; | ||
7234 | u32 hwpm_ctxsw_reg_count_max; | ||
7235 | u32 map_size; | ||
7236 | u32 i, count = 0; | ||
7237 | u32 offset = 0; | ||
7238 | struct ctxsw_buf_offset_map_entry *map; | ||
7239 | |||
7240 | if (hwpm_ctxsw_buffer_size == 0) { | ||
7241 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, | ||
7242 | "no PM Ctxsw buffer memory in context buffer"); | ||
7243 | return -EINVAL; | ||
7244 | } | ||
7245 | |||
7246 | hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2; | ||
7247 | map_size = hwpm_ctxsw_reg_count_max * sizeof(*map); | ||
7248 | |||
7249 | map = nvgpu_alloc(map_size, true); | ||
7250 | if (!map) | ||
7251 | return -ENOMEM; | ||
7252 | |||
7253 | /* Add entries from _LIST_pm_ctx_reg_SYS */ | ||
7254 | if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.pm_sys, | ||
7255 | &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) | ||
7256 | goto cleanup; | ||
7257 | |||
7258 | /* Add entries from _LIST_nv_perf_ctx_reg_SYS */ | ||
7259 | if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys, | ||
7260 | &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) | ||
7261 | goto cleanup; | ||
7262 | |||
7263 | offset = ALIGN(offset, 256); | ||
7264 | |||
7265 | /* Add entries from _LIST_nv_perf_fbp_ctx_regs */ | ||
7266 | if (add_ctxsw_buffer_map_entries_subunits(map, | ||
7267 | &g->gr.ctx_vars.ctxsw_regs.fbp, | ||
7268 | &count, &offset, | ||
7269 | hwpm_ctxsw_reg_count_max, 0, | ||
7270 | g->gr.num_fbps, NV_PMM_FBP_STRIDE, ~0)) | ||
7271 | goto cleanup; | ||
7272 | |||
7273 | /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */ | ||
7274 | if (add_ctxsw_buffer_map_entries_subunits(map, | ||
7275 | &g->gr.ctx_vars.ctxsw_regs.fbp_router, | ||
7276 | &count, &offset, | ||
7277 | hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps, | ||
7278 | NV_PERF_PMM_FBP_ROUTER_STRIDE, ~0)) | ||
7279 | goto cleanup; | ||
7280 | |||
7281 | /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */ | ||
7282 | if (add_ctxsw_buffer_map_entries_subunits(map, | ||
7283 | &g->gr.ctx_vars.ctxsw_regs.pm_fbpa, | ||
7284 | &count, &offset, | ||
7285 | hwpm_ctxsw_reg_count_max, 0, | ||
7286 | proj_scal_litter_num_fbpas_v(), | ||
7287 | proj_fbpa_stride_v(), ~0)) | ||
7288 | goto cleanup; | ||
7289 | |||
7290 | /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */ | ||
7291 | if (add_ctxsw_buffer_map_entries_subunits(map, | ||
7292 | &g->gr.ctx_vars.ctxsw_regs.pm_ltc, | ||
7293 | &count, &offset, | ||
7294 | hwpm_ctxsw_reg_count_max, 0, | ||
7295 | g->ltc_count, proj_ltc_stride_v(), ~0)) | ||
7296 | goto cleanup; | ||
7297 | |||
7298 | offset = ALIGN(offset, 256); | ||
7299 | |||
7300 | /* Add GPC entries */ | ||
7301 | if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset, | ||
7302 | hwpm_ctxsw_reg_count_max)) | ||
7303 | goto cleanup; | ||
7304 | |||
7305 | if (offset > hwpm_ctxsw_buffer_size) { | ||
7306 | gk20a_err(dev_from_gk20a(g), "offset > buffer size"); | ||
7307 | goto cleanup; | ||
7308 | } | ||
7309 | |||
7310 | sort(map, count, sizeof(*map), map_cmp, NULL); | ||
7311 | |||
7312 | g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map; | ||
7313 | g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count; | ||
7314 | |||
7315 | gk20a_dbg_info("Reg Addr => HWPM Ctxt switch buffer offset"); | ||
7316 | |||
7317 | for (i = 0; i < count; i++) | ||
7318 | gk20a_dbg_info("%08x => %08x", map[i].addr, map[i].offset); | ||
7319 | |||
7320 | return 0; | ||
7321 | cleanup: | ||
7322 | gk20a_err(dev_from_gk20a(g), "Failed to create HWPM buffer offset map"); | ||
7323 | nvgpu_free(map); | ||
7324 | return -EINVAL; | ||
7325 | } | ||
7326 | |||
7327 | /* | ||
7328 | * This function will return the 32 bit offset for a priv register if it is | ||
7329 | * present in the PM context buffer. | ||
7330 | */ | ||
7331 | static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g, | ||
7332 | u32 addr, | ||
7333 | u32 *priv_offset) | ||
7334 | { | ||
7335 | struct gr_gk20a *gr = &g->gr; | ||
7336 | int err = 0; | ||
7337 | u32 count; | ||
7338 | struct ctxsw_buf_offset_map_entry *map, *result, map_key; | ||
7339 | |||
7340 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); | ||
7341 | |||
7342 | /* Create map of pri address and pm offset if necessary */ | ||
7343 | if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) { | ||
7344 | err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g); | ||
7345 | if (err) | ||
7346 | return err; | ||
7347 | } | ||
7348 | |||
7349 | *priv_offset = 0; | ||
7350 | |||
7351 | map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map; | ||
7352 | count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count; | ||
7353 | |||
7354 | map_key.addr = addr; | ||
7355 | result = bsearch(&map_key, map, count, sizeof(*map), map_cmp); | ||
7356 | |||
7357 | if (result) | ||
7358 | *priv_offset = result->offset; | ||
7359 | else { | ||
7360 | gk20a_err(dev_from_gk20a(g), "Lookup failed for address 0x%x", addr); | ||
7361 | err = -EINVAL; | ||
7362 | } | ||
7363 | return err; | ||
7364 | } | ||
7365 | |||
6804 | bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) | 7366 | bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) |
6805 | { | 7367 | { |
6806 | int curr_gr_ctx, curr_gr_tsgid; | 7368 | int curr_gr_ctx, curr_gr_tsgid; |
@@ -6840,6 +7402,8 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | |||
6840 | struct gk20a *g = ch->g; | 7402 | struct gk20a *g = ch->g; |
6841 | struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; | 7403 | struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; |
6842 | void *ctx_ptr = NULL; | 7404 | void *ctx_ptr = NULL; |
7405 | void *pm_ctx_ptr = NULL; | ||
7406 | void *base_ptr = NULL; | ||
6843 | bool ch_is_curr_ctx, restart_gr_ctxsw = false; | 7407 | bool ch_is_curr_ctx, restart_gr_ctxsw = false; |
6844 | u32 i, j, offset, v; | 7408 | u32 i, j, offset, v; |
6845 | struct gr_gk20a *gr = &g->gr; | 7409 | struct gr_gk20a *gr = &g->gr; |
@@ -6940,15 +7504,6 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | |||
6940 | } | 7504 | } |
6941 | offset_addrs = offsets + max_offsets; | 7505 | offset_addrs = offsets + max_offsets; |
6942 | 7506 | ||
6943 | /* would have been a variant of gr_gk20a_apply_instmem_overrides */ | ||
6944 | /* recoded in-place instead.*/ | ||
6945 | ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, | ||
6946 | PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, | ||
6947 | 0, pgprot_writecombine(PAGE_KERNEL)); | ||
6948 | if (!ctx_ptr) { | ||
6949 | err = -ENOMEM; | ||
6950 | goto cleanup; | ||
6951 | } | ||
6952 | err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); | 7507 | err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); |
6953 | if (err) | 7508 | if (err) |
6954 | goto cleanup; | 7509 | goto cleanup; |
@@ -6977,13 +7532,52 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | |||
6977 | &num_offsets, | 7532 | &num_offsets, |
6978 | ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), | 7533 | ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), |
6979 | ctx_ops[i].quad); | 7534 | ctx_ops[i].quad); |
6980 | if (err) { | 7535 | if (!err) { |
6981 | gk20a_dbg(gpu_dbg_gpu_dbg, | 7536 | if (!ctx_ptr) { |
7537 | /* would have been a variant of | ||
7538 | * gr_gk20a_apply_instmem_overrides, | ||
7539 | * recoded in-place instead. | ||
7540 | */ | ||
7541 | ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, | ||
7542 | PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, | ||
7543 | 0, pgprot_writecombine(PAGE_KERNEL)); | ||
7544 | if (!ctx_ptr) { | ||
7545 | err = -ENOMEM; | ||
7546 | goto cleanup; | ||
7547 | } | ||
7548 | } | ||
7549 | base_ptr = ctx_ptr; | ||
7550 | } else { | ||
7551 | err = gr_gk20a_get_pm_ctx_buffer_offsets(g, | ||
7552 | ctx_ops[i].offset, | ||
7553 | max_offsets, | ||
7554 | offsets, offset_addrs, | ||
7555 | &num_offsets); | ||
7556 | if (err) { | ||
7557 | gk20a_dbg(gpu_dbg_gpu_dbg, | ||
6982 | "ctx op invalid offset: offset=0x%x", | 7558 | "ctx op invalid offset: offset=0x%x", |
6983 | ctx_ops[i].offset); | 7559 | ctx_ops[i].offset); |
6984 | ctx_ops[i].status = | 7560 | ctx_ops[i].status = |
6985 | NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; | 7561 | NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; |
6986 | continue; | 7562 | continue; |
7563 | } | ||
7564 | if (!pm_ctx_ptr) { | ||
7565 | /* Make sure ctx buffer was initialized */ | ||
7566 | if (!ch_ctx->pm_ctx.mem.pages) { | ||
7567 | gk20a_err(dev_from_gk20a(g), | ||
7568 | "Invalid ctx buffer"); | ||
7569 | err = -EINVAL; | ||
7570 | goto cleanup; | ||
7571 | } | ||
7572 | pm_ctx_ptr = vmap(ch_ctx->pm_ctx.mem.pages, | ||
7573 | PAGE_ALIGN(ch_ctx->pm_ctx.mem.size) >> PAGE_SHIFT, | ||
7574 | 0, pgprot_writecombine(PAGE_KERNEL)); | ||
7575 | if (!pm_ctx_ptr) { | ||
7576 | err = -ENOMEM; | ||
7577 | goto cleanup; | ||
7578 | } | ||
7579 | } | ||
7580 | base_ptr = pm_ctx_ptr; | ||
6987 | } | 7581 | } |
6988 | 7582 | ||
6989 | /* if this is a quad access, setup for special access*/ | 7583 | /* if this is a quad access, setup for special access*/ |
@@ -6993,24 +7587,27 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | |||
6993 | ctx_ops[i].offset); | 7587 | ctx_ops[i].offset); |
6994 | 7588 | ||
6995 | for (j = 0; j < num_offsets; j++) { | 7589 | for (j = 0; j < num_offsets; j++) { |
6996 | /* sanity check, don't write outside, worst case */ | 7590 | /* sanity check gr ctxt offsets, |
6997 | if (offsets[j] >= g->gr.ctx_vars.golden_image_size) | 7591 | * don't write outside, worst case |
7592 | */ | ||
7593 | if ((base_ptr == ctx_ptr) && | ||
7594 | (offsets[j] >= g->gr.ctx_vars.golden_image_size)) | ||
6998 | continue; | 7595 | continue; |
6999 | if (pass == 0) { /* write pass */ | 7596 | if (pass == 0) { /* write pass */ |
7000 | v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0); | 7597 | v = gk20a_mem_rd32(base_ptr + offsets[j], 0); |
7001 | v &= ~ctx_ops[i].and_n_mask_lo; | 7598 | v &= ~ctx_ops[i].and_n_mask_lo; |
7002 | v |= ctx_ops[i].value_lo; | 7599 | v |= ctx_ops[i].value_lo; |
7003 | gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v); | 7600 | gk20a_mem_wr32(base_ptr + offsets[j], 0, v); |
7004 | 7601 | ||
7005 | gk20a_dbg(gpu_dbg_gpu_dbg, | 7602 | gk20a_dbg(gpu_dbg_gpu_dbg, |
7006 | "context wr: offset=0x%x v=0x%x", | 7603 | "context wr: offset=0x%x v=0x%x", |
7007 | offsets[j], v); | 7604 | offsets[j], v); |
7008 | 7605 | ||
7009 | if (ctx_ops[i].op == REGOP(WRITE_64)) { | 7606 | if (ctx_ops[i].op == REGOP(WRITE_64)) { |
7010 | v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0); | 7607 | v = gk20a_mem_rd32(base_ptr + offsets[j] + 4, 0); |
7011 | v &= ~ctx_ops[i].and_n_mask_hi; | 7608 | v &= ~ctx_ops[i].and_n_mask_hi; |
7012 | v |= ctx_ops[i].value_hi; | 7609 | v |= ctx_ops[i].value_hi; |
7013 | gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v); | 7610 | gk20a_mem_wr32(base_ptr + offsets[j] + 4, 0, v); |
7014 | 7611 | ||
7015 | gk20a_dbg(gpu_dbg_gpu_dbg, | 7612 | gk20a_dbg(gpu_dbg_gpu_dbg, |
7016 | "context wr: offset=0x%x v=0x%x", | 7613 | "context wr: offset=0x%x v=0x%x", |
@@ -7020,18 +7617,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | |||
7020 | /* check to see if we need to add a special WAR | 7617 | /* check to see if we need to add a special WAR |
7021 | for some of the SMPC perf regs */ | 7618 | for some of the SMPC perf regs */ |
7022 | gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], | 7619 | gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], |
7023 | v, ctx_ptr); | 7620 | v, base_ptr); |
7024 | 7621 | ||
7025 | } else { /* read pass */ | 7622 | } else { /* read pass */ |
7026 | ctx_ops[i].value_lo = | 7623 | ctx_ops[i].value_lo = |
7027 | gk20a_mem_rd32(ctx_ptr + offsets[0], 0); | 7624 | gk20a_mem_rd32(base_ptr + offsets[0], 0); |
7028 | 7625 | ||
7029 | gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", | 7626 | gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", |
7030 | offsets[0], ctx_ops[i].value_lo); | 7627 | offsets[0], ctx_ops[i].value_lo); |
7031 | 7628 | ||
7032 | if (ctx_ops[i].op == REGOP(READ_64)) { | 7629 | if (ctx_ops[i].op == REGOP(READ_64)) { |
7033 | ctx_ops[i].value_hi = | 7630 | ctx_ops[i].value_hi = |
7034 | gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0); | 7631 | gk20a_mem_rd32(base_ptr + offsets[0] + 4, 0); |
7035 | 7632 | ||
7036 | gk20a_dbg(gpu_dbg_gpu_dbg, | 7633 | gk20a_dbg(gpu_dbg_gpu_dbg, |
7037 | "context rd: offset=0x%x v=0x%x", | 7634 | "context rd: offset=0x%x v=0x%x", |
@@ -7062,6 +7659,9 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | |||
7062 | if (ctx_ptr) | 7659 | if (ctx_ptr) |
7063 | vunmap(ctx_ptr); | 7660 | vunmap(ctx_ptr); |
7064 | 7661 | ||
7662 | if (pm_ctx_ptr) | ||
7663 | vunmap(pm_ctx_ptr); | ||
7664 | |||
7065 | if (restart_gr_ctxsw) { | 7665 | if (restart_gr_ctxsw) { |
7066 | int tmp_err = gr_gk20a_enable_ctxsw(g); | 7666 | int tmp_err = gr_gk20a_enable_ctxsw(g); |
7067 | if (tmp_err) { | 7667 | if (tmp_err) { |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 2c575534..c82cf75c 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h | |||
@@ -198,8 +198,13 @@ struct gr_gk20a { | |||
198 | u32 golden_image_size; | 198 | u32 golden_image_size; |
199 | u32 *local_golden_image; | 199 | u32 *local_golden_image; |
200 | 200 | ||
201 | u32 hwpm_ctxsw_buffer_offset_map_count; | ||
202 | struct ctxsw_buf_offset_map_entry *hwpm_ctxsw_buffer_offset_map; | ||
203 | |||
201 | u32 zcull_ctxsw_image_size; | 204 | u32 zcull_ctxsw_image_size; |
202 | 205 | ||
206 | u32 pm_ctxsw_image_size; | ||
207 | |||
203 | u32 buffer_header_size; | 208 | u32 buffer_header_size; |
204 | 209 | ||
205 | u32 priv_access_map_size; | 210 | u32 priv_access_map_size; |
@@ -219,6 +224,14 @@ struct gr_gk20a { | |||
219 | struct aiv_list_gk20a pm_sys; | 224 | struct aiv_list_gk20a pm_sys; |
220 | struct aiv_list_gk20a pm_gpc; | 225 | struct aiv_list_gk20a pm_gpc; |
221 | struct aiv_list_gk20a pm_tpc; | 226 | struct aiv_list_gk20a pm_tpc; |
227 | struct aiv_list_gk20a pm_ppc; | ||
228 | struct aiv_list_gk20a perf_sys; | ||
229 | struct aiv_list_gk20a perf_gpc; | ||
230 | struct aiv_list_gk20a fbp; | ||
231 | struct aiv_list_gk20a fbp_router; | ||
232 | struct aiv_list_gk20a gpc_router; | ||
233 | struct aiv_list_gk20a pm_ltc; | ||
234 | struct aiv_list_gk20a pm_fbpa; | ||
222 | } ctxsw_regs; | 235 | } ctxsw_regs; |
223 | int regs_base_index; | 236 | int regs_base_index; |
224 | bool valid; | 237 | bool valid; |
@@ -484,9 +497,17 @@ int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, | |||
484 | u32 *offsets, u32 *offset_addrs, | 497 | u32 *offsets, u32 *offset_addrs, |
485 | u32 *num_offsets, | 498 | u32 *num_offsets, |
486 | bool is_quad, u32 quad); | 499 | bool is_quad, u32 quad); |
500 | int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g, | ||
501 | u32 addr, | ||
502 | u32 max_offsets, | ||
503 | u32 *offsets, u32 *offset_addrs, | ||
504 | u32 *num_offsets); | ||
487 | int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, | 505 | int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, |
488 | struct channel_gk20a *c, | 506 | struct channel_gk20a *c, |
489 | bool enable_smpc_ctxsw); | 507 | bool enable_smpc_ctxsw); |
508 | int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, | ||
509 | struct channel_gk20a *c, | ||
510 | bool enable_hwpm_ctxsw); | ||
490 | 511 | ||
491 | struct channel_ctx_gk20a; | 512 | struct channel_ctx_gk20a; |
492 | int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, | 513 | int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, |
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h index da555f7c..08834557 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h | |||
@@ -94,6 +94,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void) | |||
94 | { | 94 | { |
95 | return 0x7 << 0; | 95 | return 0x7 << 0; |
96 | } | 96 | } |
97 | static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void) | ||
98 | { | ||
99 | return 0x1; | ||
100 | } | ||
97 | static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void) | 101 | static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void) |
98 | { | 102 | { |
99 | return 0x0; | 103 | return 0x0; |
diff --git a/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h index 93c55c30..ce10db35 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
@@ -70,6 +70,10 @@ static inline u32 proj_lts_stride_v(void) | |||
70 | { | 70 | { |
71 | return 0x00000400; | 71 | return 0x00000400; |
72 | } | 72 | } |
73 | static inline u32 proj_fbpa_stride_v(void) | ||
74 | { | ||
75 | return 0x00001000; | ||
76 | } | ||
73 | static inline u32 proj_ppc_in_gpc_base_v(void) | 77 | static inline u32 proj_ppc_in_gpc_base_v(void) |
74 | { | 78 | { |
75 | return 0x00003000; | 79 | return 0x00003000; |
@@ -114,6 +118,10 @@ static inline u32 proj_scal_litter_num_fbps_v(void) | |||
114 | { | 118 | { |
115 | return 0x00000001; | 119 | return 0x00000001; |
116 | } | 120 | } |
121 | static inline u32 proj_scal_litter_num_fbpas_v(void) | ||
122 | { | ||
123 | return 0x00000001; | ||
124 | } | ||
117 | static inline u32 proj_scal_litter_num_gpcs_v(void) | 125 | static inline u32 proj_scal_litter_num_gpcs_v(void) |
118 | { | 126 | { |
119 | return 0x00000001; | 127 | return 0x00000001; |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 368b32d3..833d896d 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GK20A memory management | 2 | * GK20A memory management |
3 | * | 3 | * |
4 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -73,6 +73,11 @@ struct zcull_ctx_desc { | |||
73 | u32 ctx_sw_mode; | 73 | u32 ctx_sw_mode; |
74 | }; | 74 | }; |
75 | 75 | ||
76 | struct pm_ctx_desc { | ||
77 | struct mem_desc mem; | ||
78 | u32 pm_mode; | ||
79 | }; | ||
80 | |||
76 | struct gk20a; | 81 | struct gk20a; |
77 | struct gr_ctx_buffer_desc { | 82 | struct gr_ctx_buffer_desc { |
78 | void (*destroy)(struct gk20a *, struct gr_ctx_buffer_desc *); | 83 | void (*destroy)(struct gk20a *, struct gr_ctx_buffer_desc *); |
diff --git a/drivers/gpu/nvgpu/gk20a/regops_gk20a.c b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c index 1696f759..e6162af2 100644 --- a/drivers/gpu/nvgpu/gk20a/regops_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Tegra GK20A GPU Debugger Driver Register Ops | 2 | * Tegra GK20A GPU Debugger Driver Register Ops |
3 | * | 3 | * |
4 | * Copyright (c) 2013-2015, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2013-2016, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -644,22 +644,31 @@ static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s, | |||
644 | valid = check_whitelists(dbg_s, op, offset + 4); | 644 | valid = check_whitelists(dbg_s, op, offset + 4); |
645 | 645 | ||
646 | if (valid && (op->type != REGOP(TYPE_GLOBAL))) { | 646 | if (valid && (op->type != REGOP(TYPE_GLOBAL))) { |
647 | err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g, | 647 | err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g, |
648 | op->offset, | ||
649 | 1, | ||
650 | &buf_offset_lo, | ||
651 | &buf_offset_addr, | ||
652 | &num_offsets, | ||
653 | op->type == REGOP(TYPE_GR_CTX_QUAD), | ||
654 | op->quad); | ||
655 | if (err) { | ||
656 | err = gr_gk20a_get_pm_ctx_buffer_offsets(dbg_s->g, | ||
648 | op->offset, | 657 | op->offset, |
649 | 1, | 658 | 1, |
650 | &buf_offset_lo, | 659 | &buf_offset_lo, |
651 | &buf_offset_addr, | 660 | &buf_offset_addr, |
652 | &num_offsets, | 661 | &num_offsets); |
653 | op->type == REGOP(TYPE_GR_CTX_QUAD), | 662 | |
654 | op->quad); | ||
655 | if (err) { | 663 | if (err) { |
656 | op->status |= REGOP(STATUS_INVALID_OFFSET); | 664 | op->status |= REGOP(STATUS_INVALID_OFFSET); |
657 | return -EINVAL; | 665 | return -EINVAL; |
658 | } | 666 | } |
659 | if (!buf_offset_lo) { | 667 | } |
660 | op->status |= REGOP(STATUS_INVALID_OFFSET); | 668 | if (!buf_offset_lo) { |
661 | return -EINVAL; | 669 | op->status |= REGOP(STATUS_INVALID_OFFSET); |
662 | } | 670 | return -EINVAL; |
671 | } | ||
663 | } | 672 | } |
664 | 673 | ||
665 | if (!valid) { | 674 | if (!valid) { |
diff --git a/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h index f9531ae1..b837918c 100644 --- a/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/hw_proj_gm20b.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
@@ -70,6 +70,10 @@ static inline u32 proj_lts_stride_v(void) | |||
70 | { | 70 | { |
71 | return 0x00000200; | 71 | return 0x00000200; |
72 | } | 72 | } |
73 | static inline u32 proj_fbpa_stride_v(void) | ||
74 | { | ||
75 | return 0x00001000; | ||
76 | } | ||
73 | static inline u32 proj_ppc_in_gpc_base_v(void) | 77 | static inline u32 proj_ppc_in_gpc_base_v(void) |
74 | { | 78 | { |
75 | return 0x00003000; | 79 | return 0x00003000; |
@@ -114,6 +118,10 @@ static inline u32 proj_scal_litter_num_fbps_v(void) | |||
114 | { | 118 | { |
115 | return 0x00000001; | 119 | return 0x00000001; |
116 | } | 120 | } |
121 | static inline u32 proj_scal_litter_num_fbpas_v(void) | ||
122 | { | ||
123 | return 0x00000001; | ||
124 | } | ||
117 | static inline u32 proj_scal_litter_num_gpcs_v(void) | 125 | static inline u32 proj_scal_litter_num_gpcs_v(void) |
118 | { | 126 | { |
119 | return 0x00000001; | 127 | return 0x00000001; |
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index 45d1c217..a75a5ae0 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h | |||
@@ -647,8 +647,20 @@ struct nvgpu_dbg_gpu_set_next_stop_trigger_type_args { | |||
647 | _IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 12, struct nvgpu_dbg_gpu_set_next_stop_trigger_type_args) | 647 | _IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 12, struct nvgpu_dbg_gpu_set_next_stop_trigger_type_args) |
648 | 648 | ||
649 | 649 | ||
650 | /* PM Context Switch Mode */ | ||
651 | #define NVGPU_DBG_GPU_HWPM_CTXSW_MODE_NO_CTXSW (0x00000000) | ||
652 | #define NVGPU_DBG_GPU_HWPM_CTXSW_MODE_CTXSW (0x00000001) | ||
653 | |||
654 | struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args { | ||
655 | __u32 mode; | ||
656 | __u32 reserved; | ||
657 | }; | ||
658 | |||
659 | #define NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE \ | ||
660 | _IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 13, struct nvgpu_dbg_gpu_hwpm_ctxsw_mode_args) | ||
661 | |||
650 | #define NVGPU_DBG_GPU_IOCTL_LAST \ | 662 | #define NVGPU_DBG_GPU_IOCTL_LAST \ |
651 | _IOC_NR(NVGPU_DBG_GPU_IOCTL_SET_NEXT_STOP_TRIGGER_TYPE) | 663 | _IOC_NR(NVGPU_DBG_GPU_IOCTL_HWPM_CTXSW_MODE) |
652 | 664 | ||
653 | #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \ | 665 | #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \ |
654 | sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args) | 666 | sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args) |