From 5df3d09e16c9d2f413cea53d16bc8ca42ae42d6e Mon Sep 17 00:00:00 2001
From: Terje Bergstrom <tbergstrom@nvidia.com>
Date: Tue, 9 Dec 2014 10:04:05 +0200
Subject: gpu: nvgpu: gm20b: Enable CTA preemption

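CTA preemption needs to be enabled by setting a value in the context.
Set it for gm20b: pass the object class down to the alloc_gr_ctx op so
that MAXWELL_COMPUTE_B contexts can be marked for CTA preemption.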

Bug 200063473
Bug 1517461

Change-Id: I080cd71b348d08f834fd23ebbe7443dba79224db
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/661299
---
 drivers/gpu/nvgpu/gk20a/gk20a.h    |  2 +-
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 16 +++++++++-----
 drivers/gpu/nvgpu/gk20a/gr_gk20a.h |  2 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h |  4 ++++
 drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 44 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 9bb890ca..184ef168 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -151,7 +151,7 @@ struct gpu_ops {
 		int (*init_ctx_state)(struct gk20a *g);
 		int (*alloc_gr_ctx)(struct gk20a *g,
 			  struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm,
-			  u32 padding);
+			  u32 class, u32 padding);
 		void (*free_gr_ctx)(struct gk20a *g,
 			  struct vm_gk20a *vm,
 			  struct gr_ctx_desc *gr_ctx);
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 4f6c885c..37cccba3 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -67,7 +67,8 @@ static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
 
 /* channel gr ctx buffer */
 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
-					struct channel_gk20a *c, u32 padding);
+					struct channel_gk20a *c,
+					u32 class, u32 padding);
 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
 
 /* channel patch ctx buffer */
@@ -2486,6 +2487,7 @@ static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
 
 int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
 			  struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm,
+			  u32 class,
 			  u32 padding)
 {
 	struct gr_ctx_desc *gr_ctx = NULL;
@@ -2551,7 +2553,7 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
 }
 
 static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
-			struct tsg_gk20a *tsg, u32 padding)
+			struct tsg_gk20a *tsg, u32 class, u32 padding)
 {
 	struct gr_ctx_desc **gr_ctx = &tsg->tsg_gr_ctx;
 	int err;
@@ -2561,7 +2563,7 @@ static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
 		return -ENOMEM;
 	}
 
-	err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, padding);
+	err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, class, padding);
 	if (err)
 		return err;
 
@@ -2570,10 +2572,11 @@ static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
 
 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
 				struct channel_gk20a *c,
+				u32 class,
 				u32 padding)
 {
 	struct gr_ctx_desc **gr_ctx = &c->ch_ctx.gr_ctx;
-	int err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, c->vm, padding);
+	int err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, c->vm, class, padding);
 	if (err)
 		return err;
 
@@ -2767,6 +2770,7 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
 	if (!tsg) {
 		if (!ch_ctx->gr_ctx) {
 			err = gr_gk20a_alloc_channel_gr_ctx(g, c,
+							    args->class_num,
 							    args->padding);
 			if (err) {
 				gk20a_err(dev_from_gk20a(g),
@@ -2786,7 +2790,9 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
 		if (!tsg->tsg_gr_ctx) {
 			tsg->vm = c->vm;
 			gk20a_vm_get(tsg->vm);
-			err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg, args->padding);
+			err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg,
+							args->class_num,
+							args->padding);
 			if (err) {
 				gk20a_err(dev_from_gk20a(g),
 					"fail to allocate TSG gr ctx buffer");
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 309faf3b..f130b830 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -496,7 +496,7 @@ int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
 				   struct fecs_method_op_gk20a op);
 int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
 			  struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm,
-			  u32 padding);
+			  u32 class, u32 padding);
 void gr_gk20a_free_gr_ctx(struct gk20a *g,
 			  struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx);
 #endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index d3ee8670..04f9446b 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -150,11 +150,15 @@ struct gr_ctx_desc {
 	u64 iova;
 	size_t size;
 	u64 gpu_va;
+	int preempt_mode;
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	struct gr_ctx_desc_t18x t18x;
 #endif
 };
 
+#define NVGPU_GR_PREEMPTION_MODE_WFI		0
+#define NVGPU_GR_PREEMPTION_MODE_CTA		2
+
 struct compbit_store_desc {
 	struct page **pages;
 	struct sg_table *sgt;
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index cba51cd6..5f544819 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -775,6 +775,46 @@ static u32 gr_gm20b_pagepool_default_size(struct gk20a *g)
 	return gr_scc_pagepool_total_pages_hwmax_value_v();
 }
 
+int gr_gm20b_alloc_gr_ctx(struct gk20a *g,
+			  struct gr_ctx_desc **gr_ctx, struct vm_gk20a *vm,
+			  u32 class,
+			  u32 flags) /* forwarded as gk20a's 'padding' arg */
+{
+	int err;
+
+	gk20a_dbg_fn("");
+	/* gm20b hook: allocate via the common gk20a allocator first. */
+	err = gr_gk20a_alloc_gr_ctx(g, gr_ctx, vm, class, flags);
+	if (err)
+		return err;
+	/* Only MAXWELL_COMPUTE_B contexts are marked for CTA preemption. */
+	if (class == MAXWELL_COMPUTE_B)
+		(*gr_ctx)->preempt_mode = NVGPU_GR_PREEMPTION_MODE_CTA;
+
+	gk20a_dbg_fn("done");
+
+	return 0;
+}
+
+static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g,
+		struct channel_ctx_gk20a *ch_ctx,
+		void *ctx_ptr)
+{
+	struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx;
+	u32 cta_preempt_option =
+		ctxsw_prog_main_image_preemption_options_control_cta_enabled_f();
+
+	gk20a_dbg_fn("");
+	/* CTA-mode contexts only: store the CTA option via ctx_ptr. */
+	if (gr_ctx->preempt_mode == NVGPU_GR_PREEMPTION_MODE_CTA) {
+		gk20a_dbg_info("CTA: %x", cta_preempt_option);
+		gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_preemption_options_o(), 0,
+				cta_preempt_option);
+	}
+
+	gk20a_dbg_fn("done");
+}
+
 void gm20b_init_gr(struct gpu_ops *gops)
 {
 	gops->gr.init_gpc_mmu = gr_gm20b_init_gpc_mmu;
@@ -814,6 +854,8 @@ void gm20b_init_gr(struct gpu_ops *gops)
 	gops->gr.add_zbc_depth = gr_gk20a_add_zbc_depth;
 	gops->gr.pagepool_default_size = gr_gm20b_pagepool_default_size;
 	gops->gr.init_ctx_state = gr_gk20a_init_ctx_state;
-	gops->gr.alloc_gr_ctx = gr_gk20a_alloc_gr_ctx;
+	gops->gr.alloc_gr_ctx = gr_gm20b_alloc_gr_ctx;
 	gops->gr.free_gr_ctx = gr_gk20a_free_gr_ctx;
+	gops->gr.update_ctxsw_preemption_mode =
+		gr_gm20b_update_ctxsw_preemption_mode;
 }
-- 
cgit v1.2.2