From 2db5e4794e37952bdbd2882c22ba810a45e9ea84 Mon Sep 17 00:00:00 2001
From: Terje Bergstrom <tbergstrom@nvidia.com>
Date: Mon, 25 Apr 2016 14:10:40 -0700
Subject: gpu: nvgpu: Fix floorsweeping for multi-GPC GPU

There were multiple bugs in dealing with a GPU with more than one
GPC.

* Beta CB size was written to the wrong PPC register index
* TPC mask did not shift each GPC's field into position
* PD skip table used || instead of the | operator

Change-Id: I849e2331a943586df16996fe573da2a0ac4cce19
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/1132109
---
 drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 52 +++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 20 deletions(-)

(limited to 'drivers/gpu/nvgpu/gm20b/gr_gm20b.c')

diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 5b00078f..35bbe70c 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -175,10 +175,11 @@ static int gr_gm20b_commit_global_cb_manager(struct gk20a *g,
 	u32 alpha_offset_in_chunk = 0;
 	u32 pd_ab_max_output;
 	u32 gpc_index, ppc_index;
-	u32 temp;
 	u32 cbm_cfg_size1, cbm_cfg_size2;
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
+	u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
+			GPU_LIT_NUM_PES_PER_GPC);
 
 	gk20a_dbg_fn("");
 
@@ -199,7 +200,8 @@ static int gr_gm20b_commit_global_cb_manager(struct gk20a *g,
 		gr->tpc_count * gr->attrib_cb_size;
 
 	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
-		temp = gpc_stride * gpc_index;
+		u32 temp = gpc_stride * gpc_index;
+		u32 temp2 = num_pes_per_gpc * gpc_index;
 		for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
 		     ppc_index++) {
 			cbm_cfg_size1 = gr->attrib_cb_default_size *
@@ -234,7 +236,7 @@ static int gr_gm20b_commit_global_cb_manager(struct gk20a *g,
 				gr->pes_tpc_count[ppc_index][gpc_index];
 
 			gr_gk20a_ctx_patch_write(g, ch_ctx,
-				gr_gpcs_swdx_tc_beta_cb_size_r(ppc_index + gpc_index),
+				gr_gpcs_swdx_tc_beta_cb_size_r(ppc_index + temp2),
 				gr_gpcs_swdx_tc_beta_cb_size_v_f(cbm_cfg_size1) |
 				gr_gpcs_swdx_tc_beta_cb_size_div3_f(cbm_cfg_size1/3),
 				patch);
@@ -523,6 +525,28 @@ static void gr_gm20b_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
 	}
 }
 
+/* Write gr_fe_tpc_fs_r() with the combined PES TPC masks or the user mask. */
+static void gr_gm20b_load_tpc_mask(struct gk20a *g)
+{
+	u32 gpc, pes, tpc_fs = 0;
+	u32 num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC);
+	/* 0x1U keeps the shift unsigned; a signed 0x1 << 31 is undefined. */
+	u32 all_tpcs = (0x1U << g->gr.max_tpc_count) - 1U;
+
+	for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
+		for (pes = 0; pes < g->gr.pe_count_per_gpc; pes++) {
+			tpc_fs |= g->gr.pes_tpc_mask[pes][gpc] <<
+					(num_tpc_per_gpc * gpc);
+		}
+	}
+
+	/* User mask wins only if GPC 0's TPC mask has all max_tpc_count bits. */
+	if (g->tpc_fs_mask_user &&
+	    g->ops.gr.get_gpc_tpc_mask(g, 0) == all_tpcs)
+		tpc_fs = g->tpc_fs_mask_user & all_tpcs;
+	gk20a_writel(g, gr_fe_tpc_fs_r(), tpc_fs);
+}
+
 int gr_gm20b_ctx_state_floorsweep(struct gk20a *g)
 {
 	struct gr_gk20a *gr = &g->gr;
@@ -531,7 +555,6 @@ int gr_gm20b_ctx_state_floorsweep(struct gk20a *g)
 	u32 sm_id = 0;
 	u32 tpc_per_gpc = 0;
 	u32 tpc_sm_id = 0, gpc_tpc_id = 0;
-	u32 pes_tpc_mask = 0, pes_index;
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 
@@ -576,9 +599,9 @@ int gr_gm20b_ctx_state_floorsweep(struct gk20a *g)
 	     gpc_index += 4) {
 
 		gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
-			     gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) ||
-			     gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) ||
-			     gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) ||
+			     gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
+			     gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
+			     gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
 			     gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
 	}
 
@@ -586,6 +609,8 @@ int gr_gm20b_ctx_state_floorsweep(struct gk20a *g)
 		     gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
 		     gr_cwd_fs_num_tpcs_f(gr->tpc_count));
 
+	gr_gm20b_load_tpc_mask(g);
+
 	gk20a_writel(g, gr_bes_zrop_settings_r(),
 		     gr_bes_zrop_settings_num_active_ltcs_f(gr->num_fbps));
 	gk20a_writel(g, gr_bes_crop_settings_r(),
@@ -595,19 +620,6 @@ int gr_gm20b_ctx_state_floorsweep(struct gk20a *g)
 		     gk20a_readl(g, gr_be0_crop_debug3_r()) |
 		     gr_bes_crop_debug3_comp_vdc_4to2_disable_m());
 
-	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
-		for (pes_index = 0; pes_index < gr->pe_count_per_gpc;
-								pes_index++)
-			pes_tpc_mask |= gr->pes_tpc_mask[pes_index][gpc_index];
-	if (g->tpc_fs_mask_user && g->ops.gr.get_gpc_tpc_mask(g, 0) ==
-				(0x1 << gr->max_tpc_count) - 1) {
-		u32 val = g->tpc_fs_mask_user;
-		val &= (0x1 << gr->max_tpc_count) - 1;
-		gk20a_writel(g, gr_fe_tpc_fs_r(), val);
-	} else {
-		gk20a_writel(g, gr_fe_tpc_fs_r(), pes_tpc_mask);
-	}
-
 	for (tpc_index = 0; tpc_index < gr->tpc_count; tpc_index++) {
 		if (tpc_index == 0) {
 			gpc_tpc_id |= gr_cwd_gpc_tpc_id_tpc0_f(tpc_index);
-- 
cgit v1.2.2