From 3e5c123862c87e22311c21558178f287f85ecb5d Mon Sep 17 00:00:00 2001
From: Terje Bergstrom <tbergstrom@nvidia.com>
Date: Thu, 8 May 2014 09:09:49 +0300
Subject: gpu: nvgpu: Always initialize system vm

PMU, FECS and GPCCS use the same address space. We used to initialize
that address space only if PMU was enabled. Always create the system
address space instead.

FECS and GPCCS used to have two methods for loading ucode: a slower
bit-bang method and a faster DMA method. The slower method is needed
only when FECS and GPCCS do not have an address space. Because the
system address space is now always created, remove the slower method
as it is no longer needed.

Change-Id: I155619741ecc36aa6bf13a9c1ccb03c7c1330f0a
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/406771
---
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c  | 151 +-----------------------------------
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c  |   8 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h  |   2 -
 drivers/gpu/nvgpu/gk20a/pmu_gk20a.c |   2 -
 4 files changed, 9 insertions(+), 154 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index e6bdf52c..3dbf1435 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -162,123 +162,6 @@ void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
 	}
 }
 
-static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
-{
-	u32 i, ucode_u32_size;
-	const u32 *ucode_u32_data;
-	u32 checksum;
-
-	gk20a_dbg_fn("");
-
-	gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
-					      gr_gpccs_dmemc_blk_f(0)  |
-					      gr_gpccs_dmemc_aincw_f(1)));
-
-	ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
-	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
-
-	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
-		gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
-		checksum += ucode_u32_data[i];
-	}
-
-	gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
-					     gr_fecs_dmemc_blk_f(0)  |
-					     gr_fecs_dmemc_aincw_f(1)));
-
-	ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
-	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
-
-	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
-		gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
-		checksum += ucode_u32_data[i];
-	}
-	gk20a_dbg_fn("done");
-}
-
-static void gr_gk20a_load_falcon_imem(struct gk20a *g)
-{
-	u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
-	const u32 *ucode_u32_data;
-	u32 tag, i, pad_start, pad_end;
-	u32 checksum;
-
-	gk20a_dbg_fn("");
-
-	cfg = gk20a_readl(g, gr_fecs_cfg_r());
-	fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
-
-	cfg = gk20a_readl(g, gr_gpc0_cfg_r());
-	gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
-
-	/* Use the broadcast address to access all of the GPCCS units. */
-	gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
-					      gr_gpccs_imemc_blk_f(0) |
-					      gr_gpccs_imemc_aincw_f(1)));
-
-	/* Setup the tags for the instruction memory. */
-	tag = 0;
-	gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
-
-	ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
-	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
-
-	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
-		if (i && ((i % (256/sizeof(u32))) == 0)) {
-			tag++;
-			gk20a_writel(g, gr_gpccs_imemt_r(0),
-				      gr_gpccs_imemt_tag_f(tag));
-		}
-		gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
-		checksum += ucode_u32_data[i];
-	}
-
-	pad_start = i*4;
-	pad_end = pad_start+(256-pad_start%256)+256;
-	for (i = pad_start;
-	     (i < gpccs_imem_size * 256) && (i < pad_end);
-	     i += 4) {
-		if (i && ((i % 256) == 0)) {
-			tag++;
-			gk20a_writel(g, gr_gpccs_imemt_r(0),
-				      gr_gpccs_imemt_tag_f(tag));
-		}
-		gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
-	}
-
-	gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
-					     gr_fecs_imemc_blk_f(0) |
-					     gr_fecs_imemc_aincw_f(1)));
-
-	/* Setup the tags for the instruction memory. */
-	tag = 0;
-	gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
-
-	ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
-	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
-
-	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
-		if (i && ((i % (256/sizeof(u32))) == 0)) {
-			tag++;
-			gk20a_writel(g, gr_fecs_imemt_r(0),
-				      gr_fecs_imemt_tag_f(tag));
-		}
-		gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
-		checksum += ucode_u32_data[i];
-	}
-
-	pad_start = i*4;
-	pad_end = pad_start+(256-pad_start%256)+256;
-	for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
-		if (i && ((i % 256) == 0)) {
-			tag++;
-			gk20a_writel(g, gr_fecs_imemt_r(0),
-				      gr_fecs_imemt_tag_f(tag));
-		}
-		gk20a_writel(g, gr_fecs_imemd_r(0), 0);
-	}
-}
-
 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
 		u32 expect_delay)
 {
@@ -1784,22 +1667,6 @@ static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
 	return ret;
 }
 
-static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
-{
-	gk20a_dbg_fn("");
-
-	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
-		     gr_fecs_ctxsw_mailbox_clear_value_f(~0));
-
-	gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
-	gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
-
-	gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
-	gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
-
-	gk20a_dbg_fn("done");
-}
-
 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
 {
 	struct mm_gk20a *mm = &g->mm;
@@ -2184,20 +2051,10 @@ static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
 			gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
 	}
 
-	/*
-	 * In case the gPMU falcon is not being used, revert to the old way of
-	 * loading gr ucode, without the faster bootstrap routine.
-	 */
-	if (!support_gk20a_pmu()) {
-		gr_gk20a_load_falcon_dmem(g);
-		gr_gk20a_load_falcon_imem(g);
-		gr_gk20a_start_falcon_ucode(g);
-	} else {
-		if (!gr->skip_ucode_init)
-			gr_gk20a_init_ctxsw_ucode(g);
-		gr_gk20a_load_falcon_with_bootloader(g);
-		gr->skip_ucode_init = true;
-	}
+	if (!gr->skip_ucode_init)
+		gr_gk20a_init_ctxsw_ucode(g);
+	gr_gk20a_load_falcon_with_bootloader(g);
+	gr->skip_ucode_init = true;
 
 	ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
 				      GR_IS_UCODE_OP_EQUAL,
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index be22e4a0..234b43c2 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -113,7 +113,8 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 				   int rw_flag);
 static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
 static void gk20a_vm_remove_support(struct vm_gk20a *vm);
-
+static int gk20a_init_system_vm(struct mm_gk20a *mm);
+static int gk20a_init_bar1_vm(struct mm_gk20a *mm);
 
 /* note: keep the page sizes sorted lowest to highest here */
 static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
@@ -341,6 +342,7 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 
 
 	gk20a_init_bar1_vm(mm);
+	gk20a_init_system_vm(mm);
 
 	mm->remove_support = gk20a_remove_mm_support;
 	mm->sw_ready = true;
@@ -2486,7 +2488,7 @@ int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
 	return 0;
 }
 
-int gk20a_init_bar1_vm(struct mm_gk20a *mm)
+static int gk20a_init_bar1_vm(struct mm_gk20a *mm)
 {
 	int err;
 	phys_addr_t inst_pa;
@@ -2630,7 +2632,7 @@ clean_up:
 }
 
 /* pmu vm, share channel_vm interfaces */
-int gk20a_init_pmu_vm(struct mm_gk20a *mm)
+static int gk20a_init_system_vm(struct mm_gk20a *mm)
 {
 	int err;
 	phys_addr_t inst_pa;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 4dfc2b7d..c759718e 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -285,8 +285,6 @@ struct channel_gk20a;
 
 int gk20a_init_mm_support(struct gk20a *g);
 int gk20a_init_mm_setup_sw(struct gk20a *g);
-int gk20a_init_bar1_vm(struct mm_gk20a *mm);
-int gk20a_init_pmu_vm(struct mm_gk20a *mm);
 
 int gk20a_mm_fb_flush(struct gk20a *g);
 void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index c12496bb..ac01302e 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -1588,8 +1588,6 @@ int gk20a_init_pmu_setup_sw(struct gk20a *g)
 	INIT_DELAYED_WORK(&pmu->elpg_enable, pmu_elpg_enable_allow);
 	INIT_WORK(&pmu->pg_init, gk20a_init_pmu_setup_hw2_workqueue);
 
-	gk20a_init_pmu_vm(mm);
-
 	dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
 	pmu->ucode.cpuva = dma_alloc_attrs(d, GK20A_PMU_UCODE_SIZE_MAX,
 					&iova,
-- 
cgit v1.2.2