2 files changed, 24 insertions, 73 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index fb777948..0c64fcba 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -641,81 +641,37 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
 }
 /*
- * Context state can be written directly or "patched" at times.
+ * Context state can be written directly, or "patched" at times. So that code
- * So that code can be used in either situation it is written
+ * can be used in either situation it is written using a series of
- * using a series _ctx_patch_write(..., patch) statements.
+ * _ctx_patch_write(..., patch) statements. However any necessary map overhead
- * However any necessary cpu map/unmap and gpu l2 invalidates
+ * should be minimized; thus, bundle the sequence of these writes together, and
- * should be minimized (to avoid doing it once per patch write).
+ * set them up and close with _ctx_patch_write_begin/_ctx_patch_write_end.
- * Before a sequence of these set up with "_ctx_patch_write_begin"
- * and close with "_ctx_patch_write_end."
 */
 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
                                          struct channel_ctx_gk20a *ch_ctx)
 {
-        /* being defensive still... */
+        return gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem);
-        if (WARN_ON(ch_ctx->patch_ctx.mem.cpu_va)) {
-                gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
-                return -EBUSY;
-        }
-        if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem))
-                return -ENOMEM;
-        return 0;
 }
-int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
+void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
                                        struct channel_ctx_gk20a *ch_ctx)
 {
-        /* being defensive still... */
-        if (!ch_ctx->patch_ctx.mem.cpu_va) {
-                gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
-                return -EINVAL;
-        }
        gk20a_mem_end(g, &ch_ctx->patch_ctx.mem);
-        return 0;
 }
-int gr_gk20a_ctx_patch_write(struct gk20a *g,
+void gr_gk20a_ctx_patch_write(struct gk20a *g,
                                    struct channel_ctx_gk20a *ch_ctx,
                                    u32 addr, u32 data, bool patch)
 {
-        u32 patch_slot = 0;
-        bool mapped_here = false;
-        BUG_ON(patch != 0 && ch_ctx == NULL);
        if (patch) {
-                if (!ch_ctx)
+                u32 patch_slot = ch_ctx->patch_ctx.data_count * 2;
-                        return -EINVAL;
+                gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot, addr);
-                /* we added an optimization prolog, epilog
+                gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot + 1, data);
-                 * to get rid of unnecessary maps and l2 invals.
-                 * but be defensive still... */
-                if (!ch_ctx->patch_ctx.mem.cpu_va) {
-                        int err;
-                        gk20a_dbg_info("per-write ctx patch begin?");
-                        err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
-                        if (err)
-                                return err;
-                        mapped_here = true;
-                } else
-                        mapped_here = false;
-                patch_slot = ch_ctx->patch_ctx.data_count * 2;
-                gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr);
-                gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data);
                ch_ctx->patch_ctx.data_count++;
+        } else {
-                if (mapped_here)
-                        gr_gk20a_ctx_patch_write_end(g, ch_ctx);
-        } else
                gk20a_writel(g, addr, data);
+        }
-        return 0;
 }
 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
@@ -3105,7 +3061,6 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
        /* tweak any perf parameters per-context here */
        if (args->class_num == KEPLER_COMPUTE_A) {
-                int begin_err;
                u32 tex_lock_disable_mask;
                u32 texlock;
                u32 lockboost_mask;
@@ -3144,24 +3099,20 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
                lockboost = (lockboost & ~lockboost_mask) |
                        gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(0);
-                begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+                err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
-                if (!begin_err) {
+                if (!err) {
-                        err = gr_gk20a_ctx_patch_write(g, ch_ctx,
+                        gr_gk20a_ctx_patch_write(g, ch_ctx,
                                gr_gpcs_tpcs_sm_sch_texlock_r(),
                                texlock, true);
+                        gr_gk20a_ctx_patch_write(g, ch_ctx,
-                        if (!err)
+                                gr_gpcs_tpcs_sm_sch_macro_sched_r(),
-                                err = gr_gk20a_ctx_patch_write(g, ch_ctx,
+                                lockboost, true);
-                                        gr_gpcs_tpcs_sm_sch_macro_sched_r(),
+                        gr_gk20a_ctx_patch_write_end(g, ch_ctx);
-                                        lockboost, true);
+                } else {
-                }
-                if ((begin_err || err)) {
                        gk20a_err(dev_from_gk20a(g),
                                   "failed to set texlock for compute class");
                }
-                if (!begin_err)
-                        gr_gk20a_ctx_patch_write_end(g, ch_ctx);
                args->flags |= NVGPU_ALLOC_OBJ_FLAGS_LOCKBOOST_ZERO;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 189994ef..ad6d8049 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -530,11 +530,11 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
                                  bool enable_hwpm_ctxsw);
 struct channel_ctx_gk20a;
-int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
+void gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
                                    u32 addr, u32 data, bool patch);
 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
                                          struct channel_ctx_gk20a *ch_ctx);
-int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
+void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
                                        struct channel_ctx_gk20a *ch_ctx);
 void gr_gk20a_commit_global_pagepool(struct gk20a *g,
                                     struct channel_ctx_gk20a *ch_ctx,

diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index fb777948..0c64fcba 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -641,81 +641,37 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
641	}	641	}
642		642
643	/*	643	/*
644	* Context state can be written directly or "patched" at times.	644	* Context state can be written directly, or "patched" at times. So that code
645	* So that code can be used in either situation it is written	645	* can be used in either situation it is written using a series of
646	* using a series _ctx_patch_write(..., patch) statements.	646	* _ctx_patch_write(..., patch) statements. However any necessary map overhead
647	* However any necessary cpu map/unmap and gpu l2 invalidates	647	* should be minimized; thus, bundle the sequence of these writes together, and
648	* should be minimized (to avoid doing it once per patch write).	648	* set them up and close with _ctx_patch_write_begin/_ctx_patch_write_end.
649	* Before a sequence of these set up with "_ctx_patch_write_begin"
650	* and close with "_ctx_patch_write_end."
651	*/	649	*/
		650
652	int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,	651	int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
653	struct channel_ctx_gk20a *ch_ctx)	652	struct channel_ctx_gk20a *ch_ctx)
654	{	653	{
655	/* being defensive still... */	654	return gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem);
656	if (WARN_ON(ch_ctx->patch_ctx.mem.cpu_va)) {
657	gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
658	return -EBUSY;
659	}
660
661	if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem))
662	return -ENOMEM;
663
664	return 0;
665	}	655	}
666		656
667	int gr_gk20a_ctx_patch_write_end(struct gk20a *g,	657	void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
668	struct channel_ctx_gk20a *ch_ctx)	658	struct channel_ctx_gk20a *ch_ctx)
669	{	659	{
670	/* being defensive still... */
671	if (!ch_ctx->patch_ctx.mem.cpu_va) {
672	gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
673	return -EINVAL;
674	}
675
676	gk20a_mem_end(g, &ch_ctx->patch_ctx.mem);	660	gk20a_mem_end(g, &ch_ctx->patch_ctx.mem);
677	return 0;
678	}	661	}
679		662
680	int gr_gk20a_ctx_patch_write(struct gk20a *g,	663	void gr_gk20a_ctx_patch_write(struct gk20a *g,
681	struct channel_ctx_gk20a *ch_ctx,	664	struct channel_ctx_gk20a *ch_ctx,
682	u32 addr, u32 data, bool patch)	665	u32 addr, u32 data, bool patch)
683	{	666	{
684	u32 patch_slot = 0;
685	bool mapped_here = false;
686
687	BUG_ON(patch != 0 && ch_ctx == NULL);
688
689	if (patch) {	667	if (patch) {
690	if (!ch_ctx)	668	u32 patch_slot = ch_ctx->patch_ctx.data_count * 2;
691	return -EINVAL;	669	gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot, addr);
692	/* we added an optimization prolog, epilog	670	gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot + 1, data);
693	* to get rid of unnecessary maps and l2 invals.
694	* but be defensive still... */
695	if (!ch_ctx->patch_ctx.mem.cpu_va) {
696	int err;
697	gk20a_dbg_info("per-write ctx patch begin?");
698	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
699	if (err)
700	return err;
701	mapped_here = true;
702	} else
703	mapped_here = false;
704
705	patch_slot = ch_ctx->patch_ctx.data_count * 2;
706
707	gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr);
708	gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data);
709
710	ch_ctx->patch_ctx.data_count++;	671	ch_ctx->patch_ctx.data_count++;
711		672	} else {
712	if (mapped_here)
713	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
714
715	} else
716	gk20a_writel(g, addr, data);	673	gk20a_writel(g, addr, data);
717		674	}
718	return 0;
719	}	675	}
720		676
721	static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,	677	static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
@@ -3105,7 +3061,6 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
3105		3061
3106	/* tweak any perf parameters per-context here */	3062	/* tweak any perf parameters per-context here */
3107	if (args->class_num == KEPLER_COMPUTE_A) {	3063	if (args->class_num == KEPLER_COMPUTE_A) {
3108	int begin_err;
3109	u32 tex_lock_disable_mask;	3064	u32 tex_lock_disable_mask;
3110	u32 texlock;	3065	u32 texlock;
3111	u32 lockboost_mask;	3066	u32 lockboost_mask;
@@ -3144,24 +3099,20 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
3144	lockboost = (lockboost & ~lockboost_mask) |	3099	lockboost = (lockboost & ~lockboost_mask) |
3145	gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(0);	3100	gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(0);
3146		3101
3147	begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);	3102	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
3148		3103
3149	if (!begin_err) {	3104	if (!err) {
3150	err = gr_gk20a_ctx_patch_write(g, ch_ctx,	3105	gr_gk20a_ctx_patch_write(g, ch_ctx,
3151	gr_gpcs_tpcs_sm_sch_texlock_r(),	3106	gr_gpcs_tpcs_sm_sch_texlock_r(),
3152	texlock, true);	3107	texlock, true);
3153		3108	gr_gk20a_ctx_patch_write(g, ch_ctx,
3154	if (!err)	3109	gr_gpcs_tpcs_sm_sch_macro_sched_r(),
3155	err = gr_gk20a_ctx_patch_write(g, ch_ctx,	3110	lockboost, true);
3156	gr_gpcs_tpcs_sm_sch_macro_sched_r(),	3111	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
3157	lockboost, true);	3112	} else {
3158	}
3159	if ((begin_err || err)) {
3160	gk20a_err(dev_from_gk20a(g),	3113	gk20a_err(dev_from_gk20a(g),
3161	"failed to set texlock for compute class");	3114	"failed to set texlock for compute class");
3162	}	3115	}
3163	if (!begin_err)
3164	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
3165		3116
3166	args->flags |= NVGPU_ALLOC_OBJ_FLAGS_LOCKBOOST_ZERO;	3117	args->flags |= NVGPU_ALLOC_OBJ_FLAGS_LOCKBOOST_ZERO;
3167		3118


diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 189994ef..ad6d8049 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -530,11 +530,11 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
530	bool enable_hwpm_ctxsw);	530	bool enable_hwpm_ctxsw);
531		531
532	struct channel_ctx_gk20a;	532	struct channel_ctx_gk20a;
533	int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,	533	void gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
534	u32 addr, u32 data, bool patch);	534	u32 addr, u32 data, bool patch);
535	int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,	535	int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
536	struct channel_ctx_gk20a *ch_ctx);	536	struct channel_ctx_gk20a *ch_ctx);
537	int gr_gk20a_ctx_patch_write_end(struct gk20a *g,	537	void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
538	struct channel_ctx_gk20a *ch_ctx);	538	struct channel_ctx_gk20a *ch_ctx);
539	void gr_gk20a_commit_global_pagepool(struct gk20a *g,	539	void gr_gk20a_commit_global_pagepool(struct gk20a *g,
540	struct channel_ctx_gk20a *ch_ctx,	540	struct channel_ctx_gk20a *ch_ctx,