author    Konsta Holtta <kholtta@nvidia.com>    2016-05-25 06:13:56 -0400
committer    Terje Bergstrom <tbergstrom@nvidia.com>    2016-06-22 11:30:05 -0400
commit    f438c66598cf169ec7669bc659c7b23587b5bab3 (patch)
tree    0d280f93aa7945d3ca14a3a864fe17836c6345b8 /drivers/gpu/nvgpu/gk20a/gr_gk20a.c
parent    10b75f9cdd2184e9728b7d38dd037330d1c20704 (diff)
gpu: nvgpu: force clean patch ctx begin/end
This patch_context map/unmap pair has become a mere wrapper for the more
general gk20a_mem_{begin,end}(). To be consistent about mappings, require
that each patch_write is surrounded by an explicit begin/end pair, instead
of relying on a possibly inefficient per-write map/unmap.

Also remove the cpu_va check from .._write_end(), since the buffers may
exist in vidmem without a cpu mapping.

JIRA DNVGPU-24

Change-Id: Ia05d52d3d712f2d63730eedc078845fde3e217c1
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1157298
GVS: Gerrit_Virtual_Submit
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
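
For illustration, a minimal caller sketch of the contract this change enforces (not part of the commit): one explicit begin maps the patch buffer, any number of patch writes follow without per-write map/unmap, and one end unmaps it. The helper name, register arguments and header includes are assumptions made for the example; only gr_gk20a_ctx_patch_write_begin/_write/_write_end and gk20a_err()/dev_from_gk20a() come from the code touched by this patch.

/*
 * Hypothetical usage sketch, assuming the gk20a driver headers below.
 * The helper name and its register/value parameters are invented for
 * illustration only.
 */
#include "gk20a.h"     /* struct gk20a, dev_from_gk20a(), gk20a_err() */
#include "gr_gk20a.h"  /* gr_gk20a_ctx_patch_write_begin/_write/_write_end */

static int example_patch_two_regs(struct gk20a *g,
		struct channel_ctx_gk20a *ch_ctx,
		u32 addr0, u32 val0, u32 addr1, u32 val1)
{
	/* Map the patch context once for the whole sequence of writes. */
	int err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);

	if (err) {
		gk20a_err(dev_from_gk20a(g), "failed to map patch ctx");
		return err;
	}

	/* After this change the writes neither map per call nor fail. */
	gr_gk20a_ctx_patch_write(g, ch_ctx, addr0, val0, true);
	gr_gk20a_ctx_patch_write(g, ch_ctx, addr1, val1, true);

	/* Unmap; _write_end() is void now, so there is no error to check. */
	gr_gk20a_ctx_patch_write_end(g, ch_ctx);

	return 0;
}

The updated KEPLER_COMPUTE_A hunk in gk20a_alloc_obj_ctx() below follows the same pattern.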
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/gr_gk20a.c')
-rw-r--r--    drivers/gpu/nvgpu/gk20a/gr_gk20a.c    |    93
 1 file changed, 22 insertions(+), 71 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index fb777948..0c64fcba 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -641,81 +641,37 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
 }
 
 /*
- * Context state can be written directly or "patched" at times.
- * So that code can be used in either situation it is written
- * using a series _ctx_patch_write(..., patch) statements.
- * However any necessary cpu map/unmap and gpu l2 invalidates
- * should be minimized (to avoid doing it once per patch write).
- * Before a sequence of these set up with "_ctx_patch_write_begin"
- * and close with "_ctx_patch_write_end."
+ * Context state can be written directly, or "patched" at times. So that code
+ * can be used in either situation it is written using a series of
+ * _ctx_patch_write(..., patch) statements. However any necessary map overhead
+ * should be minimized; thus, bundle the sequence of these writes together, and
+ * set them up and close with _ctx_patch_write_begin/_ctx_patch_write_end.
  */
+
 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
 		struct channel_ctx_gk20a *ch_ctx)
 {
-	/* being defensive still... */
-	if (WARN_ON(ch_ctx->patch_ctx.mem.cpu_va)) {
-		gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
-		return -EBUSY;
-	}
-
-	if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem))
-		return -ENOMEM;
-
-	return 0;
+	return gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem);
 }
 
-int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
+void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
 		struct channel_ctx_gk20a *ch_ctx)
 {
-	/* being defensive still... */
-	if (!ch_ctx->patch_ctx.mem.cpu_va) {
-		gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
-		return -EINVAL;
-	}
-
 	gk20a_mem_end(g, &ch_ctx->patch_ctx.mem);
-	return 0;
 }
 
-int gr_gk20a_ctx_patch_write(struct gk20a *g,
+void gr_gk20a_ctx_patch_write(struct gk20a *g,
 		struct channel_ctx_gk20a *ch_ctx,
 		u32 addr, u32 data, bool patch)
 {
-	u32 patch_slot = 0;
-	bool mapped_here = false;
-
-	BUG_ON(patch != 0 && ch_ctx == NULL);
-
 	if (patch) {
-		if (!ch_ctx)
-			return -EINVAL;
-		/* we added an optimization prolog, epilog
-		 * to get rid of unnecessary maps and l2 invals.
-		 * but be defensive still... */
-		if (!ch_ctx->patch_ctx.mem.cpu_va) {
-			int err;
-			gk20a_dbg_info("per-write ctx patch begin?");
-			err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
-			if (err)
-				return err;
-			mapped_here = true;
-		} else
-			mapped_here = false;
-
-		patch_slot = ch_ctx->patch_ctx.data_count * 2;
-
-		gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr);
-		gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data);
-
+		u32 patch_slot = ch_ctx->patch_ctx.data_count * 2;
+		gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot, addr);
+		gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot + 1, data);
 		ch_ctx->patch_ctx.data_count++;
-
-		if (mapped_here)
-			gr_gk20a_ctx_patch_write_end(g, ch_ctx);
-
-	} else
+	} else {
 		gk20a_writel(g, addr, data);
-
-	return 0;
+	}
 }
 
 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
@@ -3105,7 +3061,6 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
 
 	/* tweak any perf parameters per-context here */
 	if (args->class_num == KEPLER_COMPUTE_A) {
-		int begin_err;
 		u32 tex_lock_disable_mask;
 		u32 texlock;
 		u32 lockboost_mask;
@@ -3144,24 +3099,20 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
 		lockboost = (lockboost & ~lockboost_mask) |
 			gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(0);
 
-		begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 
-		if (!begin_err) {
-			err = gr_gk20a_ctx_patch_write(g, ch_ctx,
+		if (!err) {
+			gr_gk20a_ctx_patch_write(g, ch_ctx,
 				gr_gpcs_tpcs_sm_sch_texlock_r(),
 				texlock, true);
-
-			if (!err)
-				err = gr_gk20a_ctx_patch_write(g, ch_ctx,
-					gr_gpcs_tpcs_sm_sch_macro_sched_r(),
-					lockboost, true);
-		}
-		if ((begin_err || err)) {
+			gr_gk20a_ctx_patch_write(g, ch_ctx,
+				gr_gpcs_tpcs_sm_sch_macro_sched_r(),
+				lockboost, true);
+			gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+		} else {
 			gk20a_err(dev_from_gk20a(g),
 				"failed to set texlock for compute class");
 		}
-		if (!begin_err)
-			gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 
 		args->flags |= NVGPU_ALLOC_OBJ_FLAGS_LOCKBOOST_ZERO;
 