author    Konsta Holtta <kholtta@nvidia.com>    2018-01-25 08:31:18 -0500
committer mobile promotions <svcmobile_promotions@nvidia.com>    2018-01-26 13:50:37 -0500
commit    1a7484c901fe1abe0c35593ec96ff10e162099c4
tree      da9b0cdb8c55dbf281884d126d6d957e61d8f16f
parent    91114cd6d4ca652cb726baf2329fa807442c68a8
gpu: nvgpu: ce: store fences in a separate array
Simplify the copyengine code massively by storing the job post fence
pointers in an array of fences instead of mixing them up in the command
buffer memory. The post fences are used when the ring buffer of a
context gets full and we need to wait for the oldest slot to free up.

NVGPU-43
NVGPU-52

Change-Id: I36969e19676bec0f38de9a6357767a8d5cbcd329
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1646037
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--  drivers/gpu/nvgpu/common/linux/ce2.c | 41
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.c  | 53
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.h  |  7
3 files changed, 29 insertions(+), 72 deletions(-)
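
For readers unfamiliar with the pattern, the following is a minimal sketch of the
fence ring this patch introduces: one post-fence pointer per command buffer slot,
waited on and released before the slot is reused. It is an illustrative userspace
sketch only; struct fence, fence_wait(), fence_put() and ce_retire_slot() are
hypothetical stand-ins for the gk20a_fence helpers and the logic in
gk20a_ce_execute_ops(), not driver API.

/* Hypothetical stand-ins; not nvgpu driver code. */
#include <stdlib.h>

#define MAX_INFLIGHT_JOBS 32	/* mirrors NVGPU_CE_MAX_INFLIGHT_JOBS */

struct fence {
	int refcount;
};

/* Stub: a real fence would block until the GPU signals job completion. */
static int fence_wait(struct fence *f)
{
	(void)f;
	return 0;
}

static void fence_put(struct fence *f)
{
	if (--f->refcount == 0)
		free(f);
}

struct ce_ctx {
	struct fence *postfences[MAX_INFLIGHT_JOBS];
	unsigned int read_queue_offset;	/* next ring slot to reuse */
};

/*
 * Before reusing a slot, wait for and drop the fence of the job that last
 * used it; a NULL slot means nothing is pending there.
 */
static int ce_retire_slot(struct ce_ctx *ctx, unsigned int slot)
{
	struct fence **prev = &ctx->postfences[slot];
	int ret = 0;

	if (*prev) {
		ret = fence_wait(*prev);
		fence_put(*prev);
		*prev = NULL;
	}
	return ret;
}

int main(void)
{
	struct ce_ctx ctx = { { NULL }, 0 };
	unsigned int slot, i;

	/* Submit dummy jobs; the slot index wraps the same way
	 * cmd_buf_read_queue_offset does in the patch. */
	for (i = 0; i < 100; i++) {
		struct fence *post;

		slot = ctx.read_queue_offset % MAX_INFLIGHT_JOBS;
		if (ce_retire_slot(&ctx, slot))
			return 1;

		post = calloc(1, sizeof(*post));
		if (!post)
			return 1;
		post->refcount = 1;
		ctx.postfences[slot] = post;	/* store the new post fence */

		ctx.read_queue_offset++;
	}

	/* Teardown mirrors gk20a_ce_put_fences(): drop whatever is left. */
	for (i = 0; i < MAX_INFLIGHT_JOBS; i++)
		(void)ce_retire_slot(&ctx, i);

	return 0;
}

Keeping the fence pointers in a CPU-side array, rather than memcpy'ing them into
GPU-visible command buffer memory, removes the memset/memcpy bookkeeping and the
-ENOMEM guard that checked whether a pointer even fit in the reserved trace words
at the end of each kickoff slot. The allocation size is unchanged in practice:
32 slots of 256 bytes per kickoff is the same 8192 bytes the old
NVGPU_CE_COMMAND_BUF_SIZE provided.
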
diff --git a/drivers/gpu/nvgpu/common/linux/ce2.c b/drivers/gpu/nvgpu/common/linux/ce2.c
index 97dc6678..7cb39382 100644
--- a/drivers/gpu/nvgpu/common/linux/ce2.c
+++ b/drivers/gpu/nvgpu/common/linux/ce2.c
@@ -54,7 +54,6 @@ int gk20a_ce_execute_ops(struct gk20a *g,
 	u64 cmd_buf_gpu_va = 0;
 	u32 methodSize;
 	u32 cmd_buf_read_offset;
-	u32 fence_index;
 	u32 dma_copy_class;
 	struct nvgpu_gpfifo gpfifo;
 	struct nvgpu_fence fence = {0,0};
@@ -87,38 +86,22 @@ int gk20a_ce_execute_ops(struct gk20a *g,
 
 	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
 
-	ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
+	ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS;
 
 	cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
-			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
-
-	/* at end of command buffer has gk20a_fence for command buffer sync */
-	fence_index = (cmd_buf_read_offset +
-			((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
-			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
-
-	if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
-		ret = -ENOMEM;
-		goto noop;
-	}
+			(NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF / sizeof(u32)));
 
 	cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
 
-	/* 0 is treated as invalid pre-sync */
-	if (cmd_buf_cpu_va[fence_index]) {
-		struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
+	if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset]) {
+		struct gk20a_fence **prev_post_fence =
+			&ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset];
 
-		memcpy((void *)&ce_cmd_buf_fence_in,
-				(void *)(cmd_buf_cpu_va + fence_index),
-				sizeof(struct gk20a_fence *));
-		ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in,
+		ret = gk20a_fence_wait(g, *prev_post_fence,
 				gk20a_get_gr_idle_timeout(g));
 
-		gk20a_fence_put(ce_cmd_buf_fence_in);
-		/* Reset the stored last pre-sync */
-		memset((void *)(cmd_buf_cpu_va + fence_index),
-				0,
-				NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
+		gk20a_fence_put(*prev_post_fence);
+		*prev_post_fence = NULL;
 		if (ret)
 			goto noop;
 	}
@@ -130,7 +113,7 @@ int gk20a_ce_execute_ops(struct gk20a *g,
 			dst_buf,
 			size,
 			&cmd_buf_cpu_va[cmd_buf_read_offset],
-			NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
+			NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
 			payload,
 			gk20a_get_valid_launch_flags(g, launch_flags),
 			request_operation,
@@ -154,10 +137,8 @@ int gk20a_ce_execute_ops(struct gk20a *g,
 			&ce_cmd_buf_fence_out, false, NULL);
 
 	if (!ret) {
-		memcpy((void *)(cmd_buf_cpu_va + fence_index),
-				(void *)&ce_cmd_buf_fence_out,
-				sizeof(struct gk20a_fence *));
-
+		ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] =
+			ce_cmd_buf_fence_out;
 		if (gk20a_fence_out) {
 			gk20a_fence_get(ce_cmd_buf_fence_out);
 			*gk20a_fence_out = ce_cmd_buf_fence_out;
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index c4fcca3c..18878991 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -103,39 +103,15 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
 }
 
 /* static CE app api */
-static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx)
+static void gk20a_ce_put_fences(struct gk20a_gpu_ctx *ce_ctx)
 {
-	u32 cmd_buf_index;
-	u32 cmd_buf_read_offset;
-	u32 fence_index;
-	u32 *cmd_buf_cpu_va;
-
-	for (cmd_buf_index = 0;
-		cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset;
-		cmd_buf_index++) {
-		cmd_buf_read_offset = (cmd_buf_index *
-			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
-
-		/* at end of command buffer has gk20a_fence for command buffer sync */
-		fence_index = (cmd_buf_read_offset +
-			((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
-			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
-
-		cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
-
-		/* 0 is treated as invalid pre-sync */
-		if (cmd_buf_cpu_va[fence_index]) {
-			struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
-
-			memcpy((void *)&ce_cmd_buf_fence_in,
-				(void *)(cmd_buf_cpu_va + fence_index),
-				sizeof(struct gk20a_fence *));
-			gk20a_fence_put(ce_cmd_buf_fence_in);
-			/* Reset the stored last pre-sync */
-			memset((void *)(cmd_buf_cpu_va + fence_index),
-				0,
-				NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
-		}
+	u32 i;
+
+	for (i = 0; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) {
+		struct gk20a_fence **fence = &ce_ctx->postfences[i];
+		if (*fence)
+			gk20a_fence_put(*fence);
+		*fence = NULL;
 	}
 }
 
@@ -148,8 +124,8 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
 
 	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
 
-	if (ce_ctx->cmd_buf_mem.cpu_va) {
-		gk20a_ce_free_command_buffer_stored_fence(ce_ctx);
+	if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) {
+		gk20a_ce_put_fences(ce_ctx);
 		nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
 	}
 
@@ -449,8 +425,6 @@ u32 gk20a_ce_create_context(struct gk20a *g,
 	ce_ctx->g = g;
 
 	ce_ctx->cmd_buf_read_queue_offset = 0;
-	ce_ctx->cmd_buf_end_queue_offset =
-		(NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF);
 
 	ce_ctx->vm = g->mm.ce.vm;
 
@@ -491,8 +465,11 @@ u32 gk20a_ce_create_context(struct gk20a *g,
 		goto end;
 	}
 
-	/* allocate command buffer (4096 should be more than enough) from sysmem*/
-	err = nvgpu_dma_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem);
+	/* allocate command buffer from sysmem */
+	err = nvgpu_dma_alloc_map_sys(ce_ctx->vm,
+			NVGPU_CE_MAX_INFLIGHT_JOBS *
+			NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
+			&ce_ctx->cmd_buf_mem);
 	if (err) {
 		nvgpu_err(g,
 			"ce: could not allocate command buffer for CE context");
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index 0b475f65..1a102070 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -36,9 +36,8 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
 #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff
 #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff
 
-#define NVGPU_CE_COMMAND_BUF_SIZE 8192
-#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 256
-#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8
+#define NVGPU_CE_MAX_INFLIGHT_JOBS 32
+#define NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF 256
 
 /* dma launch_flags */
 enum {
@@ -106,11 +105,11 @@ struct gk20a_gpu_ctx {
 
 	/* cmd buf mem_desc */
 	struct nvgpu_mem cmd_buf_mem;
+	struct gk20a_fence *postfences[NVGPU_CE_MAX_INFLIGHT_JOBS];
 
 	struct nvgpu_list_node list;
 
 	u32 cmd_buf_read_queue_offset;
-	u32 cmd_buf_end_queue_offset;
 };
 
 static inline struct gk20a_gpu_ctx *