diff options
-rw-r--r--  drivers/gpu/nvgpu/common/linux/ce2.c  | 41
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.c   | 53
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.h   |  7
3 files changed, 29 insertions(+), 72 deletions(-)
diff --git a/drivers/gpu/nvgpu/common/linux/ce2.c b/drivers/gpu/nvgpu/common/linux/ce2.c
index 97dc6678..7cb39382 100644
--- a/drivers/gpu/nvgpu/common/linux/ce2.c
+++ b/drivers/gpu/nvgpu/common/linux/ce2.c
@@ -54,7 +54,6 @@ int gk20a_ce_execute_ops(struct gk20a *g,
54 | u64 cmd_buf_gpu_va = 0; | 54 | u64 cmd_buf_gpu_va = 0; |
55 | u32 methodSize; | 55 | u32 methodSize; |
56 | u32 cmd_buf_read_offset; | 56 | u32 cmd_buf_read_offset; |
57 | u32 fence_index; | ||
58 | u32 dma_copy_class; | 57 | u32 dma_copy_class; |
59 | struct nvgpu_gpfifo gpfifo; | 58 | struct nvgpu_gpfifo gpfifo; |
60 | struct nvgpu_fence fence = {0,0}; | 59 | struct nvgpu_fence fence = {0,0}; |
@@ -87,38 +86,22 @@ int gk20a_ce_execute_ops(struct gk20a *g,
87 | 86 | ||
88 | nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); | 87 | nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); |
89 | 88 | ||
90 | ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset; | 89 | ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS; |
91 | 90 | ||
92 | cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset * | 91 | cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset * |
93 | (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); | 92 | (NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF / sizeof(u32))); |
94 | |||
95 | /* at end of command buffer has gk20a_fence for command buffer sync */ | ||
96 | fence_index = (cmd_buf_read_offset + | ||
97 | ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - | ||
98 | (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); | ||
99 | |||
100 | if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) { | ||
101 | ret = -ENOMEM; | ||
102 | goto noop; | ||
103 | } | ||
104 | 93 | ||
105 | cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; | 94 | cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; |
106 | 95 | ||
107 | /* 0 is treated as invalid pre-sync */ | 96 | if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset]) { |
108 | if (cmd_buf_cpu_va[fence_index]) { | 97 | struct gk20a_fence **prev_post_fence = |
109 | struct gk20a_fence * ce_cmd_buf_fence_in = NULL; | 98 | &ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset]; |
110 | 99 | ||
111 | memcpy((void *)&ce_cmd_buf_fence_in, | 100 | ret = gk20a_fence_wait(g, *prev_post_fence, |
112 | (void *)(cmd_buf_cpu_va + fence_index), | ||
113 | sizeof(struct gk20a_fence *)); | ||
114 | ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in, | ||
115 | gk20a_get_gr_idle_timeout(g)); | 101 | gk20a_get_gr_idle_timeout(g)); |
116 | 102 | ||
117 | gk20a_fence_put(ce_cmd_buf_fence_in); | 103 | gk20a_fence_put(*prev_post_fence); |
118 | /* Reset the stored last pre-sync */ | 104 | *prev_post_fence = NULL; |
119 | memset((void *)(cmd_buf_cpu_va + fence_index), | ||
120 | 0, | ||
121 | NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); | ||
122 | if (ret) | 105 | if (ret) |
123 | goto noop; | 106 | goto noop; |
124 | } | 107 | } |
@@ -130,7 +113,7 @@ int gk20a_ce_execute_ops(struct gk20a *g,
130 | dst_buf, | 113 | dst_buf, |
131 | size, | 114 | size, |
132 | &cmd_buf_cpu_va[cmd_buf_read_offset], | 115 | &cmd_buf_cpu_va[cmd_buf_read_offset], |
133 | NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF, | 116 | NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF, |
134 | payload, | 117 | payload, |
135 | gk20a_get_valid_launch_flags(g, launch_flags), | 118 | gk20a_get_valid_launch_flags(g, launch_flags), |
136 | request_operation, | 119 | request_operation, |
@@ -154,10 +137,8 @@ int gk20a_ce_execute_ops(struct gk20a *g,
154 | &ce_cmd_buf_fence_out, false, NULL); | 137 | &ce_cmd_buf_fence_out, false, NULL); |
155 | 138 | ||
156 | if (!ret) { | 139 | if (!ret) { |
157 | memcpy((void *)(cmd_buf_cpu_va + fence_index), | 140 | ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] = |
158 | (void *)&ce_cmd_buf_fence_out, | 141 | ce_cmd_buf_fence_out; |
159 | sizeof(struct gk20a_fence *)); | ||
160 | |||
161 | if (gk20a_fence_out) { | 142 | if (gk20a_fence_out) { |
162 | gk20a_fence_get(ce_cmd_buf_fence_out); | 143 | gk20a_fence_get(ce_cmd_buf_fence_out); |
163 | *gk20a_fence_out = ce_cmd_buf_fence_out; | 144 | *gk20a_fence_out = ce_cmd_buf_fence_out; |
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index c4fcca3c..18878991 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -103,39 +103,15 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
103 | } | 103 | } |
104 | 104 | ||
105 | /* static CE app api */ | 105 | /* static CE app api */ |
106 | static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx) | 106 | static void gk20a_ce_put_fences(struct gk20a_gpu_ctx *ce_ctx) |
107 | { | 107 | { |
108 | u32 cmd_buf_index; | 108 | u32 i; |
109 | u32 cmd_buf_read_offset; | 109 | |
110 | u32 fence_index; | 110 | for (i = 0; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) { |
111 | u32 *cmd_buf_cpu_va; | 111 | struct gk20a_fence **fence = &ce_ctx->postfences[i]; |
112 | 112 | if (*fence) | |
113 | for (cmd_buf_index = 0; | 113 | gk20a_fence_put(*fence); |
114 | cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset; | 114 | *fence = NULL; |
115 | cmd_buf_index++) { | ||
116 | cmd_buf_read_offset = (cmd_buf_index * | ||
117 | (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); | ||
118 | |||
119 | /* at end of command buffer has gk20a_fence for command buffer sync */ | ||
120 | fence_index = (cmd_buf_read_offset + | ||
121 | ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - | ||
122 | (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); | ||
123 | |||
124 | cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; | ||
125 | |||
126 | /* 0 is treated as invalid pre-sync */ | ||
127 | if (cmd_buf_cpu_va[fence_index]) { | ||
128 | struct gk20a_fence * ce_cmd_buf_fence_in = NULL; | ||
129 | |||
130 | memcpy((void *)&ce_cmd_buf_fence_in, | ||
131 | (void *)(cmd_buf_cpu_va + fence_index), | ||
132 | sizeof(struct gk20a_fence *)); | ||
133 | gk20a_fence_put(ce_cmd_buf_fence_in); | ||
134 | /* Reset the stored last pre-sync */ | ||
135 | memset((void *)(cmd_buf_cpu_va + fence_index), | ||
136 | 0, | ||
137 | NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); | ||
138 | } | ||
139 | } | 115 | } |
140 | } | 116 | } |
141 | 117 | ||
@@ -148,8 +124,8 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
148 | 124 | ||
149 | nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); | 125 | nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); |
150 | 126 | ||
151 | if (ce_ctx->cmd_buf_mem.cpu_va) { | 127 | if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) { |
152 | gk20a_ce_free_command_buffer_stored_fence(ce_ctx); | 128 | gk20a_ce_put_fences(ce_ctx); |
153 | nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); | 129 | nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); |
154 | } | 130 | } |
155 | 131 | ||
@@ -449,8 +425,6 @@ u32 gk20a_ce_create_context(struct gk20a *g,
449 | ce_ctx->g = g; | 425 | ce_ctx->g = g; |
450 | 426 | ||
451 | ce_ctx->cmd_buf_read_queue_offset = 0; | 427 | ce_ctx->cmd_buf_read_queue_offset = 0; |
452 | ce_ctx->cmd_buf_end_queue_offset = | ||
453 | (NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF); | ||
454 | 428 | ||
455 | ce_ctx->vm = g->mm.ce.vm; | 429 | ce_ctx->vm = g->mm.ce.vm; |
456 | 430 | ||
@@ -491,8 +465,11 @@ u32 gk20a_ce_create_context(struct gk20a *g,
491 | goto end; | 465 | goto end; |
492 | } | 466 | } |
493 | 467 | ||
494 | /* allocate command buffer (4096 should be more than enough) from sysmem*/ | 468 | /* allocate command buffer from sysmem */ |
495 | err = nvgpu_dma_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem); | 469 | err = nvgpu_dma_alloc_map_sys(ce_ctx->vm, |
470 | NVGPU_CE_MAX_INFLIGHT_JOBS * | ||
471 | NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF, | ||
472 | &ce_ctx->cmd_buf_mem); | ||
496 | if (err) { | 473 | if (err) { |
497 | nvgpu_err(g, | 474 | nvgpu_err(g, |
498 | "ce: could not allocate command buffer for CE context"); | 475 | "ce: could not allocate command buffer for CE context"); |
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index 0b475f65..1a102070 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -36,9 +36,8 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
36 | #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff | 36 | #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff |
37 | #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff | 37 | #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff |
38 | 38 | ||
39 | #define NVGPU_CE_COMMAND_BUF_SIZE 8192 | 39 | #define NVGPU_CE_MAX_INFLIGHT_JOBS 32 |
40 | #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 256 | 40 | #define NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF 256 |
41 | #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 | ||
42 | 41 | ||
43 | /* dma launch_flags */ | 42 | /* dma launch_flags */ |
44 | enum { | 43 | enum { |
@@ -106,11 +105,11 @@ struct gk20a_gpu_ctx {
106 | 105 | ||
107 | /* cmd buf mem_desc */ | 106 | /* cmd buf mem_desc */ |
108 | struct nvgpu_mem cmd_buf_mem; | 107 | struct nvgpu_mem cmd_buf_mem; |
108 | struct gk20a_fence *postfences[NVGPU_CE_MAX_INFLIGHT_JOBS]; | ||
109 | 109 | ||
110 | struct nvgpu_list_node list; | 110 | struct nvgpu_list_node list; |
111 | 111 | ||
112 | u32 cmd_buf_read_queue_offset; | 112 | u32 cmd_buf_read_queue_offset; |
113 | u32 cmd_buf_end_queue_offset; | ||
114 | }; | 113 | }; |
115 | 114 | ||
116 | static inline struct gk20a_gpu_ctx * | 115 | static inline struct gk20a_gpu_ctx * |