summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
diff options
context:
space:
mode:
authorPeter Pipkorn <ppipkorn@nvidia.com>2015-09-28 07:49:53 -0400
committerTerje Bergstrom <tbergstrom@nvidia.com>2016-01-11 12:04:01 -0500
commit2b064ce65e0035a860d1bc3bcccfcf8aac1f31c7 (patch)
tree1f20c0e608efcca51ef321d308df8e8cb059ad8c /drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
parenta9c6f595399074e88c16f3557e5acb29db1d52d5 (diff)
gpu: nvgpu: add high priority channel interleave
Interleave all high priority channels between all other channels. This reduces the latency for high priority work when there is a lot of lower priority work present, imposing an upper bound on the latency. Change the default high priority timeslice from 5.2ms to 3.0ms in the process, to prevent long running high priority apps from hogging the GPU too much. Introduce a new debugfs node to enable/disable high priority channel interleaving. It is currently enabled by default. Adds new runlist length max register, used for allocating suitable sized runlist. Limit the number of interleaved channels to 32. This change reduces the maximum time a lower priority job is running (one timeslice) before we check that high priority jobs are running. Tested with gles2_context_priority (still passes) Basic sanity testing is done with graphics_submit (one app is high priority) Also more functional testing using lots of parallel runs with: NVRM_GPU_CHANNEL_PRIORITY=3 ./gles2_expensive_draw --drawsperframe 20000 --triangles 50 --runtime 30 --finish plus multiple: NVRM_GPU_CHANNEL_PRIORITY=2 ./gles2_expensive_draw --drawsperframe 20000 --triangles 50 --runtime 30 --finish Prior to this change, the relative performance between high priority work and normal priority work comes down to timeslice value. This means that when there are many low priority channels, the high priority work will still drop quite a lot. But with this change, the high priority work will roughly get about half the entire GPU time, meaning that after the initial lower performance, it is less likely to get lower in performance due to more apps running on the system. This change makes a large step towards real priority levels. It is not perfect and there are no guarantees on anything, but it is a step forwards without any additional CPU overhead or other complications. It will also serve as a baseline to judge other algorithms against. Support for priorities with TSG is future work. 
Support for interleave mid + high priority channels, instead of just high, is also future work. Bug 1419900 Change-Id: I0f7d0ce83b6598fe86000577d72e14d312fdad98 Signed-off-by: Peter Pipkorn <ppipkorn@nvidia.com> Reviewed-on: http://git-master/r/805961 Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fifo_gk20a.c')
-rw-r--r--drivers/gpu/nvgpu/gk20a/fifo_gk20a.c133
1 files changed, 123 insertions, 10 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 5c99877b..ca5c0ee6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -303,7 +303,13 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
303 if (!runlist->active_tsgs) 303 if (!runlist->active_tsgs)
304 goto clean_up_runlist_info; 304 goto clean_up_runlist_info;
305 305
306 runlist_size = ram_rl_entry_size_v() * f->num_channels; 306 runlist->high_prio_channels =
307 kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
308 GFP_KERNEL);
309 if (!runlist->high_prio_channels)
310 goto clean_up_runlist_info;
311
312 runlist_size = ram_rl_entry_size_v() * f->num_runlist_entries;
307 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { 313 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
308 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]); 314 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
309 if (err) { 315 if (err) {
@@ -324,10 +330,16 @@ clean_up_runlist:
324 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) 330 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++)
325 gk20a_gmmu_free(g, &runlist->mem[i]); 331 gk20a_gmmu_free(g, &runlist->mem[i]);
326 332
333clean_up_runlist_info:
327 kfree(runlist->active_channels); 334 kfree(runlist->active_channels);
328 runlist->active_channels = NULL; 335 runlist->active_channels = NULL;
329 336
330clean_up_runlist_info: 337 kfree(runlist->active_tsgs);
338 runlist->active_tsgs = NULL;
339
340 kfree(runlist->high_prio_channels);
341 runlist->high_prio_channels = NULL;
342
331 kfree(f->runlist_info); 343 kfree(f->runlist_info);
332 f->runlist_info = NULL; 344 f->runlist_info = NULL;
333 345
@@ -483,6 +495,7 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
483 gk20a_init_fifo_pbdma_intr_descs(f); /* just filling in data/tables */ 495 gk20a_init_fifo_pbdma_intr_descs(f); /* just filling in data/tables */
484 496
485 f->num_channels = g->ops.fifo.get_num_fifos(g); 497 f->num_channels = g->ops.fifo.get_num_fifos(g);
498 f->num_runlist_entries = fifo_eng_runlist_length_max_v();
486 f->num_pbdma = proj_host_num_pbdma_v(); 499 f->num_pbdma = proj_host_num_pbdma_v();
487 f->max_engines = ENGINE_INVAL_GK20A; 500 f->max_engines = ENGINE_INVAL_GK20A;
488 501
@@ -2149,6 +2162,34 @@ static inline u32 gk20a_get_tsg_runlist_entry_0(struct tsg_gk20a *tsg)
2149 return runlist_entry_0; 2162 return runlist_entry_0;
2150} 2163}
2151 2164
2165/* add all active high priority channels */
2166static inline u32 gk20a_fifo_runlist_add_high_prio_entries(
2167 struct fifo_gk20a *f,
2168 struct fifo_runlist_info_gk20a *runlist,
2169 u32 *runlist_entry)
2170{
2171 struct channel_gk20a *ch = NULL;
2172 unsigned long high_prio_chid;
2173 u32 count = 0;
2174
2175 for_each_set_bit(high_prio_chid,
2176 runlist->high_prio_channels, f->num_channels) {
2177 ch = &f->channel[high_prio_chid];
2178
2179 if (!gk20a_is_channel_marked_as_tsg(ch) &&
2180 test_bit(high_prio_chid, runlist->active_channels) == 1) {
2181 gk20a_dbg_info("add high prio channel %lu to runlist",
2182 high_prio_chid);
2183 runlist_entry[0] = ram_rl_entry_chid_f(high_prio_chid);
2184 runlist_entry[1] = 0;
2185 runlist_entry += 2;
2186 count++;
2187 }
2188 }
2189
2190 return count;
2191}
2192
2152static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, 2193static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2153 u32 hw_chid, bool add, 2194 u32 hw_chid, bool add,
2154 bool wait_for_finish) 2195 bool wait_for_finish)
@@ -2158,7 +2199,7 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2158 struct fifo_runlist_info_gk20a *runlist = NULL; 2199 struct fifo_runlist_info_gk20a *runlist = NULL;
2159 u32 *runlist_entry_base = NULL; 2200 u32 *runlist_entry_base = NULL;
2160 u32 *runlist_entry = NULL; 2201 u32 *runlist_entry = NULL;
2161 phys_addr_t runlist_pa; 2202 u64 runlist_iova;
2162 u32 old_buf, new_buf; 2203 u32 old_buf, new_buf;
2163 u32 chid, tsgid; 2204 u32 chid, tsgid;
2164 struct channel_gk20a *ch = NULL; 2205 struct channel_gk20a *ch = NULL;
@@ -2194,11 +2235,13 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2194 old_buf = runlist->cur_buffer; 2235 old_buf = runlist->cur_buffer;
2195 new_buf = !runlist->cur_buffer; 2236 new_buf = !runlist->cur_buffer;
2196 2237
2238 runlist_iova = g->ops.mm.get_iova_addr(
2239 g, runlist->mem[new_buf].sgt->sgl, 0);
2240
2197 gk20a_dbg_info("runlist_id : %d, switch to new buffer 0x%16llx", 2241 gk20a_dbg_info("runlist_id : %d, switch to new buffer 0x%16llx",
2198 runlist_id, (u64)gk20a_mem_phys(&runlist->mem[new_buf])); 2242 runlist_id, (u64)runlist_iova);
2199 2243
2200 runlist_pa = gk20a_mem_phys(&runlist->mem[new_buf]); 2244 if (!runlist_iova) {
2201 if (!runlist_pa) {
2202 ret = -EINVAL; 2245 ret = -EINVAL;
2203 goto clean_up; 2246 goto clean_up;
2204 } 2247 }
@@ -2213,25 +2256,52 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2213 add /* resume to add all channels back */) { 2256 add /* resume to add all channels back */) {
2214 runlist_entry = runlist_entry_base; 2257 runlist_entry = runlist_entry_base;
2215 2258
2216 /* add non-TSG channels first */ 2259 /* Runlist manipulation:
2260 Insert an entry of all high priority channels inbetween
2261 all lower priority channels. This ensure that the maximum
2262 delay a runnable high priority channel has to wait is one
2263 medium timeslice + any context switching overhead +
2264 wait on other high priority channels.
2265 add non-TSG channels first */
2217 for_each_set_bit(chid, 2266 for_each_set_bit(chid,
2218 runlist->active_channels, f->num_channels) { 2267 runlist->active_channels, f->num_channels) {
2219 ch = &f->channel[chid]; 2268 ch = &f->channel[chid];
2220 2269
2221 if (!gk20a_is_channel_marked_as_tsg(ch)) { 2270 if (!gk20a_is_channel_marked_as_tsg(ch) &&
2222 gk20a_dbg_info("add channel %d to runlist", 2271 !ch->interleave) {
2272 u32 added;
2273
2274 gk20a_dbg_info("add normal prio channel %d to runlist",
2223 chid); 2275 chid);
2224 runlist_entry[0] = ram_rl_entry_chid_f(chid); 2276 runlist_entry[0] = ram_rl_entry_chid_f(chid);
2225 runlist_entry[1] = 0; 2277 runlist_entry[1] = 0;
2226 runlist_entry += 2; 2278 runlist_entry += 2;
2227 count++; 2279 count++;
2280
2281 added = gk20a_fifo_runlist_add_high_prio_entries(
2282 f,
2283 runlist,
2284 runlist_entry);
2285 count += added;
2286 runlist_entry += 2 * added;
2228 } 2287 }
2229 } 2288 }
2230 2289
2290 /* if there were no lower priority channels, then just
2291 * add the high priority channels once. */
2292 if (count == 0) {
2293 count = gk20a_fifo_runlist_add_high_prio_entries(
2294 f,
2295 runlist,
2296 runlist_entry);
2297 runlist_entry += 2 * count;
2298 }
2299
2231 /* now add TSG entries and channels bound to TSG */ 2300 /* now add TSG entries and channels bound to TSG */
2232 mutex_lock(&f->tsg_inuse_mutex); 2301 mutex_lock(&f->tsg_inuse_mutex);
2233 for_each_set_bit(tsgid, 2302 for_each_set_bit(tsgid,
2234 runlist->active_tsgs, f->num_channels) { 2303 runlist->active_tsgs, f->num_channels) {
2304 u32 added;
2235 tsg = &f->tsg[tsgid]; 2305 tsg = &f->tsg[tsgid];
2236 /* add TSG entry */ 2306 /* add TSG entry */
2237 gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid); 2307 gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
@@ -2260,6 +2330,13 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2260 2330
2261 WARN_ON(tsg->num_active_channels != 2331 WARN_ON(tsg->num_active_channels !=
2262 count_channels_in_tsg); 2332 count_channels_in_tsg);
2333
2334 added = gk20a_fifo_runlist_add_high_prio_entries(
2335 f,
2336 runlist,
2337 runlist_entry);
2338 count += added;
2339 runlist_entry += 2 * added;
2263 } 2340 }
2264 mutex_unlock(&f->tsg_inuse_mutex); 2341 mutex_unlock(&f->tsg_inuse_mutex);
2265 } else /* suspend to remove all channels */ 2342 } else /* suspend to remove all channels */
@@ -2267,7 +2344,7 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2267 2344
2268 if (count != 0) { 2345 if (count != 0) {
2269 gk20a_writel(g, fifo_runlist_base_r(), 2346 gk20a_writel(g, fifo_runlist_base_r(),
2270 fifo_runlist_base_ptr_f(u64_lo32(runlist_pa >> 12)) | 2347 fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12)) |
2271 fifo_runlist_base_target_vid_mem_f()); 2348 fifo_runlist_base_target_vid_mem_f());
2272 } 2349 }
2273 2350
@@ -2416,6 +2493,42 @@ u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g)
2416 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f(); 2493 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f();
2417} 2494}
2418 2495
/*
 * Mark or unmark a channel as high priority in its runlist's
 * high_prio_channels bitmap, based on the channel's current
 * ch->interleave flag. The bitmap is consumed on the next runlist
 * rebuild (gk20a_fifo_update_runlist_locked).
 *
 * Returns 0 on success, -EINVAL for an out-of-range channel or
 * runlist id.
 *
 * NOTE(review): the 'interleave' parameter is never read; the bit is
 * set from ch->interleave instead. Presumably callers assign
 * ch->interleave before calling, so both agree — confirm against the
 * call sites before relying on the parameter.
 */
int gk20a_fifo_set_channel_priority(
		struct gk20a *g,
		u32 runlist_id,
		u32 hw_chid,
		bool interleave)
{
	struct fifo_runlist_info_gk20a *runlist = NULL;
	struct fifo_gk20a *f = &g->fifo;
	struct channel_gk20a *ch = NULL;

	/* validate ids before indexing the channel/runlist tables */
	if (hw_chid >= f->num_channels)
		return -EINVAL;

	if (runlist_id >= f->max_runlists)
		return -EINVAL;

	ch = &f->channel[hw_chid];

	gk20a_dbg_fn("");

	runlist = &f->runlist_info[runlist_id];

	/* serialize against concurrent runlist rebuilds */
	mutex_lock(&runlist->mutex);

	if (ch->interleave)
		set_bit(hw_chid, runlist->high_prio_channels);
	else
		clear_bit(hw_chid, runlist->high_prio_channels);

	gk20a_dbg_fn("done");

	mutex_unlock(&runlist->mutex);

	return 0;
}
2531
2419void gk20a_init_fifo(struct gpu_ops *gops) 2532void gk20a_init_fifo(struct gpu_ops *gops)
2420{ 2533{
2421 gk20a_init_channel(gops); 2534 gk20a_init_channel(gops);