From 7680fd689ecf7d11bf2dfdba41dc2f33cde2bbe7 Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Tue, 16 May 2017 13:47:58 +0300
Subject: gpu: nvgpu: hold power ref for deterministic channels

To support deterministic channels even on platforms where railgating is
supported, have each deterministic-marked channel hold a power reference
for its lifetime, and skip taking per-job power refs in the submit path
for such channels. Previously, railgating blocked deterministic submits
in general: the gk20a_busy()/gk20a_idle() calls in the submit path can
take time, and, more significantly, the GPU may need to be turned on,
which takes a long and nondeterministic amount of time.

As an exception, gk20a_do_idle() can still block deterministic submits
until gk20a_do_unidle() is called; add an rwsem to guard this. VPR
resize needs do_idle, which conflicts with the deterministic channels'
requirement to keep the GPU on; this is now documented in the ioctl
header.

Always set NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING
in the gpu characteristics, now that it is supported regardless of
railgating. The only thing still blocking
NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_FULL is the sync framework.

Make the channel debug dump show which channels are deterministic.

Bug 200291300
Jira NVGPU-70

Change-Id: I47b6f3a8517cd6e4255f6ca2855e3dd912e4f5f3
Signed-off-by: Konsta Holtta
Reviewed-on: http://git-master/r/1483038
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/linux/driver_common.c |   1 +
 drivers/gpu/nvgpu/common/linux/module.c        |   9 ++
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c        | 169 ++++++++++++++++++++++---
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h        |   3 +
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c           |   6 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h           |   1 +
 drivers/gpu/nvgpu/gk20a/gk20a.c                |  19 ++-
 drivers/gpu/nvgpu/gk20a/gk20a.h                |   6 +
 8 files changed, 185 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/linux/driver_common.c b/drivers/gpu/nvgpu/common/linux/driver_common.c
index bd9a4e77..a00880ed 100644
--- a/drivers/gpu/nvgpu/common/linux/driver_common.c
+++ b/drivers/gpu/nvgpu/common/linux/driver_common.c
@@ -39,6 +39,7 @@ static void nvgpu_init_vars(struct gk20a *g)
 	gk20a_init_gr(g);
 
 	init_rwsem(&g->busy_lock);
+	init_rwsem(&g->deterministic_busy);
 
 	nvgpu_spinlock_init(&g->mc_enable_lock);
 
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index 34a0ded6..cbad3993 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -298,6 +298,12 @@ int __gk20a_do_idle(struct device *dev, bool force_reset)
 	bool is_railgated;
 	int err = 0;
 
+	/*
+	 * Hold back deterministic submits and changes to deterministic
+	 * channels - this must be outside the power busy locks.
+	 */
+	gk20a_channel_deterministic_idle(g);
+
 	/* acquire busy lock to block other busy() calls */
 	down_write(&g->busy_lock);
 
@@ -403,6 +409,7 @@ fail_drop_usage_count:
 fail_timeout:
 	nvgpu_mutex_release(&platform->railgate_lock);
 	up_write(&g->busy_lock);
+	gk20a_channel_deterministic_unidle(g);
 	return -EBUSY;
 }
 
@@ -456,6 +463,8 @@ int __gk20a_do_unidle(struct device *dev)
 	nvgpu_mutex_release(&platform->railgate_lock);
 	up_write(&g->busy_lock);
 
+	gk20a_channel_deterministic_unidle(g);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 49d83069..90202fd7 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -575,8 +575,15 @@ unbind:
 	g->ops.fifo.unbind_channel(ch);
 	g->ops.fifo.free_inst(g, ch);
 
+	/* put back the channel-wide submit ref from init */
+	if (ch->deterministic) {
+		down_read(&g->deterministic_busy);
+		ch->deterministic = false;
+		gk20a_idle(g);
+		up_read(&g->deterministic_busy);
+	}
+
 	ch->vpr = false;
-	ch->deterministic = false;
 	ch->vm = NULL;
 
 	WARN_ON(ch->sync);
 
@@ -1228,22 +1235,42 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED)
 		c->vpr = true;
 
-	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC)
+	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC) {
+		down_read(&g->deterministic_busy);
+		/*
+		 * Railgating isn't deterministic; instead of disallowing
+		 * railgating globally, take a power refcount for this
+		 * channel's lifetime. The gk20a_idle() pair for this happens
+		 * when the channel gets freed.
+		 *
+		 * Deterministic flag and this busy must be atomic within the
+		 * busy lock.
+		 */
+		err = gk20a_busy(g);
+		if (err) {
+			up_read(&g->deterministic_busy);
+			return err;
+		}
+
 		c->deterministic = true;
+		up_read(&g->deterministic_busy);
+	}
 
 	/* an address space needs to have been bound at this point. */
 	if (!gk20a_channel_as_bound(c)) {
 		nvgpu_err(g,
 			    "not bound to an address space at time of gpfifo"
 			    " allocation.");
-		return -EINVAL;
+		err = -EINVAL;
+		goto clean_up_idle;
 	}
 	ch_vm = c->vm;
 
 	if (c->gpfifo.mem.size) {
 		nvgpu_err(g, "channel %d :"
			   "gpfifo already allocated", c->hw_chid);
-		return -EEXIST;
+		err = -EEXIST;
+		goto clean_up_idle;
 	}
 
 	err = nvgpu_dma_alloc_map_sys(ch_vm,
@@ -1336,6 +1363,13 @@ clean_up_unmap:
 	nvgpu_dma_unmap_free(ch_vm, &c->gpfifo.mem);
 clean_up:
 	memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
+clean_up_idle:
+	if (c->deterministic) {
+		down_read(&g->deterministic_busy);
+		gk20a_idle(g);
+		c->deterministic = false;
+		up_read(&g->deterministic_busy);
+	}
 	nvgpu_err(g, "fail");
 	return err;
 }
 
@@ -2089,7 +2123,13 @@ static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
 		channel_gk20a_free_job(c, job);
 		job_finished = 1;
-		gk20a_idle(g);
+
+		/*
+		 * Deterministic channels have a channel-wide power reference;
+		 * for others, there's one per submit.
+		 */
+		if (!c->deterministic)
+			gk20a_idle(g);
 
 		if (!clean_all) {
 			/* Timeout isn't supported here so don't touch it. */
@@ -2457,7 +2497,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	 * Job tracking is necessary for any of the following conditions:
 	 *  - pre- or post-fence functionality
 	 *  - channel wdt
-	 *  - GPU rail-gating
+	 *  - GPU rail-gating with non-deterministic channels
 	 *  - buffer refcounting
 	 *
 	 * If none of the conditions are met, then job tracking is not
@@ -2467,7 +2507,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
 			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
 			c->wdt_enabled ||
-			g->can_railgate ||
+			(g->can_railgate && !c->deterministic) ||
 			!skip_buffer_refcounting;
 
 	if (need_job_tracking) {
@@ -2495,7 +2535,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		 *   behavior of the clean-up operation non-deterministic
 		 *   (should not be performed in the submit path)
 		 * - channel wdt
-		 * - GPU rail-gating
+		 * - GPU rail-gating with non-deterministic channels
 		 * - buffer refcounting
 		 *
 		 * If none of the conditions are met, then deferred clean-up
@@ -2505,7 +2545,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		need_deferred_cleanup = !c->deterministic ||
 					need_sync_framework ||
 					c->wdt_enabled ||
-					g->can_railgate ||
+					(g->can_railgate &&
+					 !c->deterministic) ||
 					!skip_buffer_refcounting;
 
 		/*
@@ -2515,12 +2556,20 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	if (c->deterministic && need_deferred_cleanup)
 		return -EINVAL;
 
-	/* released by job cleanup via syncpt or sema interrupt */
-	err = gk20a_busy(g);
-	if (err) {
-		nvgpu_err(g, "failed to host gk20a to submit gpfifo, process %s",
-			current->comm);
-		return err;
+	if (!c->deterministic) {
+		/*
+		 * Get a power ref unless this is a deterministic
+		 * channel that holds them during the channel lifetime.
+		 * This one is released by gk20a_channel_clean_up_jobs,
+		 * via syncpt or sema interrupt, whichever is used.
+		 */
+		err = gk20a_busy(g);
+		if (err) {
+			nvgpu_err(g,
+				"failed to host gk20a to submit gpfifo, process %s",
+				current->comm);
+			return err;
+		}
 	}
 
 	if (!need_deferred_cleanup) {
@@ -2529,6 +2578,11 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		}
 	}
 
+
+	/* Grab access to HW to deal with do_idle */
+	if (c->deterministic)
+		down_read(&g->deterministic_busy);
+
 	trace_gk20a_channel_submit_gpfifo(g->name,
 					  c->hw_chid,
 					  num_entries,
@@ -2601,6 +2655,10 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 
 	g->ops.fifo.userd_gp_put(g, c);
 
+	/* No hw access beyond this point */
+	if (c->deterministic)
+		up_read(&g->deterministic_busy);
+
 	trace_gk20a_channel_submitted_gpfifo(g->name,
 					     c->hw_chid,
 					     num_entries,
@@ -2622,11 +2680,90 @@ clean_up:
 	gk20a_dbg_fn("fail");
 	gk20a_fence_put(pre_fence);
 	gk20a_fence_put(post_fence);
-	if (need_deferred_cleanup)
+	if (c->deterministic)
+		up_read(&g->deterministic_busy);
+	else if (need_deferred_cleanup)
 		gk20a_idle(g);
+
 	return err;
 }
 
+/*
+ * Stop deterministic channel activity for do_idle() when power needs to go off
+ * momentarily but deterministic channels keep power refs for potentially a
+ * long time.
+ *
+ * Takes write access on g->deterministic_busy.
+ *
+ * Must be paired with gk20a_channel_deterministic_unidle().
+ */ +void gk20a_channel_deterministic_idle(struct gk20a *g) +{ + struct fifo_gk20a *f = &g->fifo; + u32 chid; + + /* Grab exclusive access to the hw to block new submits */ + down_write(&g->deterministic_busy); + + for (chid = 0; chid < f->num_channels; chid++) { + struct channel_gk20a *ch = &f->channel[chid]; + + if (!gk20a_channel_get(ch)) + continue; + + if (ch->deterministic) { + /* + * Drop the power ref taken when setting deterministic + * flag. deterministic_unidle will put this and the + * channel ref back. + * + * Hold the channel ref: it must not get freed in + * between. A race could otherwise result in lost + * gk20a_busy() via unidle, and in unbalanced + * gk20a_idle() via closing the channel. + */ + gk20a_idle(g); + } else { + /* Not interesting, carry on. */ + gk20a_channel_put(ch); + } + } +} + +/* + * Allow deterministic channel activity again for do_unidle(). + * + * This releases write access on g->deterministic_busy. + */ +void gk20a_channel_deterministic_unidle(struct gk20a *g) +{ + struct fifo_gk20a *f = &g->fifo; + u32 chid; + + for (chid = 0; chid < f->num_channels; chid++) { + struct channel_gk20a *ch = &f->channel[chid]; + + if (!gk20a_channel_get(ch)) + continue; + + /* + * Deterministic state changes inside deterministic_busy lock, + * which we took in deterministic_idle. + */ + if (ch->deterministic) { + if (gk20a_busy(g)) + nvgpu_err(g, "cannot busy() again!"); + /* Took this in idle() */ + gk20a_channel_put(ch); + } + + gk20a_channel_put(ch); + } + + /* Release submits, new deterministic channels and frees */ + up_write(&g->deterministic_busy); +} + int gk20a_init_channel_support(struct gk20a *g, u32 chid) { struct channel_gk20a *c = g->fifo.channel+chid; diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 9872e1b2..ca042883 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -328,6 +328,9 @@ int gk20a_disable_channel_tsg(struct gk20a *g, struct channel_gk20a *ch); int gk20a_channel_suspend(struct gk20a *g); int gk20a_channel_resume(struct gk20a *g); +void gk20a_channel_deterministic_idle(struct gk20a *g); +void gk20a_channel_deterministic_unidle(struct gk20a *g); + int nvgpu_channel_worker_init(struct gk20a *g); void nvgpu_channel_worker_deinit(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 5a571dc8..37e19ef8 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -3494,10 +3494,11 @@ void gk20a_dump_channel_status_ramfc(struct gk20a *g, syncpointa = inst_mem[ram_fc_syncpointa_w()]; syncpointb = inst_mem[ram_fc_syncpointb_w()]; - gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, + gk20a_debug_output(o, "%d-%s, pid %d, refs %d%s: ", hw_chid, g->name, ch_state->pid, - ch_state->refs); + ch_state->refs, + ch_state->deterministic ? ", deterministic" : ""); gk20a_debug_output(o, "channel status: %s in use %s %s\n", ccsr_channel_enable_v(channel) ? 
"" : "not", gk20a_decode_ccsr_chan_status(status), @@ -3576,6 +3577,7 @@ void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g, ch_state[chid]->pid = ch->pid; ch_state[chid]->refs = atomic_read(&ch->ref_count); + ch_state[chid]->deterministic = ch->deterministic; nvgpu_mem_rd_n(g, &ch->inst_block, 0, &ch_state[chid]->inst_block[0], ram_in_alloc_size_v()); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 228e5130..1566302f 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -216,6 +216,7 @@ static inline const char *gk20a_fifo_interleave_level_name(u32 interleave_level) struct ch_state { int pid; int refs; + bool deterministic; u32 inst_block[0]; }; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 8624d601..13635706 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -456,22 +456,19 @@ int gk20a_init_gpu_characteristics(struct gk20a *g) gpu->flags |= NVGPU_GPU_FLAGS_HAS_SYNCPOINTS; /* - * Railgating needs job tracking which prevents fast submits. They're - * supported otherwise, provided that the user doesn't request anything - * that depends on job tracking. (Here, fast means strictly no + * Fast submits are supported as long as the user doesn't request + * anything that depends on job tracking. (Here, fast means strictly no * metadata, just the gpfifo contents are copied and gp_put updated). */ - if (!g->can_railgate) - gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING; + gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING; /* - * Railgating and sync framework require deferred job cleanup which - * prevents deterministic submits. They're supported otherwise, - * provided that the user doesn't request anything that depends on - * deferred cleanup. + * Sync framework requires deferred job cleanup, wrapping syncs in FDs, + * and other heavy stuff, which prevents deterministic submits. This is + * supported otherwise, provided that the user doesn't request anything + * that depends on deferred cleanup. */ - if (!g->can_railgate - && !gk20a_channel_sync_needs_sync_framework(g)) + if (!gk20a_channel_sync_needs_sync_framework(g)) gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_DETERMINISTIC_SUBMIT_FULL; gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_USERSPACE_MANAGED_AS; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 1d867912..79118fca 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -1025,6 +1025,12 @@ struct gk20a { u32 log_trace; struct rw_semaphore busy_lock; + /* + * Guards access to hardware when usual gk20a_{busy,idle} are skipped + * for submits and held for channel lifetime but dropped for an ongoing + * gk20a_do_idle(). + */ + struct rw_semaphore deterministic_busy; struct nvgpu_falcon pmu_flcn; struct nvgpu_falcon sec2_flcn; -- cgit v1.2.2