From b9feba6efc48743da70e474d40b7889a7efb4ba5 Mon Sep 17 00:00:00 2001
From: David Nieto
Date: Tue, 21 Feb 2017 15:36:49 -0800
Subject: gpu: nvgpu: in-kernel kickoff profiling

Add a debugfs interface to profile the kickoff ioctl. It reports the
probability distribution and breaks the time down into: the full ioctl,
the kickoff function itself, the time spent in job tracking, and the
time spent on pushbuffer copies.

JIRA: EVLR-1003

Change-Id: I9888b114c3fbced61b1cf134c79f7a8afce15f56
Signed-off-by: David Nieto
Reviewed-on: http://git-master/r/1308997
Reviewed-by: svccoveritychecker
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c     |   3 +-
 drivers/gpu/nvgpu/gk20a/ce2_gk20a.c     |   2 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c |  31 ++++-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |   4 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    | 218 ++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  39 +++++-
 6 files changed, 289 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index d43bc93f..d19479a2 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -727,7 +727,8 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 	}
 
 	return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL,
-			num_entries, flags, fence, fence_out, true);
+			num_entries, flags, fence, fence_out, true,
+			NULL);
 }
 
 static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx)
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index fd248313..db1ac539 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -653,7 +653,7 @@ int gk20a_ce_execute_ops(struct device *dev,
 
 	ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
 			1, submit_flags, &fence,
-			&ce_cmd_buf_fence_out, false);
+			&ce_cmd_buf_fence_out, false, NULL);
 
 	if (!ret) {
 		memcpy((void *)(cmd_buf_cpu_va + fence_index),
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 68e43259..f58b208c 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -2987,7 +2987,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 flags,
 				struct nvgpu_fence *fence,
 				struct gk20a_fence **fence_out,
-				bool force_need_sync_fence)
+				bool force_need_sync_fence,
+				struct fifo_profile_gk20a *profile)
 {
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
@@ -3036,6 +3037,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		return -EINVAL;
 	}
 
+	if (profile)
+		profile->timestamp[PROFILE_ENTRY] = sched_clock();
+
 #ifdef CONFIG_DEBUG_FS
 	/* update debug settings */
 	if (g->ops.ltc.sync_debugfs)
@@ -3162,6 +3166,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		goto clean_up_job;
 	}
 
+	if (profile)
+		profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
+
 	if (wait_cmd)
 		gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
 
@@ -3184,6 +3191,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	if (need_job_tracking)
 		/* TODO! Check for errors... */
 		gk20a_channel_add_job(c, job, skip_buffer_refcounting);
+	if (profile)
+		profile->timestamp[PROFILE_APPEND] = sched_clock();
 
 	g->ops.fifo.userd_gp_put(g, c);
 
@@ -3197,6 +3206,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	gk20a_dbg_info("post-submit put %d, get %d, size %d",
 		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
 
+	if (profile)
+		profile->timestamp[PROFILE_END] = sched_clock();
 	gk20a_dbg_fn("done");
 	return err;
 
@@ -3789,15 +3800,22 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	struct nvgpu_submit_gpfifo_args *args)
 {
 	struct gk20a_fence *fence_out;
+	struct fifo_profile_gk20a *profile = NULL;
+
 	int ret = 0;
 	gk20a_dbg_fn("");
 
+#ifdef CONFIG_DEBUG_FS
+	profile = gk20a_fifo_profile_acquire(ch->g);
+
+	if (profile)
+		profile->timestamp[PROFILE_IOCTL_ENTRY] = sched_clock();
+#endif
 	if (ch->has_timedout)
 		return -ETIMEDOUT;
-
 	ret = gk20a_submit_channel_gpfifo(ch, NULL, args,
 		args->num_entries, args->flags, &args->fence,
-		&fence_out, false);
+		&fence_out, false, profile);
 
 	if (ret)
 		goto clean_up;
@@ -3816,7 +3834,12 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 		}
 	}
 	gk20a_fence_put(fence_out);
-
+#ifdef CONFIG_DEBUG_FS
+	if (profile) {
+		profile->timestamp[PROFILE_IOCTL_EXIT] = sched_clock();
+		gk20a_fifo_profile_release(ch->g, profile);
+	}
+#endif
 clean_up:
 	return ret;
 }
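
The submit-path hunks above only record raw per-stage timestamps; the stats
file added later in this patch derives the reported intervals from them:
ioctl = IOCTL_EXIT - IOCTL_ENTRY, kickoff = END - ENTRY, jobtrack =
JOB_TRACKING - IOCTL_ENTRY, pbcopy = APPEND - JOB_TRACKING and userd =
END - APPEND. Below is a minimal user-space sketch of that staging;
clock_gettime() stands in for the kernel's sched_clock() and do_work() is a
made-up placeholder for the real work between stages, so neither is part of
the patch.

/*
 * Sketch only: the same six PROFILE_* stages, with the intervals derived
 * by subtracting pairs of timestamps the way the stats file does.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum {
	PROFILE_IOCTL_ENTRY = 0,
	PROFILE_ENTRY,
	PROFILE_JOB_TRACKING,
	PROFILE_APPEND,
	PROFILE_END,
	PROFILE_IOCTL_EXIT,
	PROFILE_MAX
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void do_work(void)
{
	volatile int i;

	for (i = 0; i < 100000; i++)
		;
}

static void print_delta(const char *name, uint64_t end, uint64_t start)
{
	printf("%-8s %llu ns\n", name, (unsigned long long)(end - start));
}

int main(void)
{
	uint64_t t[PROFILE_MAX];

	t[PROFILE_IOCTL_ENTRY] = now_ns();	/* ioctl handler entered */
	do_work();
	t[PROFILE_ENTRY] = now_ns();		/* submit function entered */
	do_work();
	t[PROFILE_JOB_TRACKING] = now_ns();	/* job tracking set up */
	do_work();
	t[PROFILE_APPEND] = now_ns();		/* gpfifo entries appended */
	do_work();
	t[PROFILE_END] = now_ns();		/* submit function done */
	do_work();
	t[PROFILE_IOCTL_EXIT] = now_ns();	/* ioctl about to return */

	print_delta("ioctl", t[PROFILE_IOCTL_EXIT], t[PROFILE_IOCTL_ENTRY]);
	print_delta("kickoff", t[PROFILE_END], t[PROFILE_ENTRY]);
	print_delta("jobtrack", t[PROFILE_JOB_TRACKING], t[PROFILE_IOCTL_ENTRY]);
	print_delta("pbcopy", t[PROFILE_APPEND], t[PROFILE_JOB_TRACKING]);
	print_delta("userd", t[PROFILE_END], t[PROFILE_APPEND]);
	return 0;
}
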
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index d9913cd7..42550632 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -33,6 +33,7 @@ struct gk20a;
 struct gr_gk20a;
 struct dbg_session_gk20a;
 struct gk20a_fence;
+struct fifo_profile_gk20a;
 
 #include "channel_sync_gk20a.h"
 
@@ -344,7 +345,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 flags,
 				struct nvgpu_fence *fence,
 				struct gk20a_fence **fence_out,
-				bool force_need_sync_fence);
+				bool force_need_sync_fence,
+				struct fifo_profile_gk20a *profile);
 
 int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 			struct nvgpu_alloc_gpfifo_ex_args *args);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index d072fb48..35d56ce4 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <linux/sort.h>
 #include
 #include
@@ -46,6 +47,10 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    bool wait_for_finish);
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 
+#ifdef CONFIG_DEBUG_FS
+static void __gk20a_fifo_profile_free(struct kref *ref);
+#endif
+
 u32 gk20a_fifo_get_engine_ids(struct gk20a *g,
 		u32 engine_id[], u32 engine_id_sz,
 		u32 engine_enum)
@@ -532,6 +537,14 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
 	f->engine_info = NULL;
 	kfree(f->active_engines_list);
 	f->active_engines_list = NULL;
+#ifdef CONFIG_DEBUG_FS
+	nvgpu_mutex_acquire(&f->profile.lock);
+	if (f->profile.enabled) {
+		f->profile.enabled = false;
+		kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
+	}
+	nvgpu_mutex_release(&f->profile.lock);
+#endif
 }
 
 /* reads info from hardware and fills in pbmda exception info record */
@@ -3203,6 +3216,32 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
 }
 
 #ifdef CONFIG_DEBUG_FS
+
+/* Get the next element in the ring buffer of profile entries
+ * and grab a reference to the structure
+ */
+struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	struct fifo_profile_gk20a *profile;
+	unsigned int index;
+
+	/* If kref is zero, profiling is not enabled */
+	if (!kref_get_unless_zero(&f->profile.ref))
+		return NULL;
+	index = atomic_inc_return(&f->profile.get);
+	profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
+
+	return profile;
+}
+
+/* Free the reference to the structure. This allows deferred cleanups */
+void gk20a_fifo_profile_release(struct gk20a *g,
+	struct fifo_profile_gk20a *profile)
+{
+	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+}
+
 static void *gk20a_fifo_sched_debugfs_seq_start(
 	struct seq_file *s, loff_t *pos)
 {
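
gk20a_fifo_profile_acquire() above hands every submit its own slot out of a
fixed ring: the kref only gates whether profiling is enabled and keeps the
backing buffers alive, while the atomic "get" counter picks the slot, so old
samples are simply overwritten once the ring wraps. Below is a small
user-space analogue of that pattern using C11 atomics; the names
(slot_acquire, slot_release, NUM_SLOTS, users) are illustrative and not taken
from the driver.

/*
 * Sketch only: atomically numbered slots in a fixed ring, with a plain
 * refcount standing in for kref_get_unless_zero()/kref_put().
 */
#include <stdatomic.h>
#include <stdio.h>

#define NUM_SLOTS 8

struct slot {
	unsigned long long timestamp;
};

static struct slot ring[NUM_SLOTS];
static atomic_uint next_slot;
static atomic_int users;		/* analogue of f->profile.ref */

static struct slot *slot_acquire(void)
{
	int old = atomic_load(&users);

	/* analogue of kref_get_unless_zero(): refuse if profiling is off */
	do {
		if (old == 0)
			return NULL;
	} while (!atomic_compare_exchange_weak(&users, &old, old + 1));

	/* each caller gets its own slot; the ring wraps and overwrites */
	return &ring[atomic_fetch_add(&next_slot, 1) % NUM_SLOTS];
}

static void slot_release(void)
{
	/* the last put would free the backing buffers in the real code */
	atomic_fetch_sub(&users, 1);
}

int main(void)
{
	struct slot *s;
	int i;

	atomic_store(&users, 1);	/* "enable profiling" */

	for (i = 0; i < 12; i++) {
		s = slot_acquire();
		if (!s)
			break;
		s->timestamp = (unsigned long long)i;
		slot_release();
	}

	for (i = 0; i < NUM_SLOTS; i++)
		printf("slot %d holds sample %llu\n", i, ring[i].timestamp);

	return 0;
}
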
@@ -3316,6 +3355,168 @@ static const struct file_operations gk20a_fifo_sched_debugfs_fops = {
 	.release = seq_release
 };
 
+static void __gk20a_fifo_profile_free(struct kref *ref)
+{
+	struct fifo_gk20a *f = container_of(ref, struct fifo_gk20a,
+						profile.ref);
+	vfree(f->profile.data);
+	vfree(f->profile.sorted);
+}
+
+static int gk20a_fifo_profile_enable(void *data, u64 val)
+{
+	struct gk20a *g = (struct gk20a *) data;
+	struct fifo_gk20a *f = &g->fifo;
+
+
+	nvgpu_mutex_acquire(&f->profile.lock);
+	if (val == 0) {
+		if (f->profile.enabled) {
+			f->profile.enabled = false;
+			kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
+		}
+	} else {
+		if (!f->profile.enabled) {
+			/* not kref_init() here, as that would race if we
+			 * enable/disable/enable while a kickoff is happening
+			 */
+			if (!kref_get_unless_zero(&f->profile.ref)) {
+				f->profile.data = vzalloc(
+					FIFO_PROFILING_ENTRIES *
+					sizeof(struct fifo_profile_gk20a));
+				f->profile.sorted = vzalloc(
+					FIFO_PROFILING_ENTRIES *
+					sizeof(u64));
+				if (!(f->profile.data && f->profile.sorted)) {
+					vfree(f->profile.data);
+					vfree(f->profile.sorted);
+					nvgpu_mutex_release(&f->profile.lock);
+					return -ENOMEM;
+				}
+				kref_init(&f->profile.ref);
+			}
+			atomic_set(&f->profile.get, 0);
+			f->profile.enabled = true;
+		}
+	}
+	nvgpu_mutex_release(&f->profile.lock);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(
+	gk20a_fifo_profile_enable_debugfs_fops,
+	NULL,
+	gk20a_fifo_profile_enable,
+	"%llu\n"
+);
+
+static int __profile_cmp(const void *a, const void *b)
+{
+	return *((unsigned long long *) a) - *((unsigned long long *) b);
+}
+
+/*
+ * This uses about 800b in the stack, but the function using it is not part
+ * of a callstack where much memory is being used, so it is fine
+ */
+#define PERCENTILE_WIDTH	5
+#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)
+
+static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
+		u64 *percentiles, u32 index_end, u32 index_start)
+{
+	unsigned int nelem = 0;
+	unsigned int index;
+	struct fifo_profile_gk20a *profile;
+
+	for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
+		profile = &g->fifo.profile.data[index];
+
+		if (profile->timestamp[index_end] >
+				profile->timestamp[index_start]) {
+			/* This is a valid element */
+			g->fifo.profile.sorted[nelem] =
+					profile->timestamp[index_end] -
+					profile->timestamp[index_start];
+			nelem++;
+		}
+	}
+
+	/* sort it */
+	sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
+		__profile_cmp, NULL);
+
+	/* build ranges */
+	for (index = 0; index < PERCENTILE_RANGES; index++)
+		percentiles[index] =
+			g->fifo.profile.sorted[(PERCENTILE_WIDTH * index *
+				nelem)/100];
+	return nelem;
+}
+
+static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
+{
+	struct gk20a *g = s->private;
+	unsigned int get, nelem, index;
+	/*
+	 * 800B in the stack, but function is declared statically and only
+	 * called from debugfs handler
+	 */
+	u64 percentiles_ioctl[PERCENTILE_RANGES];
+	u64 percentiles_kickoff[PERCENTILE_RANGES];
+	u64 percentiles_jobtracking[PERCENTILE_RANGES];
+	u64 percentiles_append[PERCENTILE_RANGES];
+	u64 percentiles_userd[PERCENTILE_RANGES];
+
+	if (!kref_get_unless_zero(&g->fifo.profile.ref)) {
+		seq_printf(s, "Profiling disabled\n");
+		return 0;
+	}
+
+	get = atomic_read(&g->fifo.profile.get);
+
+	__gk20a_fifo_create_stats(g, percentiles_ioctl,
+		PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_kickoff,
+		PROFILE_END, PROFILE_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_jobtracking,
+		PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_append,
+		PROFILE_APPEND, PROFILE_JOB_TRACKING);
+	nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
+		PROFILE_END, PROFILE_APPEND);
+
+	seq_printf(s, "Number of kickoffs: %d\n", nelem);
+	seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
+
+	for (index = 0; index < PERCENTILE_RANGES; index++)
+		seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
+			PERCENTILE_WIDTH * (index+1),
+			percentiles_ioctl[index],
+			percentiles_kickoff[index],
+			percentiles_append[index],
+			percentiles_jobtracking[index],
+			percentiles_userd[index]);
+
+	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+
+	return 0;
+}
+
+static int gk20a_fifo_profile_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, gk20a_fifo_profile_stats, inode->i_private);
+}
+
+static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
+	.open		= gk20a_fifo_profile_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
 void gk20a_fifo_debugfs_init(struct device *dev)
 {
 	struct gk20a_platform *platform = dev_get_drvdata(dev);
@@ -3323,6 +3524,8 @@ void gk20a_fifo_debugfs_init(struct device *dev)
 	struct dentry *gpu_root = platform->debugfs;
 	struct dentry *fifo_root;
 
+	struct dentry *profile_root;
+
 	fifo_root = debugfs_create_dir("fifo", gpu_root);
 
 	if (IS_ERR_OR_NULL(fifo_root))
@@ -3333,6 +3536,21 @@ void gk20a_fifo_debugfs_init(struct device *dev)
 	debugfs_create_file("sched", 0600, fifo_root, g,
 		&gk20a_fifo_sched_debugfs_fops);
 
+	profile_root = debugfs_create_dir("profile", fifo_root);
+	if (IS_ERR_OR_NULL(profile_root))
+		return;
+
+	nvgpu_mutex_init(&g->fifo.profile.lock);
+	g->fifo.profile.enabled = false;
+	atomic_set(&g->fifo.profile.get, 0);
+	atomic_set(&g->fifo.profile.ref.refcount, 0);
+
+	debugfs_create_file("enable", 0600, profile_root, g,
+		&gk20a_fifo_profile_enable_debugfs_fops);
+
+	debugfs_create_file("stats", 0600, profile_root, g,
+		&gk20a_fifo_profile_stats_debugfs_fops);
+
 }
 
 #endif /* CONFIG_DEBUG_FS */
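
The percentile pass above keeps only samples whose end timestamp is newer
than their start timestamp, sorts the deltas, and then reads one value per
5% step out of the sorted array. Below is a user-space sketch of the same
indexing with made-up data; qsort() stands in for the kernel's sort(). Note
that the comparator here returns -1/0/1 instead of a raw subtraction, since
a u64 difference does not fit in an int.

/*
 * Sketch only: build a 5%-step percentile table from a sorted array of
 * durations, mirroring the indexing in __gk20a_fifo_create_stats().
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PERCENTILE_WIDTH	5
#define PERCENTILE_RANGES	(100 / PERCENTILE_WIDTH)

static int cmp_u64(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a;
	uint64_t y = *(const uint64_t *)b;

	return (x > y) - (x < y);
}

int main(void)
{
	uint64_t samples[1000];
	uint64_t percentiles[PERCENTILE_RANGES];
	unsigned int i, nelem = 0;

	/* fabricate some "durations" in ns; the driver diffs two timestamps */
	for (i = 0; i < 1000; i++)
		samples[nelem++] = 1000 + (uint64_t)rand() % 50000;

	qsort(samples, nelem, sizeof(samples[0]), cmp_u64);

	/* same indexing as the driver: one value per 5% of the population */
	for (i = 0; i < PERCENTILE_RANGES; i++)
		percentiles[i] = samples[(PERCENTILE_WIDTH * i * nelem) / 100];

	for (i = 0; i < PERCENTILE_RANGES; i++)
		printf("[%2upc] %llu ns\n", PERCENTILE_WIDTH * (i + 1),
			(unsigned long long)percentiles[i]);

	return 0;
}
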
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 147d1bea..75c801c6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -30,6 +30,15 @@
 #define FIFO_INVAL_CHANNEL_ID		((u32)~0)
 #define FIFO_INVAL_TSG_ID		((u32)~0)
 
+/*
+ * Number of entries in the kickoff latency buffer, used to calculate
+ * the profiling and histogram. This number is chosen so that the
+ * histogram is statistically significant at a 5% step.
+ */
+#ifdef CONFIG_DEBUG_FS
+#define FIFO_PROFILING_ENTRIES	16384
+#endif
+
 /* generally corresponds to the "pbdma" engine */
 
 struct fifo_runlist_info_gk20a {
@@ -99,6 +108,20 @@ struct fifo_engine_info_gk20a {
 
 };
 
+enum {
+	PROFILE_IOCTL_ENTRY = 0,
+	PROFILE_ENTRY,
+	PROFILE_JOB_TRACKING,
+	PROFILE_APPEND,
+	PROFILE_END,
+	PROFILE_IOCTL_EXIT,
+	PROFILE_MAX
+};
+
+struct fifo_profile_gk20a {
+	u64 timestamp[PROFILE_MAX];
+};
+
 struct fifo_gk20a {
 	struct gk20a *g;
 	unsigned int num_channels;
@@ -115,7 +138,16 @@ struct fifo_gk20a {
 	struct fifo_runlist_info_gk20a *runlist_info;
 	u32 max_runlists;
 
-
+#ifdef CONFIG_DEBUG_FS
+	struct {
+		struct fifo_profile_gk20a *data;
+		atomic_t get;
+		bool enabled;
+		u64 *sorted;
+		struct kref ref;
+		struct nvgpu_mutex lock;
+	} profile;
+#endif
 	struct mem_desc userd;
 	u32 userd_entry_size;
 
@@ -275,5 +307,10 @@ void gk20a_get_ch_runlist_entry(struct channel_gk20a *ch, u32 *runlist);
 u32 gk20a_userd_gp_get(struct gk20a *g, struct channel_gk20a *c);
 void gk20a_userd_gp_put(struct gk20a *g, struct channel_gk20a *c);
 bool gk20a_is_fault_engine_subid_gpc(struct gk20a *g, u32 engine_subid);
+#ifdef CONFIG_DEBUG_FS
+struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g);
+void gk20a_fifo_profile_release(struct gk20a *g,
+	struct fifo_profile_gk20a *profile);
+#endif
 
 #endif /*__GR_GK20A_H__*/
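
For completeness, a rough sketch of how the new nodes might be driven from
user space. The debugfs mount point and the per-GPU directory name below are
assumptions that vary by platform; only the fifo/profile/enable and
fifo/profile/stats leaf names come from this patch.

/*
 * Sketch only: enable kickoff profiling, run a workload, dump the
 * percentile table, then disable profiling again. PROFILE_DIR is an
 * assumed path, not defined by the driver.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define PROFILE_DIR "/sys/kernel/debug/gpu.0/fifo/profile"

static int write_enable(const char *val)
{
	int fd = open(PROFILE_DIR "/enable", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, 1) != 1) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	if (write_enable("1"))		/* start collecting kickoff samples */
		return 1;

	sleep(10);			/* run the GPU workload of interest here */

	fd = open(PROFILE_DIR "/stats", O_RDONLY);
	if (fd >= 0) {
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
		close(fd);
	}

	write_enable("0");		/* drop the profiling buffers again */
	return 0;
}
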