From b9feba6efc48743da70e474d40b7889a7efb4ba5 Mon Sep 17 00:00:00 2001
From: David Nieto
Date: Tue, 21 Feb 2017 15:36:49 -0800
Subject: gpu: nvgpu: in-kernel kickoff profiling

Add a debugfs interface to profile the kickoff ioctl. It reports the
probability distribution and breaks the time down into: the full ioctl,
the kickoff function itself, the time spent in job tracking, and the
time spent on pushbuffer copies.

JIRA: EVLR-1003

Change-Id: I9888b114c3fbced61b1cf134c79f7a8afce15f56
Signed-off-by: David Nieto
Reviewed-on: http://git-master/r/1308997
Reviewed-by: svccoveritychecker
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c     |   3 +-
 drivers/gpu/nvgpu/gk20a/ce2_gk20a.c     |   2 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c |  31 ++++-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |   4 +-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    | 218 ++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  39 +++++-
 6 files changed, 289 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index d43bc93f..d19479a2 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -727,7 +727,8 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 	}
 
 	return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL,
-			num_entries, flags, fence, fence_out, true);
+			num_entries, flags, fence, fence_out, true,
+			NULL);
 }
 
 static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx)
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index fd248313..db1ac539 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -653,7 +653,7 @@ int gk20a_ce_execute_ops(struct device *dev,
 
 	ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
 			1, submit_flags, &fence,
-			&ce_cmd_buf_fence_out, false);
+			&ce_cmd_buf_fence_out, false, NULL);
 
 	if (!ret) {
 		memcpy((void *)(cmd_buf_cpu_va + fence_index),
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 68e43259..f58b208c 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -2987,7 +2987,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 flags,
 				struct nvgpu_fence *fence,
 				struct gk20a_fence **fence_out,
-				bool force_need_sync_fence)
+				bool force_need_sync_fence,
+				struct fifo_profile_gk20a *profile)
 {
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
@@ -3036,6 +3037,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		return -EINVAL;
 	}
 
+	if (profile)
+		profile->timestamp[PROFILE_ENTRY] = sched_clock();
+
 #ifdef CONFIG_DEBUG_FS
 	/* update debug settings */
 	if (g->ops.ltc.sync_debugfs)
@@ -3162,6 +3166,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		goto clean_up_job;
 	}
 
+	if (profile)
+		profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
+
 	if (wait_cmd)
 		gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
 
@@ -3184,6 +3191,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	if (need_job_tracking)
 		/* TODO! Check for errors... */
 		gk20a_channel_add_job(c, job, skip_buffer_refcounting);
+	if (profile)
+		profile->timestamp[PROFILE_APPEND] = sched_clock();
 
 	g->ops.fifo.userd_gp_put(g, c);
 
@@ -3197,6 +3206,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	gk20a_dbg_info("post-submit put %d, get %d, size %d",
 		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
 
+	if (profile)
+		profile->timestamp[PROFILE_END] = sched_clock();
 	gk20a_dbg_fn("done");
 	return err;
 
@@ -3789,15 +3800,22 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	struct nvgpu_submit_gpfifo_args *args)
 {
 	struct gk20a_fence *fence_out;
+	struct fifo_profile_gk20a *profile = NULL;
+
 	int ret = 0;
 	gk20a_dbg_fn("");
 
+#ifdef CONFIG_DEBUG_FS
+	profile = gk20a_fifo_profile_acquire(ch->g);
+
+	if (profile)
+		profile->timestamp[PROFILE_IOCTL_ENTRY] = sched_clock();
+#endif
 	if (ch->has_timedout)
 		return -ETIMEDOUT;
-
 	ret = gk20a_submit_channel_gpfifo(ch, NULL, args,
 		args->num_entries, args->flags, &args->fence,
-		&fence_out, false);
+		&fence_out, false, profile);
 
 	if (ret)
 		goto clean_up;
@@ -3816,7 +3834,12 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 		}
 	}
 	gk20a_fence_put(fence_out);
-
+#ifdef CONFIG_DEBUG_FS
+	if (profile) {
+		profile->timestamp[PROFILE_IOCTL_EXIT] = sched_clock();
+		gk20a_fifo_profile_release(ch->g, profile);
+	}
+#endif
 clean_up:
 	return ret;
 }
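
The submit-path hunks above only record raw per-stage timestamps; the stats
file added later in this patch derives the reported intervals from them:
ioctl = IOCTL_EXIT - IOCTL_ENTRY, kickoff = END - ENTRY, jobtrack =
JOB_TRACKING - IOCTL_ENTRY, pbcopy = APPEND - JOB_TRACKING and userd =
END - APPEND. Below is a minimal user-space sketch of that staging;
clock_gettime() stands in for the kernel's sched_clock() and do_work() is a
made-up placeholder for the real work between stages, so neither is part of
the patch.

/*
 * Sketch only: the same six PROFILE_* stages, with the intervals derived
 * by subtracting pairs of timestamps the way the stats file does.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum {
	PROFILE_IOCTL_ENTRY = 0,
	PROFILE_ENTRY,
	PROFILE_JOB_TRACKING,
	PROFILE_APPEND,
	PROFILE_END,
	PROFILE_IOCTL_EXIT,
	PROFILE_MAX
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void do_work(void)
{
	volatile int i;

	for (i = 0; i < 100000; i++)
		;
}

static void print_delta(const char *name, uint64_t end, uint64_t start)
{
	printf("%-8s %llu ns\n", name, (unsigned long long)(end - start));
}

int main(void)
{
	uint64_t t[PROFILE_MAX];

	t[PROFILE_IOCTL_ENTRY] = now_ns();	/* ioctl handler entered */
	do_work();
	t[PROFILE_ENTRY] = now_ns();		/* submit function entered */
	do_work();
	t[PROFILE_JOB_TRACKING] = now_ns();	/* job tracking set up */
	do_work();
	t[PROFILE_APPEND] = now_ns();		/* gpfifo entries appended */
	do_work();
	t[PROFILE_END] = now_ns();		/* submit function done */
	do_work();
	t[PROFILE_IOCTL_EXIT] = now_ns();	/* ioctl about to return */

	print_delta("ioctl", t[PROFILE_IOCTL_EXIT], t[PROFILE_IOCTL_ENTRY]);
	print_delta("kickoff", t[PROFILE_END], t[PROFILE_ENTRY]);
	print_delta("jobtrack", t[PROFILE_JOB_TRACKING], t[PROFILE_IOCTL_ENTRY]);
	print_delta("pbcopy", t[PROFILE_APPEND], t[PROFILE_JOB_TRACKING]);
	print_delta("userd", t[PROFILE_END], t[PROFILE_APPEND]);
	return 0;
}
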
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index d9913cd7..42550632 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -33,6 +33,7 @@ struct gk20a;
 struct gr_gk20a;
 struct dbg_session_gk20a;
 struct gk20a_fence;
+struct fifo_profile_gk20a;
 
 #include "channel_sync_gk20a.h"
 
@@ -344,7 +345,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 flags,
 				struct nvgpu_fence *fence,
 				struct gk20a_fence **fence_out,
-				bool force_need_sync_fence);
+				bool force_need_sync_fence,
+				struct fifo_profile_gk20a *profile);
 
 int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 			struct nvgpu_alloc_gpfifo_ex_args *args);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index d072fb48..35d56ce4 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <linux/sort.h>
 #include
 #include
@@ -46,6 +47,10 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    bool wait_for_finish);
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 
+#ifdef CONFIG_DEBUG_FS
+static void __gk20a_fifo_profile_free(struct kref *ref);
+#endif
+
 u32 gk20a_fifo_get_engine_ids(struct gk20a *g,
 		u32 engine_id[], u32 engine_id_sz,
 		u32 engine_enum)
@@ -532,6 +537,14 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
 	f->engine_info = NULL;
 	kfree(f->active_engines_list);
 	f->active_engines_list = NULL;
+#ifdef CONFIG_DEBUG_FS
+	nvgpu_mutex_acquire(&f->profile.lock);
+	if (f->profile.enabled) {
+		f->profile.enabled = false;
+		kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
+	}
+	nvgpu_mutex_release(&f->profile.lock);
+#endif
 }
 
 /* reads info from hardware and fills in pbmda exception info record */
@@ -3203,6 +3216,32 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
 }
 
 #ifdef CONFIG_DEBUG_FS
+
+/* Get the next element in the ring buffer of profile entries
+ * and grab a reference to the structure
+ */
+struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	struct fifo_profile_gk20a *profile;
+	unsigned int index;
+
+	/* If kref is zero, profiling is not enabled */
+	if (!kref_get_unless_zero(&f->profile.ref))
+		return NULL;
+	index = atomic_inc_return(&f->profile.get);
+	profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
+
+	return profile;
+}
+
+/* Free the reference to the structure. This allows deferred cleanups */
+void gk20a_fifo_profile_release(struct gk20a *g,
+	struct fifo_profile_gk20a *profile)
+{
+	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+}
+
 static void *gk20a_fifo_sched_debugfs_seq_start(
 	struct seq_file *s, loff_t *pos)
 {
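
gk20a_fifo_profile_acquire() above hands every submit its own slot out of a
fixed ring: the kref only gates whether profiling is enabled and keeps the
backing buffers alive, while the atomic "get" counter picks the slot, so old
samples are simply overwritten once the ring wraps. Below is a small
user-space analogue of that pattern using C11 atomics; the names
(slot_acquire, slot_release, NUM_SLOTS, users) are illustrative and not taken
from the driver.

/*
 * Sketch only: atomically numbered slots in a fixed ring, with a plain
 * refcount standing in for kref_get_unless_zero()/kref_put().
 */
#include <stdatomic.h>
#include <stdio.h>

#define NUM_SLOTS 8

struct slot {
	unsigned long long timestamp;
};

static struct slot ring[NUM_SLOTS];
static atomic_uint next_slot;
static atomic_int users;		/* analogue of f->profile.ref */

static struct slot *slot_acquire(void)
{
	int old = atomic_load(&users);

	/* analogue of kref_get_unless_zero(): refuse if profiling is off */
	do {
		if (old == 0)
			return NULL;
	} while (!atomic_compare_exchange_weak(&users, &old, old + 1));

	/* each caller gets its own slot; the ring wraps and overwrites */
	return &ring[atomic_fetch_add(&next_slot, 1) % NUM_SLOTS];
}

static void slot_release(void)
{
	/* the last put would free the backing buffers in the real code */
	atomic_fetch_sub(&users, 1);
}

int main(void)
{
	struct slot *s;
	int i;

	atomic_store(&users, 1);	/* "enable profiling" */

	for (i = 0; i < 12; i++) {
		s = slot_acquire();
		if (!s)
			break;
		s->timestamp = (unsigned long long)i;
		slot_release();
	}

	for (i = 0; i < NUM_SLOTS; i++)
		printf("slot %d holds sample %llu\n", i, ring[i].timestamp);

	return 0;
}
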
@@ -3316,6 +3355,168 @@ static const struct file_operations gk20a_fifo_sched_debugfs_fops = {
 	.release = seq_release
 };
 
+static void __gk20a_fifo_profile_free(struct kref *ref)
+{
+	struct fifo_gk20a *f = container_of(ref, struct fifo_gk20a,
+						profile.ref);
+	vfree(f->profile.data);
+	vfree(f->profile.sorted);
+}
+
+static int gk20a_fifo_profile_enable(void *data, u64 val)
+{
+	struct gk20a *g = (struct gk20a *) data;
+	struct fifo_gk20a *f = &g->fifo;
+
+
+	nvgpu_mutex_acquire(&f->profile.lock);
+	if (val == 0) {
+		if (f->profile.enabled) {
+			f->profile.enabled = false;
+			kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
+		}
+	} else {
+		if (!f->profile.enabled) {
+			/* not kref_init() here, as that would race if we
+			 * enable/disable/enable while a kickoff is happening
+			 */
+			if (!kref_get_unless_zero(&f->profile.ref)) {
+				f->profile.data = vzalloc(
+					FIFO_PROFILING_ENTRIES *
+					sizeof(struct fifo_profile_gk20a));
+				f->profile.sorted = vzalloc(
+					FIFO_PROFILING_ENTRIES *
+					sizeof(u64));
+				if (!(f->profile.data && f->profile.sorted)) {
+					vfree(f->profile.data);
+					vfree(f->profile.sorted);
+					nvgpu_mutex_release(&f->profile.lock);
+					return -ENOMEM;
+				}
+				kref_init(&f->profile.ref);
+			}
+			atomic_set(&f->profile.get, 0);
+			f->profile.enabled = true;
+		}
+	}
+	nvgpu_mutex_release(&f->profile.lock);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(
+	gk20a_fifo_profile_enable_debugfs_fops,
+	NULL,
+	gk20a_fifo_profile_enable,
+	"%llu\n"
+);
+
+static int __profile_cmp(const void *a, const void *b)
+{
+	return *((unsigned long long *) a) - *((unsigned long long *) b);
+}
+
+/*
+ * This uses about 800b in the stack, but the function using it is not part
+ * of a callstack where much memory is being used, so it is fine
+ */
+#define PERCENTILE_WIDTH	5
+#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)
+
+static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
+		u64 *percentiles, u32 index_end, u32 index_start)
+{
+	unsigned int nelem = 0;
+	unsigned int index;
+	struct fifo_profile_gk20a *profile;
+
+	for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
+		profile = &g->fifo.profile.data[index];
+
+		if (profile->timestamp[index_end] >
+				profile->timestamp[index_start]) {
+			/* This is a valid element */
+			g->fifo.profile.sorted[nelem] =
+					profile->timestamp[index_end] -
+					profile->timestamp[index_start];
+			nelem++;
+		}
+	}
+
+	/* sort it */
+	sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
+		__profile_cmp, NULL);
+
+	/* build ranges */
+	for (index = 0; index < PERCENTILE_RANGES; index++)
+		percentiles[index] =
+			g->fifo.profile.sorted[(PERCENTILE_WIDTH * index *
+				nelem)/100];
+	return nelem;
+}
+
+static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
+{
+	struct gk20a *g = s->private;
+	unsigned int get, nelem, index;
+	/*
+	 * 800B in the stack, but function is declared statically and only
+	 * called from debugfs handler
+	 */
+	u64 percentiles_ioctl[PERCENTILE_RANGES];
+	u64 percentiles_kickoff[PERCENTILE_RANGES];
+	u64 percentiles_jobtracking[PERCENTILE_RANGES];
+	u64 percentiles_append[PERCENTILE_RANGES];
+	u64 percentiles_userd[PERCENTILE_RANGES];
+
+	if (!kref_get_unless_zero(&g->fifo.profile.ref)) {
+		seq_printf(s, "Profiling disabled\n");
+		return 0;
+	}
+
+	get = atomic_read(&g->fifo.profile.get);
+
+	__gk20a_fifo_create_stats(g, percentiles_ioctl,
+		PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_kickoff,
+		PROFILE_END, PROFILE_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_jobtracking,
+		PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_append,
+		PROFILE_APPEND, PROFILE_JOB_TRACKING);
+	nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
+		PROFILE_END, PROFILE_APPEND);
+
+	seq_printf(s, "Number of kickoffs: %d\n", nelem);
+	seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
+
+	for (index = 0; index < PERCENTILE_RANGES; index++)
+		seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
+			PERCENTILE_WIDTH * (index+1),
+			percentiles_ioctl[index],
+			percentiles_kickoff[index],
+			percentiles_append[index],
+			percentiles_jobtracking[index],
+			percentiles_userd[index]);
+
+	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+
+	return 0;
+}
+
+static int gk20a_fifo_profile_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, gk20a_fifo_profile_stats, inode->i_private);
+}
+
+static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
+	.open		= gk20a_fifo_profile_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
 void gk20a_fifo_debugfs_init(struct device *dev)
 {
 	struct gk20a_platform *platform = dev_get_drvdata(dev);
@@ -3323,6 +3524,8 @@ void gk20a_fifo_debugfs_init(struct device *dev)
 	struct dentry *gpu_root = platform->debugfs;
 	struct dentry *fifo_root;
 
+	struct dentry *profile_root;
+
 	fifo_root = debugfs_create_dir("fifo", gpu_root);
 
 	if (IS_ERR_OR_NULL(fifo_root))
@@ -3333,6 +3536,21 @@ void gk20a_fifo_debugfs_init(struct device *dev)
 	debugfs_create_file("sched", 0600, fifo_root, g,
 		&gk20a_fifo_sched_debugfs_fops);
 
+	profile_root = debugfs_create_dir("profile", fifo_root);
+	if (IS_ERR_OR_NULL(profile_root))
+		return;
+
+	nvgpu_mutex_init(&g->fifo.profile.lock);
+	g->fifo.profile.enabled = false;
+	atomic_set(&g->fifo.profile.get, 0);
+	atomic_set(&g->fifo.profile.ref.refcount, 0);
+
+	debugfs_create_file("enable", 0600, profile_root, g,
+		&gk20a_fifo_profile_enable_debugfs_fops);
+
+	debugfs_create_file("stats", 0600, profile_root, g,
+		&gk20a_fifo_profile_stats_debugfs_fops);
+
 }
 
 #endif /* CONFIG_DEBUG_FS */
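
The percentile pass above keeps only samples whose end timestamp is newer
than their start timestamp, sorts the deltas, and then reads one value per
5% step out of the sorted array. Below is a user-space sketch of the same
indexing with made-up data; qsort() stands in for the kernel's sort(). Note
that the comparator here returns -1/0/1 instead of a raw subtraction, since
a u64 difference does not fit in an int.

/*
 * Sketch only: build a 5%-step percentile table from a sorted array of
 * durations, mirroring the indexing in __gk20a_fifo_create_stats().
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PERCENTILE_WIDTH	5
#define PERCENTILE_RANGES	(100 / PERCENTILE_WIDTH)

static int cmp_u64(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a;
	uint64_t y = *(const uint64_t *)b;

	return (x > y) - (x < y);
}

int main(void)
{
	uint64_t samples[1000];
	uint64_t percentiles[PERCENTILE_RANGES];
	unsigned int i, nelem = 0;

	/* fabricate some "durations" in ns; the driver diffs two timestamps */
	for (i = 0; i < 1000; i++)
		samples[nelem++] = 1000 + (uint64_t)rand() % 50000;

	qsort(samples, nelem, sizeof(samples[0]), cmp_u64);

	/* same indexing as the driver: one value per 5% of the population */
	for (i = 0; i < PERCENTILE_RANGES; i++)
		percentiles[i] = samples[(PERCENTILE_WIDTH * i * nelem) / 100];

	for (i = 0; i < PERCENTILE_RANGES; i++)
		printf("[%2upc] %llu ns\n", PERCENTILE_WIDTH * (i + 1),
			(unsigned long long)percentiles[i]);

	return 0;
}
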
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 147d1bea..75c801c6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -30,6 +30,15 @@
 #define FIFO_INVAL_CHANNEL_ID		((u32)~0)
 #define FIFO_INVAL_TSG_ID		((u32)~0)
 
+/*
+ * Number of entries in the kickoff latency buffer, used to calculate
+ * the profiling and histogram. This number is chosen so that the
+ * histogram is statistically significant at a 5% step.
+ */
+#ifdef CONFIG_DEBUG_FS
+#define FIFO_PROFILING_ENTRIES	16384
+#endif
+
 /* generally corresponds to the "pbdma" engine */
 
 struct fifo_runlist_info_gk20a {
@@ -99,6 +108,20 @@ struct fifo_engine_info_gk20a {
 
 };
 
+enum {
+	PROFILE_IOCTL_ENTRY = 0,
+	PROFILE_ENTRY,
+	PROFILE_JOB_TRACKING,
+	PROFILE_APPEND,
+	PROFILE_END,
+	PROFILE_IOCTL_EXIT,
+	PROFILE_MAX
+};
+
+struct fifo_profile_gk20a {
+	u64 timestamp[PROFILE_MAX];
+};
+
 struct fifo_gk20a {
 	struct gk20a *g;
 	unsigned int num_channels;
@@ -115,7 +138,16 @@ struct fifo_gk20a {
 	struct fifo_runlist_info_gk20a *runlist_info;
 	u32 max_runlists;
 
-
+#ifdef CONFIG_DEBUG_FS
+	struct {
+		struct fifo_profile_gk20a *data;
+		atomic_t get;
+		bool enabled;
+		u64 *sorted;
+		struct kref ref;
+		struct nvgpu_mutex lock;
+	} profile;
+#endif
 	struct mem_desc userd;
 	u32 userd_entry_size;
 
@@ -275,5 +307,10 @@ void gk20a_get_ch_runlist_entry(struct channel_gk20a *ch, u32 *runlist);
 u32 gk20a_userd_gp_get(struct gk20a *g, struct channel_gk20a *c);
 void gk20a_userd_gp_put(struct gk20a *g, struct channel_gk20a *c);
 bool gk20a_is_fault_engine_subid_gpc(struct gk20a *g, u32 engine_subid);
+#ifdef CONFIG_DEBUG_FS
+struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g);
+void gk20a_fifo_profile_release(struct gk20a *g,
+	struct fifo_profile_gk20a *profile);
+#endif
 
 #endif /*__GR_GK20A_H__*/
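
For completeness, a rough sketch of how the new nodes might be driven from
user space. The debugfs mount point and the per-GPU directory name below are
assumptions that vary by platform; only the fifo/profile/enable and
fifo/profile/stats leaf names come from this patch.

/*
 * Sketch only: enable kickoff profiling, run a workload, dump the
 * percentile table, then disable profiling again. PROFILE_DIR is an
 * assumed path, not defined by the driver.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define PROFILE_DIR "/sys/kernel/debug/gpu.0/fifo/profile"

static int write_enable(const char *val)
{
	int fd = open(PROFILE_DIR "/enable", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, 1) != 1) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	if (write_enable("1"))		/* start collecting kickoff samples */
		return 1;

	sleep(10);			/* run the GPU workload of interest here */

	fd = open(PROFILE_DIR "/stats", O_RDONLY);
	if (fd >= 0) {
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
		close(fd);
	}

	write_enable("0");		/* drop the profiling buffers again */
	return 0;
}
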