From 2b064ce65e0035a860d1bc3bcccfcf8aac1f31c7 Mon Sep 17 00:00:00 2001 From: Peter Pipkorn Date: Mon, 28 Sep 2015 13:49:53 +0200 Subject: gpu: nvgpu: add high priority channel interleave MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interleave all high priority channels between all other channels. This reduces the latency for high priority work when there is a lot of lower priority work present, imposing an upper bound on the latency. Change the default high priority timeslice from 5.2ms to 3.0ms in the process, to prevent long running high priority apps from hogging the GPU too much. Introduce a new debugfs node to enable/disable high priority channel interleaving. It is currently enabled by default. Add a new runlist length max register, used for allocating a suitably sized runlist. Limit the number of interleaved channels to 32. This change reduces the maximum time a lower priority job is running (one timeslice) before we check that high priority jobs are running. Tested with gles2_context_priority (still passes) Basic sanity testing is done with graphics_submit (one app is high priority) Also more functional testing using lots of parallel runs with: NVRM_GPU_CHANNEL_PRIORITY=3 ./gles2_expensive_draw --drawsperframe 20000 --triangles 50 --runtime 30 --finish plus multiple: NVRM_GPU_CHANNEL_PRIORITY=2 ./gles2_expensive_draw --drawsperframe 20000 --triangles 50 --runtime 30 --finish Prior to this change, the relative performance between high priority work and normal priority work comes down to timeslice value. This means that when there are many low priority channels, the high priority work will still drop quite a lot. But with this change, the high priority work will roughly get about half the entire GPU time, meaning that after the initial lower performance, it is less likely to get lower in performance due to more apps running on the system. This change makes a large step towards real priority levels. 
It is not perfect and there are no guarantees on anything, but it is a step forwards without any additional CPU overhead or other complications. It will also serve as a baseline to judge other algorithms against. Support for priorities with TSG is future work. Support for interleave mid + high priority channels, instead of just high, is also future work. Bug 1419900 Change-Id: I0f7d0ce83b6598fe86000577d72e14d312fdad98 Signed-off-by: Peter Pipkorn Reviewed-on: http://git-master/r/805961 Reviewed-by: Terje Bergstrom Tested-by: Terje Bergstrom --- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 50 +++++++++++- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 3 + drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 133 +++++++++++++++++++++++++++++--- drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | 4 + drivers/gpu/nvgpu/gk20a/gk20a.c | 16 +++- drivers/gpu/nvgpu/gk20a/gk20a.h | 14 +++- drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h | 4 + 7 files changed, 206 insertions(+), 18 deletions(-) (limited to 'drivers/gpu/nvgpu/gk20a') diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index a5c2efb3..0421c0f6 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -175,7 +175,7 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g, } static int channel_gk20a_set_schedule_params(struct channel_gk20a *c, - u32 timeslice_period) + u32 timeslice_period, bool interleave) { void *inst_ptr; int shift = 0, value = 0; @@ -203,6 +203,30 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c, gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) | ccsr_channel_enable_set_true_f()); + if (c->interleave != interleave) { + mutex_lock(&c->g->interleave_lock); + c->interleave = interleave; + if (interleave) + if (c->g->num_interleaved_channels >= + MAX_INTERLEAVED_CHANNELS) { + gk20a_err(dev_from_gk20a(c->g), + "Change of priority would exceed runlist length, only changing timeslice\n"); + c->interleave = false; + } else + 
c->g->num_interleaved_channels += 1; + else + c->g->num_interleaved_channels -= 1; + + mutex_unlock(&c->g->interleave_lock); + gk20a_dbg_info("Set channel %d to interleave %d", + c->hw_chid, c->interleave); + + gk20a_fifo_set_channel_priority( + c->g, 0, c->hw_chid, c->interleave); + c->g->ops.fifo.update_runlist( + c->g, 0, ~0, true, false); + } + return 0; } @@ -836,6 +860,17 @@ static void gk20a_free_channel(struct channel_gk20a *ch) } mutex_unlock(&f->deferred_reset_mutex); + if (ch->interleave) { + ch->interleave = false; + gk20a_fifo_set_channel_priority( + ch->g, 0, ch->hw_chid, ch->interleave); + + mutex_lock(&f->g->interleave_lock); + WARN_ON(f->g->num_interleaved_channels == 0); + f->g->num_interleaved_channels -= 1; + mutex_unlock(&f->g->interleave_lock); + } + if (!ch->bound) goto release; @@ -1079,6 +1114,10 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g) ch->timeout_debug_dump = true; ch->has_timedout = false; ch->obj_class = 0; + ch->interleave = false; + gk20a_fifo_set_channel_priority( + ch->g, 0, ch->hw_chid, ch->interleave); + /* The channel is *not* runnable at this point. It still needs to have * an address space bound and allocate a gpfifo and grctx. 
*/ @@ -2458,6 +2497,7 @@ static int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority) { u32 timeslice_timeout; + bool interleave = false; if (gk20a_is_channel_marked_as_tsg(ch)) { gk20a_err(dev_from_gk20a(ch->g), @@ -2474,15 +2514,17 @@ static int gk20a_channel_set_priority(struct channel_gk20a *ch, timeslice_timeout = ch->g->timeslice_medium_priority_us; break; case NVGPU_PRIORITY_HIGH: + if (ch->g->interleave_high_priority) + interleave = true; timeslice_timeout = ch->g->timeslice_high_priority_us; break; default: pr_err("Unsupported priority"); return -EINVAL; } - channel_gk20a_set_schedule_params(ch, - timeslice_timeout); - return 0; + + return channel_gk20a_set_schedule_params(ch, + timeslice_timeout, interleave); } static int gk20a_channel_zcull_bind(struct channel_gk20a *ch, diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index ddc517b9..91ae0e7a 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -180,6 +180,9 @@ struct channel_gk20a { void *update_fn_data; spinlock_t update_fn_lock; /* make access to the two above atomic */ struct work_struct update_fn_work; + + /* true if channel is interleaved with lower priority channels */ + bool interleave; }; static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch) diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 5c99877b..ca5c0ee6 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -303,7 +303,13 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f) if (!runlist->active_tsgs) goto clean_up_runlist_info; - runlist_size = ram_rl_entry_size_v() * f->num_channels; + runlist->high_prio_channels = + kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE), + GFP_KERNEL); + if (!runlist->high_prio_channels) + goto clean_up_runlist_info; + + runlist_size = ram_rl_entry_size_v() * f->num_runlist_entries; 
for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]); if (err) { @@ -324,10 +330,16 @@ clean_up_runlist: for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) gk20a_gmmu_free(g, &runlist->mem[i]); +clean_up_runlist_info: kfree(runlist->active_channels); runlist->active_channels = NULL; -clean_up_runlist_info: + kfree(runlist->active_tsgs); + runlist->active_tsgs = NULL; + + kfree(runlist->high_prio_channels); + runlist->high_prio_channels = NULL; + kfree(f->runlist_info); f->runlist_info = NULL; @@ -483,6 +495,7 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) gk20a_init_fifo_pbdma_intr_descs(f); /* just filling in data/tables */ f->num_channels = g->ops.fifo.get_num_fifos(g); + f->num_runlist_entries = fifo_eng_runlist_length_max_v(); f->num_pbdma = proj_host_num_pbdma_v(); f->max_engines = ENGINE_INVAL_GK20A; @@ -2149,6 +2162,34 @@ static inline u32 gk20a_get_tsg_runlist_entry_0(struct tsg_gk20a *tsg) return runlist_entry_0; } +/* add all active high priority channels */ +static inline u32 gk20a_fifo_runlist_add_high_prio_entries( + struct fifo_gk20a *f, + struct fifo_runlist_info_gk20a *runlist, + u32 *runlist_entry) +{ + struct channel_gk20a *ch = NULL; + unsigned long high_prio_chid; + u32 count = 0; + + for_each_set_bit(high_prio_chid, + runlist->high_prio_channels, f->num_channels) { + ch = &f->channel[high_prio_chid]; + + if (!gk20a_is_channel_marked_as_tsg(ch) && + test_bit(high_prio_chid, runlist->active_channels) == 1) { + gk20a_dbg_info("add high prio channel %lu to runlist", + high_prio_chid); + runlist_entry[0] = ram_rl_entry_chid_f(high_prio_chid); + runlist_entry[1] = 0; + runlist_entry += 2; + count++; + } + } + + return count; +} + static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, u32 hw_chid, bool add, bool wait_for_finish) @@ -2158,7 +2199,7 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, struct fifo_runlist_info_gk20a *runlist = NULL; u32 
*runlist_entry_base = NULL; u32 *runlist_entry = NULL; - phys_addr_t runlist_pa; + u64 runlist_iova; u32 old_buf, new_buf; u32 chid, tsgid; struct channel_gk20a *ch = NULL; @@ -2194,11 +2235,13 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, old_buf = runlist->cur_buffer; new_buf = !runlist->cur_buffer; + runlist_iova = g->ops.mm.get_iova_addr( + g, runlist->mem[new_buf].sgt->sgl, 0); + gk20a_dbg_info("runlist_id : %d, switch to new buffer 0x%16llx", - runlist_id, (u64)gk20a_mem_phys(&runlist->mem[new_buf])); + runlist_id, (u64)runlist_iova); - runlist_pa = gk20a_mem_phys(&runlist->mem[new_buf]); - if (!runlist_pa) { + if (!runlist_iova) { ret = -EINVAL; goto clean_up; } @@ -2213,25 +2256,52 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, add /* resume to add all channels back */) { runlist_entry = runlist_entry_base; - /* add non-TSG channels first */ + /* Runlist manipulation: + Insert an entry of all high priority channels inbetween + all lower priority channels. This ensure that the maximum + delay a runnable high priority channel has to wait is one + medium timeslice + any context switching overhead + + wait on other high priority channels. + add non-TSG channels first */ for_each_set_bit(chid, runlist->active_channels, f->num_channels) { ch = &f->channel[chid]; - if (!gk20a_is_channel_marked_as_tsg(ch)) { - gk20a_dbg_info("add channel %d to runlist", + if (!gk20a_is_channel_marked_as_tsg(ch) && + !ch->interleave) { + u32 added; + + gk20a_dbg_info("add normal prio channel %d to runlist", chid); runlist_entry[0] = ram_rl_entry_chid_f(chid); runlist_entry[1] = 0; runlist_entry += 2; count++; + + added = gk20a_fifo_runlist_add_high_prio_entries( + f, + runlist, + runlist_entry); + count += added; + runlist_entry += 2 * added; } } + /* if there were no lower priority channels, then just + * add the high priority channels once. 
*/ + if (count == 0) { + count = gk20a_fifo_runlist_add_high_prio_entries( + f, + runlist, + runlist_entry); + runlist_entry += 2 * count; + } + /* now add TSG entries and channels bound to TSG */ mutex_lock(&f->tsg_inuse_mutex); for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) { + u32 added; tsg = &f->tsg[tsgid]; /* add TSG entry */ gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid); @@ -2260,6 +2330,13 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, WARN_ON(tsg->num_active_channels != count_channels_in_tsg); + + added = gk20a_fifo_runlist_add_high_prio_entries( + f, + runlist, + runlist_entry); + count += added; + runlist_entry += 2 * added; } mutex_unlock(&f->tsg_inuse_mutex); } else /* suspend to remove all channels */ @@ -2267,7 +2344,7 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, if (count != 0) { gk20a_writel(g, fifo_runlist_base_r(), - fifo_runlist_base_ptr_f(u64_lo32(runlist_pa >> 12)) | + fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12)) | fifo_runlist_base_target_vid_mem_f()); } @@ -2416,6 +2493,42 @@ u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g) return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f(); } +int gk20a_fifo_set_channel_priority( + struct gk20a *g, + u32 runlist_id, + u32 hw_chid, + bool interleave) +{ + struct fifo_runlist_info_gk20a *runlist = NULL; + struct fifo_gk20a *f = &g->fifo; + struct channel_gk20a *ch = NULL; + + if (hw_chid >= f->num_channels) + return -EINVAL; + + if (runlist_id >= f->max_runlists) + return -EINVAL; + + ch = &f->channel[hw_chid]; + + gk20a_dbg_fn(""); + + runlist = &f->runlist_info[runlist_id]; + + mutex_lock(&runlist->mutex); + + if (ch->interleave) + set_bit(hw_chid, runlist->high_prio_channels); + else + clear_bit(hw_chid, runlist->high_prio_channels); + + gk20a_dbg_fn("done"); + + mutex_unlock(&runlist->mutex); + + return 0; +} + void gk20a_init_fifo(struct gpu_ops *gops) { gk20a_init_channel(gops); diff 
--git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 1b47677b..6ba4153b 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -31,6 +31,7 @@ struct fifo_runlist_info_gk20a { unsigned long *active_channels; unsigned long *active_tsgs; + unsigned long *high_prio_channels; /* Each engine has its own SW and HW runlist buffer.*/ struct mem_desc mem[MAX_RUNLIST_BUFFERS]; u32 cur_buffer; @@ -91,6 +92,7 @@ struct fifo_engine_info_gk20a { struct fifo_gk20a { struct gk20a *g; int num_channels; + int num_runlist_entries; int num_pbdma; u32 *pbdma_map; @@ -182,6 +184,8 @@ void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g, int gk20a_fifo_wait_engine_idle(struct gk20a *g); u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g); u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g); +int gk20a_fifo_set_channel_priority(struct gk20a *g, u32 runlist_id, + u32 hw_chid, bool interleave); u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, int *__id, bool *__is_tsg); bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 9bbc9bd8..c5124c51 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -670,6 +670,9 @@ static int gk20a_init_support(struct platform_device *dev) mutex_init(&g->client_lock); mutex_init(&g->ch_wdt_lock); + mutex_init(&g->interleave_lock); + g->num_interleaved_channels = 0; + g->remove_support = gk20a_remove_support; return 0; @@ -1437,9 +1440,14 @@ static int gk20a_probe(struct platform_device *dev) if (tegra_platform_is_silicon()) gk20a->timeouts_enabled = true; + gk20a->interleave_high_priority = true; + gk20a->timeslice_low_priority_us = 1300; gk20a->timeslice_medium_priority_us = 2600; - gk20a->timeslice_high_priority_us = 5200; + if (gk20a->interleave_high_priority) + gk20a->timeslice_high_priority_us = 3000; + else + gk20a->timeslice_high_priority_us = 5200; 
/* Set up initial power settings. For non-slicon platforms, disable * * power features and for silicon platforms, read from platform data */ @@ -1512,6 +1520,12 @@ static int gk20a_probe(struct platform_device *dev) platform->debugfs, &gk20a->timeslice_high_priority_us); + gk20a->debugfs_interleave_high_priority = + debugfs_create_bool("interleave_high_priority", + S_IRUGO|S_IWUSR, + platform->debugfs, + &gk20a->interleave_high_priority); + gr_gk20a_debugfs_init(gk20a); gk20a_pmu_debugfs_init(dev); gk20a_cde_debugfs_init(dev); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index f7b98e39..da115fa8 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -54,6 +54,8 @@ struct acr_gm20b; 32 ns is the resolution of ptimer. */ #define PTIMER_REF_FREQ_HZ 31250000 +#define MAX_INTERLEAVED_CHANNELS 32 + struct cooling_device_gk20a { struct thermal_cooling_device *gk20a_cooling_dev; unsigned int gk20a_freq_state; @@ -512,6 +514,10 @@ struct gk20a { u32 timeslice_low_priority_us; u32 timeslice_medium_priority_us; u32 timeslice_high_priority_us; + u32 interleave_high_priority; + + struct mutex interleave_lock; + u32 num_interleaved_channels; bool slcg_enabled; bool blcg_enabled; @@ -533,9 +539,11 @@ struct gk20a { struct dentry *debugfs_disable_bigpage; struct dentry *debugfs_gr_default_attrib_cb_size; - struct dentry * debugfs_timeslice_low_priority_us; - struct dentry * debugfs_timeslice_medium_priority_us; - struct dentry * debugfs_timeslice_high_priority_us; + struct dentry *debugfs_timeslice_low_priority_us; + struct dentry *debugfs_timeslice_medium_priority_us; + struct dentry *debugfs_timeslice_high_priority_us; + struct dentry *debugfs_interleave_high_priority; + #endif struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; diff --git a/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h index a131972e..99d92782 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h +++ 
b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h @@ -110,6 +110,10 @@ static inline u32 fifo_eng_runlist_length_f(u32 v) { return (v & 0xffff) << 0; } +static inline u32 fifo_eng_runlist_length_max_v(void) +{ + return 0x0000ffff; +} static inline u32 fifo_eng_runlist_pending_true_f(void) { return 0x100000; -- cgit v1.2.2