From 733fb79b39869665addcd80ccdf1c15f4a5aaa29 Mon Sep 17 00:00:00 2001
From: Sachit Kadle
Date: Mon, 15 Aug 2016 14:32:39 -0700
Subject: gpu: nvgpu: add support for pre-allocated resources

Add support for pre-allocation of job tracking resources
with a new (extended) ioctl. The goal is to avoid dynamic
memory allocation in the submit path. This patch does the
following:

1) Introduces a new ioctl, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX,
   which enables pre-allocation of tracking resources per job:
      a) 2x priv_cmd_entry
      b) 2x gk20a_fence

2) Implements a circular ring buffer for job tracking to avoid
   lock contention between producer (submitter) and consumer
   (clean-up)

Bug 1795076

Change-Id: I6b52e5c575871107ff380f9a5790f440a6969347
Signed-off-by: Sachit Kadle
Reviewed-on: http://git-master/r/1203300
(cherry picked from commit 9fd270c22b860935dffe244753dabd87454bef39)
Reviewed-on: http://git-master/r/1223934
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c     |   8 +-
 drivers/gpu/nvgpu/gk20a/ce2_gk20a.c     |   8 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 390 ++++++++++++++++++++++++++++----
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |  28 ++-
 drivers/gpu/nvgpu/gk20a/fence_gk20a.c   |  70 +++++-
 drivers/gpu/nvgpu/gk20a/fence_gk20a.h   |  15 +-
 6 files changed, 454 insertions(+), 65 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ca785b19..17453489 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1126,9 +1126,9 @@ __releases(&cde_app->mutex)
 	struct gk20a_cde_app *cde_app = &g->cde_app;
 	bool channel_idle;
 
-	spin_lock(&ch->jobs_lock);
-	channel_idle = list_empty(&ch->jobs);
-	spin_unlock(&ch->jobs_lock);
+	channel_gk20a_joblist_lock(ch);
+	channel_idle = channel_gk20a_joblist_is_empty(ch);
+	channel_gk20a_joblist_unlock(ch);
 
 	if (!channel_idle)
 		return;
@@ -1207,7 +1207,7 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 
 	/* allocate gpfifo (1024 should be more than enough) */
 	err = gk20a_alloc_channel_gpfifo(ch,
-			&(struct nvgpu_alloc_gpfifo_args){1024, 0});
+			&(struct nvgpu_alloc_gpfifo_ex_args){1024, 0, 0, {}});
 	if (err) {
 		gk20a_warn(cde_ctx->dev, "cde: unable to allocate gpfifo");
 		goto err_alloc_gpfifo;
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 109ec240..bfd183fb 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -126,9 +126,9 @@ static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
 	bool channel_idle;
 	u32 event;
 
-	spin_lock(&ch->jobs_lock);
-	channel_idle = list_empty(&ch->jobs);
-	spin_unlock(&ch->jobs_lock);
+	channel_gk20a_joblist_lock(ch);
+	channel_idle = channel_gk20a_joblist_is_empty(ch);
+	channel_gk20a_joblist_unlock(ch);
 
 	if (!channel_idle)
 		return;
@@ -462,7 +462,7 @@ u32 gk20a_ce_create_context_with_cb(struct device *dev,
 
 	/* allocate gpfifo (1024 should be more than enough) */
 	err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
-			&(struct nvgpu_alloc_gpfifo_args){1024, 0});
+			&(struct nvgpu_alloc_gpfifo_ex_args){1024, 0, 0, {}});
 	if (err) {
 		gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
 		goto end;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 4019721a..cc3bbbd2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <linux/circ_buf.h>
 
 #include "debug_gk20a.h"
 #include "ctxsw_trace_gk20a.h"
@@ -55,6 +56,15 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
 static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
 static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
 
+static void channel_gk20a_free_prealloc_resources(struct channel_gk20a *c);
+
+static void channel_gk20a_joblist_add(struct channel_gk20a *c,
+		struct channel_gk20a_job *job);
+static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
+		struct channel_gk20a_job *job);
+static struct channel_gk20a_job *channel_gk20a_joblist_peek(
+		struct channel_gk20a *c);
+
 static int channel_gk20a_commit_userd(struct channel_gk20a *c);
 static int channel_gk20a_setup_userd(struct channel_gk20a *c);
@@ -460,6 +470,7 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
 {
 	struct channel_gk20a_job *job, *n;
 	bool released_job_semaphore = false;
+	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(ch);
 
 	gk20a_channel_cancel_job_clean_up(ch, true);
 
@@ -471,14 +482,37 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
 
 	/* release all job semaphores (applies only to jobs that use
 	   semaphore synchronization) */
-	spin_lock(&ch->jobs_lock);
-	list_for_each_entry_safe(job, n, &ch->jobs, list) {
-		if (job->post_fence->semaphore) {
-			gk20a_semaphore_release(job->post_fence->semaphore);
-			released_job_semaphore = true;
+	channel_gk20a_joblist_lock(ch);
+	if (pre_alloc_enabled) {
+		int tmp_get = ch->joblist.pre_alloc.get;
+		int put = ch->joblist.pre_alloc.put;
+
+		/*
+		 * ensure put is read before any subsequent reads.
+		 * see corresponding wmb in gk20a_channel_add_job()
+		 */
+		rmb();
+
+		while (tmp_get != put) {
+			job = &ch->joblist.pre_alloc.jobs[tmp_get];
+			if (job->post_fence->semaphore) {
+				gk20a_semaphore_release(
+					job->post_fence->semaphore);
+				released_job_semaphore = true;
+			}
+			tmp_get = (tmp_get + 1) % ch->joblist.pre_alloc.length;
+		}
+	} else {
+		list_for_each_entry_safe(job, n,
+				&ch->joblist.dynamic.jobs, list) {
+			if (job->post_fence->semaphore) {
+				gk20a_semaphore_release(
+					job->post_fence->semaphore);
+				released_job_semaphore = true;
+			}
 		}
 	}
-	spin_unlock(&ch->jobs_lock);
+	channel_gk20a_joblist_unlock(ch);
 
 	if (released_job_semaphore)
 		wake_up_interruptible_all(&ch->semaphore_wq);
@@ -511,9 +545,9 @@ int gk20a_wait_channel_idle(struct channel_gk20a *ch)
 		msecs_to_jiffies(gk20a_get_gr_idle_timeout(ch->g));
 
 	do {
-		spin_lock(&ch->jobs_lock);
-		channel_idle = list_empty(&ch->jobs);
-		spin_unlock(&ch->jobs_lock);
+		channel_gk20a_joblist_lock(ch);
+		channel_idle = channel_gk20a_joblist_is_empty(ch);
+		channel_gk20a_joblist_unlock(ch);
 		if (channel_idle)
 			break;
 
@@ -1016,6 +1050,10 @@ unbind:
 
 	mutex_unlock(&g->dbg_sessions_lock);
 
+	/* free pre-allocated resources, if applicable */
+	if (channel_gk20a_is_prealloc_enabled(ch))
+		channel_gk20a_free_prealloc_resources(ch);
+
 	/* make sure we catch accesses of unopened channels in case
 	 * there's non-refcounted channel pointers hanging around */
 	ch->g = NULL;
@@ -1422,7 +1460,10 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
 	/* we already handled q->put + size > q->size so BUG_ON this */
 	BUG_ON(q->put > q->size);
 
-	/* commit the previous writes before making the entry valid */
+	/*
+	 * commit the previous writes before making the entry valid.
+	 * see the corresponding rmb() in gk20a_free_priv_cmdbuf().
+	 */
 	wmb();
 
 	e->valid = true;
@@ -1436,26 +1477,222 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
 static void free_priv_cmdbuf(struct channel_gk20a *c,
 			     struct priv_cmd_entry *e)
 {
-	kfree(e);
+	if (channel_gk20a_is_prealloc_enabled(c))
+		memset(e, 0, sizeof(struct priv_cmd_entry));
+	else
+		kfree(e);
+}
+
+static int channel_gk20a_alloc_job(struct channel_gk20a *c,
+		struct channel_gk20a_job **job_out)
+{
+	int err = 0;
+
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		int put = c->joblist.pre_alloc.put;
+		int get = c->joblist.pre_alloc.get;
+
+		/*
+		 * ensure all subsequent reads happen after reading get.
+		 * see corresponding wmb in gk20a_channel_clean_up_jobs()
+		 */
+		rmb();
+
+		if (CIRC_SPACE(put, get, c->joblist.pre_alloc.length))
+			*job_out = &c->joblist.pre_alloc.jobs[put];
+		else {
+			gk20a_warn(dev_from_gk20a(c->g),
+					"out of job ringbuffer space\n");
+			err = -EAGAIN;
+		}
+	} else {
+		*job_out = kzalloc(sizeof(struct channel_gk20a_job),
+				GFP_KERNEL);
+		if (!*job_out)
+			err = -ENOMEM;
+	}
+
+	return err;
+}
+
+static void channel_gk20a_free_job(struct channel_gk20a *c,
+		struct channel_gk20a_job *job)
+{
+	/*
+	 * In case of pre-allocated jobs, we need to clean out
+	 * the job but maintain the pointers to the priv_cmd_entry,
+	 * since they're inherently tied to the job node.
+	 */
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		struct priv_cmd_entry *wait_cmd = job->wait_cmd;
+		struct priv_cmd_entry *incr_cmd = job->incr_cmd;
+		memset(job, 0, sizeof(*job));
+		job->wait_cmd = wait_cmd;
+		job->incr_cmd = incr_cmd;
+	} else
+		kfree(job);
+}
+
+void channel_gk20a_joblist_lock(struct channel_gk20a *c)
+{
+	if (channel_gk20a_is_prealloc_enabled(c))
+		mutex_lock(&c->joblist.pre_alloc.read_lock);
+	else
+		spin_lock(&c->joblist.dynamic.lock);
 }
 
-static struct channel_gk20a_job *channel_gk20a_alloc_job(
+void channel_gk20a_joblist_unlock(struct channel_gk20a *c)
+{
+	if (channel_gk20a_is_prealloc_enabled(c))
+		mutex_unlock(&c->joblist.pre_alloc.read_lock);
+	else
+		spin_unlock(&c->joblist.dynamic.lock);
+}
+
+static struct channel_gk20a_job *channel_gk20a_joblist_peek(
 		struct channel_gk20a *c)
 {
+	int get;
 	struct channel_gk20a_job *job = NULL;
 
-	job = kzalloc(sizeof(*job), GFP_KERNEL);
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		if (!channel_gk20a_joblist_is_empty(c)) {
+			get = c->joblist.pre_alloc.get;
+			job = &c->joblist.pre_alloc.jobs[get];
+		}
+	} else {
+		if (!list_empty(&c->joblist.dynamic.jobs))
+			job = list_first_entry(&c->joblist.dynamic.jobs,
+					struct channel_gk20a_job, list);
+	}
+
 	return job;
 }
 
-static void channel_gk20a_free_job(struct channel_gk20a *c,
+static void channel_gk20a_joblist_add(struct channel_gk20a *c,
 		struct channel_gk20a_job *job)
 {
-	kfree(job);
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		c->joblist.pre_alloc.put = (c->joblist.pre_alloc.put + 1) %
+				(c->joblist.pre_alloc.length);
+	} else {
+		list_add_tail(&job->list, &c->joblist.dynamic.jobs);
+	}
+}
+
+static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
+		struct channel_gk20a_job *job)
+{
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		c->joblist.pre_alloc.get = (c->joblist.pre_alloc.get + 1) %
+				(c->joblist.pre_alloc.length);
+	} else {
+		list_del_init(&job->list);
+	}
+}
+
+bool channel_gk20a_joblist_is_empty(struct channel_gk20a *c)
+{
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		int get = c->joblist.pre_alloc.get;
+		int put = c->joblist.pre_alloc.put;
+		return !(CIRC_CNT(put, get, c->joblist.pre_alloc.length));
+	}
+
+	return list_empty(&c->joblist.dynamic.jobs);
+}
+
+bool channel_gk20a_is_prealloc_enabled(struct channel_gk20a *c)
+{
+	bool pre_alloc_enabled = c->joblist.pre_alloc.enabled;
+
+	rmb();
+	return pre_alloc_enabled;
+}
+
+static int channel_gk20a_prealloc_resources(struct channel_gk20a *c,
+		unsigned int num_jobs)
+{
+	int i, err;
+	size_t size;
+	struct priv_cmd_entry *entries = NULL;
+
+	if (channel_gk20a_is_prealloc_enabled(c) || !num_jobs)
+		return -EINVAL;
+
+	/*
+	 * pre-allocate the job list.
+	 * since vmalloc takes an unsigned long, we need
+	 * to make sure we don't hit an overflow condition
+	 */
+	size = sizeof(struct channel_gk20a_job);
+	if (num_jobs <= ULONG_MAX / size)
+		c->joblist.pre_alloc.jobs = vzalloc(num_jobs * size);
+	if (!c->joblist.pre_alloc.jobs) {
+		err = -ENOMEM;
+		goto clean_up;
+	}
+
+	/*
+	 * pre-allocate 2x priv_cmd_entry for each job up front.
+	 * since vmalloc takes an unsigned long, we need
+	 * to make sure we don't hit an overflow condition
+	 */
+	size = sizeof(struct priv_cmd_entry);
+	if (num_jobs <= ULONG_MAX / (size << 1))
+		entries = vzalloc((num_jobs << 1) * size);
+	if (!entries) {
+		err = -ENOMEM;
+		goto clean_up_joblist;
+	}
+
+	for (i = 0; i < num_jobs; i++) {
+		c->joblist.pre_alloc.jobs[i].wait_cmd = &entries[i];
+		c->joblist.pre_alloc.jobs[i].incr_cmd =
+			&entries[i + num_jobs];
+	}
+
+	/* pre-allocate a fence pool */
+	err = gk20a_alloc_fence_pool(c, num_jobs);
+	if (err)
+		goto clean_up_priv_cmd;
+
+	c->joblist.pre_alloc.length = num_jobs;
+
+	/*
+	 * commit the previous writes before setting the flag.
+	 * see corresponding rmb in channel_gk20a_is_prealloc_enabled()
+	 */
+	wmb();
+	c->joblist.pre_alloc.enabled = true;
+
+	return 0;
+
+clean_up_priv_cmd:
+	vfree(entries);
+clean_up_joblist:
+	vfree(c->joblist.pre_alloc.jobs);
+clean_up:
+	memset(&c->joblist.pre_alloc, 0, sizeof(c->joblist.pre_alloc));
+	return err;
+}
+
+static void channel_gk20a_free_prealloc_resources(struct channel_gk20a *c)
+{
+	vfree(c->joblist.pre_alloc.jobs[0].wait_cmd);
+	vfree(c->joblist.pre_alloc.jobs);
+	gk20a_free_fence_pool(c);
+
+	/*
+	 * commit the previous writes before disabling the flag.
+	 * see corresponding rmb in channel_gk20a_is_prealloc_enabled()
+	 */
+	wmb();
+	c->joblist.pre_alloc.enabled = false;
 }
 
 int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
-		struct nvgpu_alloc_gpfifo_args *args)
+		struct nvgpu_alloc_gpfifo_ex_args *args)
 {
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
@@ -1539,19 +1776,30 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 
 	/* TBD: setup engine contexts */
 
+	if (args->num_inflight_jobs) {
+		err = channel_gk20a_prealloc_resources(c,
+				args->num_inflight_jobs);
+		if (err)
+			goto clean_up_sync;
+	}
+
 	err = channel_gk20a_alloc_priv_cmdbuf(c);
 	if (err)
-		goto clean_up_sync;
+		goto clean_up_prealloc;
 
 	err = channel_gk20a_update_runlist(c, true);
 	if (err)
-		goto clean_up_sync;
+		goto clean_up_priv_cmd;
 
 	g->ops.fifo.bind_channel(c);
 
 	gk20a_dbg_fn("done");
 	return 0;
 
+clean_up_priv_cmd:
+	channel_gk20a_free_priv_cmdbuf(c);
+clean_up_prealloc:
+	if (args->num_inflight_jobs)
+		channel_gk20a_free_prealloc_resources(c);
 clean_up_sync:
 	gk20a_channel_sync_destroy(c->sync);
 	c->sync = NULL;
@@ -1878,6 +2126,7 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 	struct vm_gk20a *vm = c->vm;
 	struct mapped_buffer_node **mapped_buffers = NULL;
 	int err = 0, num_mapped_buffers = 0;
+	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
 
 	/* job needs reference to this vm (released in channel_update) */
 	gk20a_vm_get(vm);
@@ -1898,9 +2147,19 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 
 		gk20a_channel_timeout_start(c, job);
 
-		spin_lock(&c->jobs_lock);
-		list_add_tail(&job->list, &c->jobs);
-		spin_unlock(&c->jobs_lock);
+		if (!pre_alloc_enabled)
+			channel_gk20a_joblist_lock(c);
+
+		/*
+		 * ensure all pending writes complete before adding to the list.
+		 * see corresponding rmb in gk20a_channel_clean_up_jobs() &
+		 * gk20a_channel_abort_clean_up()
+		 */
+		wmb();
+		channel_gk20a_joblist_add(c, job);
+
+		if (!pre_alloc_enabled)
+			channel_gk20a_joblist_unlock(c);
 	} else {
 		err = -ETIMEDOUT;
 		goto err_put_buffers;
@@ -1945,14 +2204,20 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
 	while (1) {
 		bool completed;
 
-		spin_lock(&c->jobs_lock);
-		if (list_empty(&c->jobs)) {
-			spin_unlock(&c->jobs_lock);
+		channel_gk20a_joblist_lock(c);
+		if (channel_gk20a_joblist_is_empty(c)) {
+			channel_gk20a_joblist_unlock(c);
 			break;
 		}
-		job = list_first_entry(&c->jobs,
-				struct channel_gk20a_job, list);
-		spin_unlock(&c->jobs_lock);
+
+		/*
+		 * ensure that all subsequent reads occur after checking
+		 * that we have a valid node. see corresponding wmb in
+		 * gk20a_channel_add_job().
+		 */
+		rmb();
+		job = channel_gk20a_joblist_peek(c);
+		channel_gk20a_joblist_unlock(c);
 
 		completed = gk20a_fence_is_expired(job->post_fence);
 		if (!completed) {
@@ -1998,9 +2263,14 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
 		 * so this wouldn't get freed here. */
 		gk20a_channel_put(c);
 
-		spin_lock(&c->jobs_lock);
-		list_del_init(&job->list);
-		spin_unlock(&c->jobs_lock);
+		/*
+		 * ensure all pending writes complete before deleting the node.
+		 * see corresponding rmb in channel_gk20a_alloc_job().
+		 */
+		wmb();
+		channel_gk20a_joblist_lock(c);
+		channel_gk20a_joblist_delete(c, job);
+		channel_gk20a_joblist_unlock(c);
 
 		channel_gk20a_free_job(c, job);
 		job_finished = 1;
@@ -2160,6 +2430,7 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
 	int wait_fence_fd = -1;
 	int err = 0;
 	bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
+	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
 
 	/*
 	 * If user wants to always allocate sync_fence_fds then respect that;
@@ -2197,9 +2468,10 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
 	 * this condition.
 	 */
 	if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
-		job->wait_cmd = kzalloc(sizeof(struct priv_cmd_entry),
-					GFP_KERNEL);
 		job->pre_fence = gk20a_alloc_fence(c);
+		if (!pre_alloc_enabled)
+			job->wait_cmd = kzalloc(sizeof(struct priv_cmd_entry),
+						GFP_KERNEL);
 
 		if (!job->wait_cmd || !job->pre_fence) {
 			err = -ENOMEM;
@@ -2233,8 +2505,10 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
 	 * is used to keep track of method completion for idle railgating. The
 	 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
 	 */
-	job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
 	job->post_fence = gk20a_alloc_fence(c);
+	if (!pre_alloc_enabled)
+		job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry),
+					GFP_KERNEL);
 
 	if (!job->incr_cmd || !job->post_fence) {
 		err = -ENOMEM;
@@ -2256,15 +2530,17 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
 	return 0;
 
 clean_up_post_fence:
-	gk20a_free_priv_cmdbuf(c, job->incr_cmd);
 	gk20a_fence_put(job->post_fence);
-	job->incr_cmd = NULL;
 	job->post_fence = NULL;
+	free_priv_cmdbuf(c, job->incr_cmd);
+	if (!pre_alloc_enabled)
+		job->incr_cmd = NULL;
clean_up_pre_fence:
-	gk20a_free_priv_cmdbuf(c, job->wait_cmd);
 	gk20a_fence_put(job->pre_fence);
-	job->wait_cmd = NULL;
 	job->pre_fence = NULL;
+	free_priv_cmdbuf(c, job->wait_cmd);
+	if (!pre_alloc_enabled)
+		job->wait_cmd = NULL;
 	*wait_cmd = NULL;
 	*pre_fence = NULL;
 fail:
@@ -2388,11 +2664,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	}
 
 	if (need_job_tracking) {
-		job = channel_gk20a_alloc_job(c);
-		if (!job) {
-			err = -ENOMEM;
+		err = channel_gk20a_alloc_job(c, &job);
+		if (err)
 			goto clean_up;
-		}
 
 		err = gk20a_submit_prepare_syncs(c, fence, job,
 						 &wait_cmd, &incr_cmd,
@@ -2463,13 +2737,14 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	init_waitqueue_head(&c->ref_count_dec_wq);
 	mutex_init(&c->ioctl_lock);
 	mutex_init(&c->error_notifier_mutex);
-	spin_lock_init(&c->jobs_lock);
+	spin_lock_init(&c->joblist.dynamic.lock);
+	mutex_init(&c->joblist.pre_alloc.read_lock);
 	raw_spin_lock_init(&c->timeout.lock);
 	mutex_init(&c->sync_lock);
 	INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
 	INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_jobs);
 	mutex_init(&c->clean_up.lock);
-	INIT_LIST_HEAD(&c->jobs);
+	INIT_LIST_HEAD(&c->joblist.dynamic.jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
 	mutex_init(&c->cs_client_mutex);
@@ -3119,7 +3394,7 @@ long gk20a_channel_ioctl(struct file *filp,
 			(struct nvgpu_free_obj_ctx_args *)buf);
 		gk20a_idle(dev);
 		break;
-	case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO:
+	case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX:
 		err = gk20a_busy(dev);
 		if (err) {
 			dev_err(dev,
@@ -3128,9 +3403,34 @@ long gk20a_channel_ioctl(struct file *filp,
 			break;
 		}
 		err = gk20a_alloc_channel_gpfifo(ch,
-				(struct nvgpu_alloc_gpfifo_args *)buf);
+				(struct nvgpu_alloc_gpfifo_ex_args *)buf);
 		gk20a_idle(dev);
+		break;
+	case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO:
+	{
+		struct nvgpu_alloc_gpfifo_ex_args alloc_gpfifo_ex_args;
+		struct nvgpu_alloc_gpfifo_args *alloc_gpfifo_args =
+			(struct nvgpu_alloc_gpfifo_args *)buf;
+
+		err = gk20a_busy(dev);
+		if (err) {
+			dev_err(dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+
+		/* prepare new args structure */
+		memset(&alloc_gpfifo_ex_args, 0,
+			sizeof(struct nvgpu_alloc_gpfifo_ex_args));
+		alloc_gpfifo_ex_args.num_entries =
+			alloc_gpfifo_args->num_entries;
+		alloc_gpfifo_ex_args.flags = alloc_gpfifo_args->flags;
+
+		err = gk20a_alloc_channel_gpfifo(ch, &alloc_gpfifo_ex_args);
+		gk20a_idle(dev);
 		break;
+	}
 	case NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO:
 		err = gk20a_ioctl_channel_submit_gpfifo(ch,
 			(struct nvgpu_submit_gpfifo_args *)buf);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 0d8746b8..8cceb6b2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -70,6 +70,22 @@ struct channel_gk20a_job {
 	struct list_head list;
 };
 
+struct channel_gk20a_joblist {
+	struct {
+		bool enabled;
+		unsigned int length;
+		unsigned int put;
+		unsigned int get;
+		struct channel_gk20a_job *jobs;
+		struct mutex read_lock;
+	} pre_alloc;
+
+	struct {
+		struct list_head jobs;
+		spinlock_t lock;
+	} dynamic;
+};
+
 struct channel_gk20a_timeout {
 	struct delayed_work wq;
 	raw_spinlock_t lock;
@@ -115,6 +131,7 @@ struct channel_gk20a {
 	bool bound;
 	bool first_init;
 	bool vpr;
+	bool no_block;
 	bool cde;
 	pid_t pid;
 	pid_t tgid;
@@ -123,8 +140,8 @@ struct channel_gk20a {
 	int tsgid;
 	struct list_head ch_entry; /* channel's entry in TSG */
 
-	struct list_head jobs;
-	spinlock_t jobs_lock;
+	struct channel_gk20a_joblist joblist;
+	struct gk20a_allocator fence_allocator;
 
 	struct vm_gk20a *vm;
@@ -272,7 +289,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				bool force_need_sync_fence);
 
 int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
-			       struct nvgpu_alloc_gpfifo_args *args);
+			       struct nvgpu_alloc_gpfifo_ex_args *args);
 
 void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
 void channel_gk20a_disable(struct channel_gk20a *ch);
@@ -284,6 +301,11 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
 void channel_gk20a_enable(struct channel_gk20a *ch);
 void gk20a_channel_timeout_restart_all_channels(struct gk20a *g);
 
+bool channel_gk20a_is_prealloc_enabled(struct channel_gk20a *c);
+void channel_gk20a_joblist_lock(struct channel_gk20a *c);
+void channel_gk20a_joblist_unlock(struct channel_gk20a *c);
+bool channel_gk20a_joblist_is_empty(struct channel_gk20a *c);
+
 int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 		int timeslice_period,
 		int *__timeslice_timeout, int *__timeslice_scale);
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
index f788829f..c11d363e 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -47,7 +47,12 @@ static void gk20a_fence_free(struct kref *ref)
 #endif
 	if (f->semaphore)
 		gk20a_semaphore_put(f->semaphore);
-	kfree(f);
+
+	if (f->allocator) {
+		if (gk20a_alloc_initialized(f->allocator))
+			gk20a_free(f->allocator, (u64)f);
+	} else
+		kfree(f);
 }
 
 void gk20a_fence_put(struct gk20a_fence *f)
@@ -109,15 +114,66 @@ int gk20a_fence_install_fd(struct gk20a_fence *f)
 #endif
 }
 
-struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c)
+int gk20a_alloc_fence_pool(struct channel_gk20a *c, int count)
+{
+	int err;
+	size_t size;
+	struct gk20a_fence *fence_pool = NULL;
+
+	size = sizeof(struct gk20a_fence);
+	if (count <= ULONG_MAX / size) {
+		size = count * size;
+		fence_pool = vzalloc(size);
+	}
+
+	if (!fence_pool)
+		return -ENOMEM;
+
+	err = gk20a_lockless_allocator_init(&c->fence_allocator,
+				"fence_pool", (u64)fence_pool, size,
+				sizeof(struct gk20a_fence), 0);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	vfree(fence_pool);
+	return err;
+}
+
+void gk20a_free_fence_pool(struct channel_gk20a *c)
 {
-	struct gk20a_fence *fence;
+	if (gk20a_alloc_initialized(&c->fence_allocator)) {
+		void *base = (void *)gk20a_alloc_base(&c->fence_allocator);
+
+		gk20a_alloc_destroy(&c->fence_allocator);
+		vfree(base);
+	}
+}
 
-	fence = kzalloc(sizeof(struct gk20a_fence), GFP_KERNEL);
-	if (!fence)
-		return NULL;
+struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c)
+{
+	struct gk20a_fence *fence = NULL;
+
+	if (channel_gk20a_is_prealloc_enabled(c)) {
+		if (gk20a_alloc_initialized(&c->fence_allocator)) {
+			fence = (struct gk20a_fence *)
+				gk20a_alloc(&c->fence_allocator,
+					sizeof(struct gk20a_fence));
+
+			/* clear the node and reset the allocator pointer */
+			if (fence) {
+				memset(fence, 0, sizeof(*fence));
+				fence->allocator = &c->fence_allocator;
+			}
+		}
+	} else
+		fence = kzalloc(sizeof(struct gk20a_fence), GFP_KERNEL);
+
+	if (fence)
+		kref_init(&fence->ref);
 
-	kref_init(&fence->ref);
 	return fence;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
index 3fe2d8b2..97a7d957 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
@@ -3,7 +3,7 @@
  *
  * GK20A Fences
  *
- * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -45,6 +45,9 @@ struct gk20a_fence {
 	struct platform_device *host1x_pdev;
 	u32 syncpt_id;
 	u32 syncpt_value;
+
+	/* Valid for fences part of a pre-allocated fence pool */
+	struct gk20a_allocator *allocator;
 };
 
 /* Fences can be created from semaphores or syncpoint (id, value) pairs */
@@ -62,7 +65,15 @@ int gk20a_fence_from_syncpt(
 		u32 id, u32 value, bool wfi,
 		bool need_sync_fence);
 
-struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c);
+int gk20a_alloc_fence_pool(
+		struct channel_gk20a *c,
+		int size);
+
+void gk20a_free_fence_pool(
+		struct channel_gk20a *c);
+
+struct gk20a_fence *gk20a_alloc_fence(
+		struct channel_gk20a *c);
 
 void gk20a_init_fence(struct gk20a_fence *f,
 		const struct gk20a_fence_ops *ops,
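
Usage note: a userspace client opts in to pre-allocation by issuing the new
ioctl in place of the legacy one, with num_inflight_jobs non-zero. The sketch
below is illustrative only and is not part of the patch: it assumes the nvgpu
uapi header is available as <linux/nvgpu.h>, touches only the fields this
patch actually references (num_entries, flags, num_inflight_jobs), and trims
error handling.

/* sketch: enable pre-allocated job tracking on an open channel fd */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed location of the nvgpu uapi */

static int channel_alloc_gpfifo_prealloc(int ch_fd)
{
	struct nvgpu_alloc_gpfifo_ex_args args;

	memset(&args, 0, sizeof(args));
	args.num_entries = 1024;	/* same sizing the CDE/CE paths use */
	args.num_inflight_jobs = 128;	/* 0 keeps dynamic job allocation */

	/* the driver pre-allocates 2x priv_cmd_entry and a fence pool
	 * for the requested number of in-flight jobs */
	return ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX, &args);
}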
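Design note: the job ring avoids lock contention because it is strictly
single-producer/single-consumer -- only the submit path advances put and only
the clean-up worker advances get. In pre-allocated mode the submit path takes
no joblist lock at all; pre_alloc.read_lock only serializes readers such as
the clean-up worker and the abort path, while the wmb()/rmb() pairs publish a
slot's contents before its index moves. CIRC_CNT()/CIRC_SPACE() from
<linux/circ_buf.h> express the occupancy checks (note the kernel documents
those macros for power-of-two buffer sizes). The same ordering discipline can
be sketched in portable C11, with release/acquire standing in for the kernel
barriers; all names below are illustrative, not part of the driver:

/* minimal single-producer/single-consumer ring, a C11 stand-in for the
 * joblist: put is written only by the producer, get only by the consumer,
 * and release/acquire orders slot contents against the index updates */
#include <stdatomic.h>
#include <stdbool.h>

#define RING_LEN 16	/* one slot is always left empty */

struct job { int payload; };

struct joblist {
	struct job jobs[RING_LEN];
	atomic_uint put;	/* producer-owned */
	atomic_uint get;	/* consumer-owned */
};

/* producer: mirrors channel_gk20a_alloc_job() + joblist_add() */
static bool joblist_add(struct joblist *jl, struct job j)
{
	unsigned int put = atomic_load_explicit(&jl->put,
						memory_order_relaxed);
	unsigned int get = atomic_load_explicit(&jl->get,
						memory_order_acquire);

	if ((put + 1) % RING_LEN == get)
		return false;	/* ring full, like the -EAGAIN path */
	jl->jobs[put] = j;
	/* release pairs with the consumer's acquire (the wmb/rmb pair) */
	atomic_store_explicit(&jl->put, (put + 1) % RING_LEN,
			      memory_order_release);
	return true;
}

/* consumer: mirrors joblist_peek() + joblist_delete() */
static bool joblist_reap(struct joblist *jl, struct job *out)
{
	unsigned int get = atomic_load_explicit(&jl->get,
						memory_order_relaxed);
	unsigned int put = atomic_load_explicit(&jl->put,
						memory_order_acquire);

	if (get == put)
		return false;	/* empty, like joblist_is_empty() */
	*out = jl->jobs[get];
	atomic_store_explicit(&jl->get, (get + 1) % RING_LEN,
			      memory_order_release);
	return true;
}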