From 6085c90f499c642bc41a646b0efbdfe60e096c74 Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Fri, 6 Mar 2015 16:33:43 +0200
Subject: gpu: nvgpu: add per-channel refcounting

Add reference counting for channels, and wait for the reference count to
reach 0 in gk20a_free_channel() before actually freeing the channel. Also,
change free-channel tracking to use a list of free channels, which
simplifies finding an available channel now that reference counting is
involved.

Each use of a channel requires a reference, taken before use or already
held by the caller. Taking a reference to a wild channel pointer may fail
if the channel is not open or is in the process of being closed. Also, add
a safeguard against accidental use of closed channels by setting
ch->g = NULL when the channel is freed; this makes it obvious when a freed
channel is used.

The last user of a channel might be the deferred interrupt handler, so
wait for deferred interrupts to be processed twice in the channel free
procedure: once to deliver the last notifications to the channel, and once
to make sure no stale pointers remain after new references to the channel
have been denied.

Finally, fix some races in the channel and TSG force-reset IOCTL path by
pausing the channel scheduler in gk20a_fifo_recover_ch() and
gk20a_fifo_recover_tsg() while the affected engines are identified, the
appropriate MMU faults triggered, and the MMU faults handled. In this
case, make sure that the MMU fault handler does not query the hardware for
the failing channel or TSG ids. This should also make channel recovery
safer in the regular (i.e., non-interrupt-handler) context.

Bug 1530226
Bug 1597493
Bug 1625901
Bug 200076344
Bug 200071810

Change-Id: Ib274876908e18219c64ea41e50ca443df81d957b
Signed-off-by: Terje Bergstrom
Signed-off-by: Konsta Holtta
Signed-off-by: Sami Kiminki
Reviewed-on: http://git-master/r/448463
(cherry picked from commit 3f03aeae64ef2af4829e06f5f63062e8ebd21353)
Reviewed-on: http://git-master/r/755147
Reviewed-by: Automatic_Commit_Validation_User
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c          |   4 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c      | 302 ++++++++++++++++++++++-----
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h      |  32 ++-
 drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c |  58 ++++-
 drivers/gpu/nvgpu/gk20a/debug_gk20a.c        |  34 ++-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c         | 247 ++++++++++++++++------
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h         |  15 +-
 drivers/gpu/nvgpu/gk20a/gk20a.c              |   3 +
 drivers/gpu/nvgpu/gk20a/gk20a.h              |   9 +
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c           |  93 ++++++---
 drivers/gpu/nvgpu/gk20a/mc_gk20a.c           |  18 ++
 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c           |   4 +-
 include/trace/events/gk20a.h                 |  50 ++++-
 13 files changed, 681 insertions(+), 188 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 4a3076b5..b4fdfb44 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * Color decompression engine support
  *
- * Copyright (c) 2014, NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA Corporation. All rights reserved.
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -74,7 +74,7 @@ __must_hold(&cde_app->mutex) trace_gk20a_cde_remove_ctx(cde_ctx); /* free the channel */ - gk20a_free_channel(cde_ctx->ch, true); + gk20a_channel_close(ch); /* ..then release mapped memory */ gk20a_deinit_cde_img(cde_ctx); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index c12f196d..5a71e874 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -42,8 +42,8 @@ #define NVMAP_HANDLE_PARAM_SIZE 1 -static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f); -static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c); +static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f); +static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c); static void free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e); @@ -61,29 +61,33 @@ static int channel_gk20a_update_runlist(struct channel_gk20a *c, bool add); static void gk20a_free_error_notifiers(struct channel_gk20a *ch); -static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f) +/* allocate GPU channel */ +static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) { struct channel_gk20a *ch = NULL; - int chid; - mutex_lock(&f->ch_inuse_mutex); - for (chid = 0; chid < f->num_channels; chid++) { - if (!f->channel[chid].in_use) { - f->channel[chid].in_use = true; - ch = &f->channel[chid]; - break; - } + mutex_lock(&f->free_chs_mutex); + if (!list_empty(&f->free_chs)) { + ch = list_first_entry(&f->free_chs, struct channel_gk20a, + free_chs); + list_del(&ch->free_chs); + WARN_ON(atomic_read(&ch->ref_count)); + WARN_ON(ch->referenceable); } - mutex_unlock(&f->ch_inuse_mutex); + mutex_unlock(&f->free_chs_mutex); return ch; } -static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c) +static void free_channel(struct fifo_gk20a *f, + struct channel_gk20a *ch) { - mutex_lock(&f->ch_inuse_mutex); - f->channel[c->hw_chid].in_use = false; - mutex_unlock(&f->ch_inuse_mutex); + trace_gk20a_release_used_channel(ch->hw_chid); + /* refcount is zero here and channel is in a freed/dead state */ + mutex_lock(&f->free_chs_mutex); + /* add to head to increase visibility of timing-related bugs */ + list_add(&ch->free_chs, &f->free_chs); + mutex_unlock(&f->free_chs_mutex); } int channel_gk20a_commit_va(struct channel_gk20a *c) @@ -361,6 +365,11 @@ void gk20a_channel_abort(struct channel_gk20a *ch) struct channel_gk20a_job *job, *n; bool released_job_semaphore = false; + gk20a_dbg_fn(""); + + /* make sure new kickoffs are prevented */ + ch->has_timedout = true; + /* ensure no fences are pending */ mutex_lock(&ch->submit_lock); if (ch->sync) @@ -416,6 +425,8 @@ void gk20a_disable_channel(struct channel_gk20a *ch, bool finish, unsigned long finish_timeout) { + gk20a_dbg_fn(""); + if (finish) { int err = gk20a_channel_finish(ch, finish_timeout); WARN_ON(err); @@ -627,8 +638,9 @@ void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error) (u32)(nsec >> 32); ch->error_notifier->info32 = error; ch->error_notifier->status = 0xffff; + gk20a_err(dev_from_gk20a(ch->g), - "error notifier set to %d for ch %d\n", error, ch->hw_chid); + "error notifier set to %d for ch %d", error, ch->hw_chid); } } @@ -643,7 +655,53 @@ static void gk20a_free_error_notifiers(struct channel_gk20a *ch) } } -void 
gk20a_free_channel(struct channel_gk20a *ch, bool finish) +/* Returns delta of cyclic integers a and b. If a is ahead of b, delta + * is positive */ +static int cyclic_delta(int a, int b) +{ + return a - b; +} + +static void gk20a_wait_for_deferred_interrupts(struct gk20a *g) +{ + int stall_irq_threshold = atomic_read(&g->hw_irq_stall_count); + int nonstall_irq_threshold = atomic_read(&g->hw_irq_nonstall_count); + + /* wait until all stalling irqs are handled */ + wait_event(g->sw_irq_stall_last_handled_wq, + cyclic_delta(stall_irq_threshold, + atomic_read(&g->sw_irq_stall_last_handled)) + <= 0); + + /* wait until all non-stalling irqs are handled */ + wait_event(g->sw_irq_nonstall_last_handled_wq, + cyclic_delta(nonstall_irq_threshold, + atomic_read(&g->sw_irq_nonstall_last_handled)) + <= 0); +} + +static void gk20a_wait_until_counter_is_N( + struct channel_gk20a *ch, atomic_t *counter, int wait_value, + wait_queue_head_t *wq, const char *caller, const char *counter_name) +{ + while (true) { + if (wait_event_timeout( + *wq, + atomic_read(counter) == wait_value, + msecs_to_jiffies(5000)) > 0) + break; + + gk20a_warn(dev_from_gk20a(ch->g), + "%s: channel %d, still waiting, %s left: %d, waiting for: %d", + caller, ch->hw_chid, counter_name, + atomic_read(counter), wait_value); + } +} + + + +/* call ONLY when no references to the channel exist: after the last put */ +static void gk20a_free_channel(struct channel_gk20a *ch) { struct gk20a *g = ch->g; struct fifo_gk20a *f = &g->fifo; @@ -654,13 +712,50 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish) gk20a_dbg_fn(""); + WARN_ON(ch->g == NULL); + + trace_gk20a_free_channel(ch->hw_chid); + + /* prevent new kickoffs */ + ch->has_timedout = true; + wmb(); + + /* wait until there's only our ref to the channel */ + gk20a_wait_until_counter_is_N( + ch, &ch->ref_count, 1, &ch->ref_count_dec_wq, + __func__, "references"); + + /* wait until all pending interrupts for recently completed + * jobs are handled */ + gk20a_wait_for_deferred_interrupts(g); + + /* prevent new refs */ + spin_lock(&ch->ref_obtain_lock); + if (!ch->referenceable) { + spin_unlock(&ch->ref_obtain_lock); + gk20a_err(dev_from_gk20a(ch->g), + "Extra %s() called to channel %u", + __func__, ch->hw_chid); + return; + } + ch->referenceable = false; + spin_unlock(&ch->ref_obtain_lock); + + /* matches with the initial reference in gk20a_open_new_channel() */ + atomic_dec(&ch->ref_count); + + /* wait until no more refs to the channel */ + gk20a_wait_until_counter_is_N( + ch, &ch->ref_count, 0, &ch->ref_count_dec_wq, + __func__, "references"); + /* if engine reset was deferred, perform it now */ mutex_lock(&f->deferred_reset_mutex); if (g->fifo.deferred_reset_pending) { gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was" " deferred, running now"); - gk20a_fifo_reset_engine(g, g->fifo.mmu_fault_engines); - g->fifo.mmu_fault_engines = 0; + gk20a_fifo_reset_engine(g, g->fifo.deferred_fault_engines); + g->fifo.deferred_fault_engines = 0; g->fifo.deferred_reset_pending = false; } mutex_unlock(&f->deferred_reset_mutex); @@ -674,7 +769,7 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish) gk20a_dbg_info("freeing bound channel context, timeout=%ld", timeout); - gk20a_disable_channel(ch, finish && !ch->has_timedout, timeout); + gk20a_disable_channel(ch, !ch->has_timedout, timeout); gk20a_free_error_notifiers(ch); @@ -714,6 +809,10 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish) spin_unlock(&ch->update_fn_lock); 
cancel_work_sync(&ch->update_fn_work); + /* make sure we don't have deferred interrupts pending that + * could still touch the channel */ + gk20a_wait_for_deferred_interrupts(g); + unbind: if (gk20a_is_channel_marked_as_tsg(ch)) gk20a_tsg_unbind_channel(ch); @@ -743,8 +842,66 @@ unbind: mutex_unlock(&ch->dbg_s_lock); release: + /* make sure we catch accesses of unopened channels in case + * there's non-refcounted channel pointers hanging around */ + ch->g = NULL; + wmb(); + /* ALWAYS last */ - release_used_channel(f, ch); + free_channel(f, ch); +} + +/* Try to get a reference to the channel. Return nonzero on success. If fails, + * the channel is dead or being freed elsewhere and you must not touch it. + * + * Always when a channel_gk20a pointer is seen and about to be used, a + * reference must be held to it - either by you or the caller, which should be + * documented well or otherwise clearly seen. This usually boils down to the + * file from ioctls directly, or an explicit get in exception handlers when the + * channel is found by a hw_chid. + * + * Most global functions in this file require a reference to be held by the + * caller. + */ +struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch, + const char *caller) { + struct channel_gk20a *ret; + + spin_lock(&ch->ref_obtain_lock); + + if (likely(ch->referenceable)) { + atomic_inc(&ch->ref_count); + ret = ch; + } else + ret = NULL; + + spin_unlock(&ch->ref_obtain_lock); + + if (ret) + trace_gk20a_channel_get(ch->hw_chid, caller); + + return ret; +} + +void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller) +{ + trace_gk20a_channel_put(ch->hw_chid, caller); + atomic_dec(&ch->ref_count); + wake_up_all(&ch->ref_count_dec_wq); + + /* More puts than gets. Channel is probably going to get + * stuck. */ + WARN_ON(atomic_read(&ch->ref_count) < 0); + + /* Also, more puts than gets. ref_count can go to 0 only if + * the channel is closing. Channel is probably going to get + * stuck. */ + WARN_ON(atomic_read(&ch->ref_count) == 0 && ch->referenceable); +} + +void gk20a_channel_close(struct channel_gk20a *ch) +{ + gk20a_free_channel(ch); } int gk20a_channel_release(struct inode *inode, struct file *filp) @@ -758,14 +915,14 @@ int gk20a_channel_release(struct inode *inode, struct file *filp) trace_gk20a_channel_release(dev_name(&g->dev->dev)); - err = gk20a_busy(ch->g->dev); + err = gk20a_busy(g->dev); if (err) { gk20a_err(dev_from_gk20a(g), "failed to release channel %d", ch->hw_chid); return err; } - gk20a_free_channel(ch, true); - gk20a_idle(ch->g->dev); + gk20a_channel_close(ch); + gk20a_idle(g->dev); filp->private_data = NULL; return 0; @@ -808,22 +965,31 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g) struct fifo_gk20a *f = &g->fifo; struct channel_gk20a *ch; - ch = acquire_unused_channel(f); + gk20a_dbg_fn(""); + + ch = allocate_channel(f); if (ch == NULL) { /* TBD: we want to make this virtualizable */ gk20a_err(dev_from_gk20a(g), "out of hw chids"); return NULL; } + trace_gk20a_open_new_channel(ch->hw_chid); + + BUG_ON(ch->g); ch->g = g; if (g->ops.fifo.alloc_inst(g, ch)) { - ch->in_use = false; + ch->g = NULL; + free_channel(f, ch); gk20a_err(dev_from_gk20a(g), "failed to open gk20a channel, out of inst mem"); - return NULL; } + + /* now the channel is in a limbo out of the free list but not marked as + * alive and used (i.e. 
get-able) yet */ + ch->pid = current->pid; /* By default, channel is regular (non-TSG) channel */ @@ -854,6 +1020,13 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g) spin_lock_init(&ch->update_fn_lock); INIT_WORK(&ch->update_fn_work, gk20a_channel_update_runcb_fn); + /* Mark the channel alive, get-able, with 1 initial use + * references. The initial reference will be decreased in + * gk20a_free_channel() */ + ch->referenceable = true; + atomic_set(&ch->ref_count, 1); + wmb(); + return ch; } @@ -1379,7 +1552,7 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, struct mapped_buffer_node **mapped_buffers = NULL; int err = 0, num_mapped_buffers; - /* job needs reference to this vm */ + /* job needs reference to this vm (released in channel_update) */ gk20a_vm_get(vm); err = gk20a_vm_get_buffers(vm, &mapped_buffers, &num_mapped_buffers); @@ -1395,14 +1568,21 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, return -ENOMEM; } - job->num_mapped_buffers = num_mapped_buffers; - job->mapped_buffers = mapped_buffers; - job->pre_fence = gk20a_fence_get(pre_fence); - job->post_fence = gk20a_fence_get(post_fence); + /* put() is done in gk20a_channel_update() when the job is done */ + c = gk20a_channel_get(c); - mutex_lock(&c->jobs_lock); - list_add_tail(&job->list, &c->jobs); - mutex_unlock(&c->jobs_lock); + if (c) { + job->num_mapped_buffers = num_mapped_buffers; + job->mapped_buffers = mapped_buffers; + job->pre_fence = gk20a_fence_get(pre_fence); + job->post_fence = gk20a_fence_get(post_fence); + + mutex_lock(&c->jobs_lock); + list_add_tail(&job->list, &c->jobs); + mutex_unlock(&c->jobs_lock); + } else { + return -ETIMEDOUT; + } return 0; } @@ -1412,13 +1592,15 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) struct vm_gk20a *vm = c->vm; struct channel_gk20a_job *job, *n; - trace_gk20a_channel_update(c); + trace_gk20a_channel_update(c->hw_chid); wake_up(&c->submit_wq); mutex_lock(&c->submit_lock); mutex_lock(&c->jobs_lock); list_for_each_entry_safe(job, n, &c->jobs, list) { + struct gk20a *g = c->g; + bool completed = gk20a_fence_is_expired(job->post_fence); if (!completed) break; @@ -1434,12 +1616,15 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) gk20a_fence_put(job->pre_fence); gk20a_fence_put(job->post_fence); - /* job is done. release its reference to vm */ + /* job is done. release its vm reference (taken in add_job) */ gk20a_vm_put(vm); + /* another bookkeeping taken in add_job. caller must hold a ref + * so this wouldn't get freed here. 
*/ + gk20a_channel_put(c); list_del_init(&job->list); kfree(job); - gk20a_idle(c->g->dev); + gk20a_idle(g->dev); } /* @@ -1719,10 +1904,13 @@ clean_up: int gk20a_init_channel_support(struct gk20a *g, u32 chid) { struct channel_gk20a *c = g->fifo.channel+chid; - c->g = g; - c->in_use = false; + c->g = NULL; c->hw_chid = chid; c->bound = false; + spin_lock_init(&c->ref_obtain_lock); + atomic_set(&c->ref_count, 0); + c->referenceable = false; + init_waitqueue_head(&c->ref_count_dec_wq); mutex_init(&c->ioctl_lock); mutex_init(&c->jobs_lock); mutex_init(&c->submit_lock); @@ -1733,6 +1921,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) #endif INIT_LIST_HEAD(&c->dbg_s_list); mutex_init(&c->dbg_s_lock); + list_add(&c->free_chs, &g->fifo.free_chs); return 0; } @@ -2066,8 +2255,7 @@ int gk20a_channel_suspend(struct gk20a *g) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *ch = &f->channel[chid]; - if (ch->in_use) { - + if (gk20a_channel_get(ch)) { gk20a_dbg_info("suspend channel %d", chid); /* disable channel */ g->ops.fifo.disable_channel(ch); @@ -2079,6 +2267,8 @@ int gk20a_channel_suspend(struct gk20a *g) flush_work(&ch->update_fn_work); channels_in_use = true; + + gk20a_channel_put(ch); } } @@ -2086,8 +2276,10 @@ int gk20a_channel_suspend(struct gk20a *g) g->ops.fifo.update_runlist(g, 0, ~0, false, true); for (chid = 0; chid < f->num_channels; chid++) { - if (f->channel[chid].in_use) + if (gk20a_channel_get(&f->channel[chid])) { g->ops.fifo.unbind_channel(&f->channel[chid]); + gk20a_channel_put(&f->channel[chid]); + } } } @@ -2095,8 +2287,6 @@ int gk20a_channel_suspend(struct gk20a *g) return 0; } -/* in this context the "channel" is the host1x channel which - * maps to *all* gk20a channels */ int gk20a_channel_resume(struct gk20a *g) { struct fifo_gk20a *f = &g->fifo; @@ -2106,10 +2296,11 @@ int gk20a_channel_resume(struct gk20a *g) gk20a_dbg_fn(""); for (chid = 0; chid < f->num_channels; chid++) { - if (f->channel[chid].in_use) { + if (gk20a_channel_get(&f->channel[chid])) { gk20a_dbg_info("resume channel %d", chid); g->ops.fifo.bind_channel(&f->channel[chid]); channels_in_use = true; + gk20a_channel_put(&f->channel[chid]); } } @@ -2129,10 +2320,11 @@ void gk20a_channel_semaphore_wakeup(struct gk20a *g) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *c = g->fifo.channel+chid; - if (c->in_use) { + if (gk20a_channel_get(c)) { gk20a_channel_event(c); wake_up_interruptible_all(&c->semaphore_wq); gk20a_channel_update(c, 0); + gk20a_channel_put(c); } } } @@ -2225,10 +2417,18 @@ long gk20a_channel_ioctl(struct file *filp, return -EFAULT; } + /* take a ref or return timeout if channel refs can't be taken */ + ch = gk20a_channel_get(ch); + if (!ch) + return -ETIMEDOUT; + /* protect our sanity for threaded userspace - most of the channel is * not thread safe */ mutex_lock(&ch->ioctl_lock); + /* this ioctl call keeps a ref to the file which keeps a ref to the + * channel */ + switch (cmd) { case NVGPU_IOCTL_CHANNEL_OPEN: err = gk20a_channel_open_ioctl(ch->g, @@ -2449,9 +2649,11 @@ long gk20a_channel_ioctl(struct file *filp, if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd)); - gk20a_dbg_fn("end"); - mutex_unlock(&ch->ioctl_lock); + gk20a_channel_put(ch); + + gk20a_dbg_fn("end"); + return err; } diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index f022fe36..2ea5b4be 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ 
b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -19,12 +19,13 @@ #define CHANNEL_GK20A_H #include -#include -#include #include -#include #include +#include +#include #include +#include +#include struct gk20a; struct gr_gk20a; @@ -77,8 +78,15 @@ struct channel_gk20a_poll_events { /* this is the priv element of struct nvhost_channel */ struct channel_gk20a { - struct gk20a *g; - bool in_use; + struct gk20a *g; /* set only when channel is active */ + + struct list_head free_chs; + + spinlock_t ref_obtain_lock; + bool referenceable; + atomic_t ref_count; + wait_queue_head_t ref_count_dec_wq; + int hw_chid; bool bound; bool first_init; @@ -171,7 +179,10 @@ static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch) } int channel_gk20a_commit_va(struct channel_gk20a *c); int gk20a_init_channel_support(struct gk20a *, u32 chid); -void gk20a_free_channel(struct channel_gk20a *ch, bool finish); + +/* must be inside gk20a_busy()..gk20a_idle() */ +void gk20a_channel_close(struct channel_gk20a *ch); + bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, u32 timeout_delta_ms); void gk20a_disable_channel(struct channel_gk20a *ch, @@ -202,6 +213,15 @@ void gk20a_channel_event(struct channel_gk20a *ch); void gk20a_init_channel(struct gpu_ops *gops); +/* returns ch if reference was obtained */ +struct channel_gk20a *__must_check _gk20a_channel_get(struct channel_gk20a *ch, + const char *caller); +#define gk20a_channel_get(ch) _gk20a_channel_get(ch, __func__) + + +void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller); +#define gk20a_channel_put(ch) _gk20a_channel_put(ch, __func__) + int gk20a_wait_channel_idle(struct channel_gk20a *ch); struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g); struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index 8cc852c7..7a707fbd 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c @@ -154,8 +154,23 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd, static void gk20a_channel_syncpt_update(void *priv, int nr_completed) { - struct channel_gk20a *ch20a = priv; - gk20a_channel_update(ch20a, nr_completed); + struct channel_gk20a *ch = priv; + struct gk20a *g = ch->g; + + /* need busy for possible channel deletion */ + if (gk20a_busy(ch->g->dev)) { + gk20a_err(dev_from_gk20a(ch->g), + "failed to busy while syncpt update"); + /* Last gk20a_idle()s are in channel_update, so we shouldn't + * get here. If we do, the channel is badly broken now */ + return; + } + + /* note: channel_get() is in __gk20a_channel_syncpt_incr() */ + gk20a_channel_update(ch, nr_completed); + gk20a_channel_put(ch); + + gk20a_idle(g->dev); } static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s, @@ -209,14 +224,37 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s, thresh = nvhost_syncpt_incr_max_ext(sp->host1x_pdev, sp->id, 2); if (register_irq) { - err = nvhost_intr_register_notifier(sp->host1x_pdev, - sp->id, thresh, - gk20a_channel_syncpt_update, c); - - /* Adding interrupt action should never fail. A proper error - * handling here would require us to decrement the syncpt max - * back to its original value. 
*/ - WARN(err, "failed to set submit complete interrupt"); + err = gk20a_busy(c->g->dev); + if (err) + gk20a_err(dev_from_gk20a(c->g), + "failed to add syncpt interrupt notifier for channel %d", + c->hw_chid); + else { + struct channel_gk20a *referenced = gk20a_channel_get(c); + + WARN_ON(!referenced); + gk20a_idle(c->g->dev); + + if (referenced) { + /* note: channel_put() is in + * gk20a_channel_syncpt_update() */ + + err = nvhost_intr_register_notifier( + sp->host1x_pdev, + sp->id, thresh, + gk20a_channel_syncpt_update, c); + if (err) + gk20a_channel_put(referenced); + + /* Adding interrupt action should + * never fail. A proper error handling + * here would require us to decrement + * the syncpt max back to its original + * value. */ + WARN(err, + "failed to set submit complete interrupt"); + } + } } *fence = gk20a_fence_from_syncpt(sp->host1x_pdev, sp->id, thresh, diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index 0f1c31dd..bda0dab0 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -36,6 +36,7 @@ static struct platform_device *gk20a_device; struct ch_state { int pid; + int refs; u8 inst_block[0]; }; @@ -118,9 +119,10 @@ static void gk20a_debug_show_channel(struct gk20a *g, syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w()); syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w()); - gk20a_debug_output(o, "%d-%s, pid %d: ", hw_chid, + gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, g->dev->name, - ch_state->pid); + ch_state->pid, + ch_state->refs); gk20a_debug_output(o, "%s in use %s %s\n", ccsr_channel_enable_v(channel) ? "" : "not", ccsr_chan_status_str[status], @@ -231,16 +233,30 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) } for (chid = 0; chid < f->num_channels; chid++) { - if (f->channel[chid].in_use) - ch_state[chid] = kmalloc(sizeof(struct ch_state) + ram_in_alloc_size_v(), GFP_KERNEL); + struct channel_gk20a *ch = &f->channel[chid]; + if (gk20a_channel_get(ch)) { + ch_state[chid] = + kmalloc(sizeof(struct ch_state) + + ram_in_alloc_size_v(), GFP_KERNEL); + /* ref taken stays to below loop with + * successful allocs */ + if (!ch_state[chid]) + gk20a_channel_put(ch); + } } for (chid = 0; chid < f->num_channels; chid++) { - if (ch_state[chid] && f->channel[chid].inst_block.cpu_va) { - ch_state[chid]->pid = f->channel[chid].pid; - memcpy(&ch_state[chid]->inst_block[0], - f->channel[chid].inst_block.cpu_va, - ram_in_alloc_size_v()); + struct channel_gk20a *ch = &f->channel[chid]; + if (ch_state[chid]) { + if (ch->inst_block.cpu_va) { + ch_state[chid]->pid = ch->pid; + ch_state[chid]->refs = + atomic_read(&ch->ref_count); + memcpy(&ch_state[chid]->inst_block[0], + ch->inst_block.cpu_va, + ram_in_alloc_size_v()); + } + gk20a_channel_put(ch); } } for (chid = 0; chid < f->num_channels; chid++) { diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 56b954a9..4ef310b2 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -515,6 +515,9 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) init_runlist(g, f); + INIT_LIST_HEAD(&f->free_chs); + mutex_init(&f->free_chs_mutex); + for (chid = 0; chid < f->num_channels; chid++) { f->channel[chid].userd_cpu_va = f->userd.cpu_va + chid * f->userd_entry_size; @@ -527,7 +530,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) gk20a_init_channel_support(g, chid); gk20a_init_tsg_support(g, chid); } - 
mutex_init(&f->ch_inuse_mutex); mutex_init(&f->tsg_inuse_mutex); f->remove_support = gk20a_remove_fifo_support; @@ -637,6 +639,7 @@ int gk20a_init_fifo_support(struct gk20a *g) return err; } +/* return with a reference to the channel, caller must put it back */ static struct channel_gk20a * channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr) { @@ -644,10 +647,16 @@ channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr) if (unlikely(!f->channel)) return NULL; for (ci = 0; ci < f->num_channels; ci++) { - struct channel_gk20a *c = f->channel+ci; - if (c->inst_block.cpu_va && - (inst_ptr == gk20a_mem_phys(&c->inst_block))) - return f->channel+ci; + struct channel_gk20a *ch = gk20a_channel_get(&f->channel[ci]); + /* only alive channels are searched */ + if (!ch) + continue; + + if (ch->inst_block.cpu_va && + (inst_ptr == gk20a_mem_phys(&ch->inst_block))) + return ch; + + gk20a_channel_put(ch); } return NULL; } @@ -803,6 +812,7 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id, return true; } +/* caller must hold a channel reference */ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, struct channel_gk20a *ch) { @@ -854,14 +864,38 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, "TSG %d generated a mmu fault", tsg->tsgid); mutex_lock(&tsg->ch_list_lock); - list_for_each_entry(ch, &tsg->ch_list, ch_entry) - ret = gk20a_fifo_set_ctx_mmu_error(g, ch); + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + if (!gk20a_fifo_set_ctx_mmu_error(g, ch)) + ret = false; + gk20a_channel_put(ch); + } + } mutex_unlock(&tsg->ch_list_lock); return ret; } -static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) +static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid) +{ + struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; + struct channel_gk20a *ch; + + mutex_lock(&tsg->ch_list_lock); + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + gk20a_channel_abort(ch); + gk20a_channel_put(ch); + } + } + mutex_unlock(&tsg->ch_list_lock); +} + +static bool gk20a_fifo_handle_mmu_fault( + struct gk20a *g, + u32 mmu_fault_engines, /* queried from HW if 0 */ + u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/ + bool id_is_tsg) { bool fake_fault; unsigned long fault_id; @@ -894,10 +928,8 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) grfifo_ctl | gr_gpfifo_ctl_access_f(0) | gr_gpfifo_ctl_semaphore_access_f(0)); - /* If we have recovery in progress, MMU fault id is invalid */ - if (g->fifo.mmu_fault_engines) { - fault_id = g->fifo.mmu_fault_engines; - g->fifo.mmu_fault_engines = 0; + if (mmu_fault_engines) { + fault_id = mmu_fault_engines; fake_fault = true; } else { fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r()); @@ -914,6 +946,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) struct fifo_mmu_fault_info_gk20a f; struct channel_gk20a *ch = NULL; struct tsg_gk20a *tsg = NULL; + struct channel_gk20a *referenced_channel = 0; /* read and parse engine status */ u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id)); u32 ctx_status = fifo_engine_status_ctx_status_v(status); @@ -953,22 +986,34 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) /* get the channel/TSG */ if (fake_fault) { /* use next_id if context load is failing */ - u32 id = (ctx_status == - fifo_engine_status_ctx_status_ctxsw_load_v()) ? 
- fifo_engine_status_next_id_v(status) : - fifo_engine_status_id_v(status); - u32 type = (ctx_status == - fifo_engine_status_ctx_status_ctxsw_load_v()) ? - fifo_engine_status_next_id_type_v(status) : - fifo_engine_status_id_type_v(status); + u32 id, type; + + if (hw_id == ~(u32)0) { + id = (ctx_status == + fifo_engine_status_ctx_status_ctxsw_load_v()) ? + fifo_engine_status_next_id_v(status) : + fifo_engine_status_id_v(status); + type = (ctx_status == + fifo_engine_status_ctx_status_ctxsw_load_v()) ? + fifo_engine_status_next_id_type_v(status) : + fifo_engine_status_id_type_v(status); + } else { + id = hw_id; + type = id_is_tsg ? + fifo_engine_status_id_type_tsgid_v() : + fifo_engine_status_id_type_chid_v(); + } if (type == fifo_engine_status_id_type_tsgid_v()) tsg = &g->fifo.tsg[id]; - else if (type == fifo_engine_status_id_type_chid_v()) + else if (type == fifo_engine_status_id_type_chid_v()) { ch = &g->fifo.channel[id]; + referenced_channel = gk20a_channel_get(ch); + } } else { /* read channel based on instruction pointer */ ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr); + referenced_channel = ch; } if (ch && gk20a_is_channel_marked_as_tsg(ch)) @@ -977,7 +1022,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) /* check if engine reset should be deferred */ if ((ch || tsg) && gk20a_fifo_should_defer_engine_reset(g, engine_id, &f, fake_fault)) { - g->fifo.mmu_fault_engines = fault_id; + g->fifo.deferred_fault_engines = fault_id; /* handled during channel free */ g->fifo.deferred_reset_pending = true; @@ -988,19 +1033,31 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) * syncpoints */ if (tsg) { - struct channel_gk20a *ch = NULL; if (!g->fifo.deferred_reset_pending) verbose = gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); - mutex_lock(&tsg->ch_list_lock); - list_for_each_entry(ch, &tsg->ch_list, ch_entry) - gk20a_channel_abort(ch); - mutex_unlock(&tsg->ch_list_lock); + + gk20a_fifo_abort_tsg(g, ch->tsgid); + + /* put back the ref taken early above */ + if (referenced_channel) { + gk20a_channel_put(ch); + } else { + gk20a_err(dev_from_gk20a(g), + "mmu error in freed tsg channel %d on tsgid %d", + ch->hw_chid, ch->tsgid); + } } else if (ch) { - if (!g->fifo.deferred_reset_pending) - verbose = - gk20a_fifo_set_ctx_mmu_error_ch(g, ch); - gk20a_channel_abort(ch); + if (referenced_channel) { + if (!g->fifo.deferred_reset_pending) + verbose = gk20a_fifo_set_ctx_mmu_error_ch(g, ch); + gk20a_channel_abort(ch); + gk20a_channel_put(ch); + } else { + gk20a_err(dev_from_gk20a(g), + "mmu error in freed channel %d", + ch->hw_chid); + } } else if (f.inst_ptr == gk20a_mem_phys(&g->mm.bar1.inst_block)) { gk20a_err(dev_from_gk20a(g), "mmu fault from bar1"); @@ -1133,46 +1190,69 @@ static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg) void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose) { - u32 engines = gk20a_fifo_engines_on_id(g, hw_chid, false); + u32 engines; + + /* stop context switching to prevent engine assignments from + changing until channel is recovered */ + mutex_lock(&g->dbg_sessions_lock); + gr_gk20a_disable_ctxsw(g); + + engines = gk20a_fifo_engines_on_id(g, hw_chid, false); + if (engines) - gk20a_fifo_recover(g, engines, verbose); + gk20a_fifo_recover(g, engines, hw_chid, false, verbose); else { - struct channel_gk20a *ch = - g->fifo.channel + hw_chid; + struct channel_gk20a *ch = &g->fifo.channel[hw_chid]; - gk20a_channel_abort(ch); + if (gk20a_channel_get(ch)) { + gk20a_channel_abort(ch); - if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch)) 
- gk20a_debug_dump(g->dev); + if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch)) + gk20a_debug_dump(g->dev); + + gk20a_channel_put(ch); + } } + + gr_gk20a_enable_ctxsw(g); + mutex_unlock(&g->dbg_sessions_lock); } void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose) { - u32 engines = gk20a_fifo_engines_on_id(g, tsgid, true); + u32 engines; + + /* stop context switching to prevent engine assignments from + changing until TSG is recovered */ + mutex_lock(&g->dbg_sessions_lock); + gr_gk20a_disable_ctxsw(g); + + engines = gk20a_fifo_engines_on_id(g, tsgid, true); + if (engines) - gk20a_fifo_recover(g, engines, verbose); + gk20a_fifo_recover(g, engines, tsgid, true, verbose); else { struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; - struct channel_gk20a *ch; if (gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg)) gk20a_debug_dump(g->dev); - mutex_lock(&tsg->ch_list_lock); - list_for_each_entry(ch, &tsg->ch_list, ch_entry) - gk20a_channel_abort(ch); - mutex_unlock(&tsg->ch_list_lock); + gk20a_fifo_abort_tsg(g, tsgid); } + + gr_gk20a_enable_ctxsw(g); + mutex_unlock(&g->dbg_sessions_lock); } void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, - bool verbose) + u32 hw_id, bool id_is_tsg, + bool verbose) { unsigned long engine_id, i; unsigned long _engine_ids = __engine_ids; unsigned long engine_ids = 0; u32 val; + u32 mmu_fault_engines = 0; if (verbose) gk20a_debug_dump(g->dev); @@ -1181,7 +1261,6 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, g->ops.ltc.flush(g); /* store faulted engines in advance */ - g->fifo.mmu_fault_engines = 0; for_each_set_bit(engine_id, &_engine_ids, 32) { u32 ref_type; u32 ref_id; @@ -1196,11 +1275,10 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, gk20a_fifo_get_faulty_id_type(g, i, &id, &type); if (ref_type == type && ref_id == id) { engine_ids |= BIT(i); - g->fifo.mmu_fault_engines |= + mmu_fault_engines |= BIT(gk20a_engine_id_to_mmu_id(i)); } } - } /* @@ -1214,7 +1292,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, fifo_intr_0_sched_error_reset_f()); g->ops.fifo.trigger_mmu_fault(g, engine_ids); - gk20a_fifo_handle_mmu_fault(g); + gk20a_fifo_handle_mmu_fault(g, engine_ids, hw_id, id_is_tsg); val = gk20a_readl(g, fifo_intr_en_0_r()); val |= fifo_intr_en_0_mmu_fault_f(1) @@ -1222,25 +1300,32 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, gk20a_writel(g, fifo_intr_en_0_r(), val); } +/* force reset channel and tsg (if it's part of one) */ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) { struct tsg_gk20a *tsg = NULL; struct channel_gk20a *ch_tsg = NULL; + struct gk20a *g = ch->g; if (gk20a_is_channel_marked_as_tsg(ch)) { - tsg = &ch->g->fifo.tsg[ch->hw_chid]; + tsg = &g->fifo.tsg[ch->hw_chid]; mutex_lock(&tsg->ch_list_lock); + list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { - gk20a_set_error_notifier(ch_tsg, - NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); + if (gk20a_channel_get(ch_tsg)) { + gk20a_set_error_notifier(ch_tsg, + NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); + gk20a_channel_put(ch_tsg); + } } + mutex_unlock(&tsg->ch_list_lock); - gk20a_fifo_recover_tsg(ch->g, ch->tsgid, verbose); + gk20a_fifo_recover_tsg(g, ch->tsgid, verbose); } else { gk20a_set_error_notifier(ch, NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); - gk20a_fifo_recover_ch(ch->g, ch->hw_chid, verbose); + gk20a_fifo_recover_ch(g, ch->hw_chid, verbose); } return 0; @@ -1300,11 +1385,14 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) struct channel_gk20a *ch = &f->channel[id]; if (non_chid) { - 
gk20a_fifo_recover(g, BIT(engine_id), true); + gk20a_fifo_recover(g, BIT(engine_id), id, true, true); ret = true; goto err; } + if (!gk20a_channel_get(ch)) + goto err; + if (gk20a_channel_update_and_check_timeout(ch, GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { gk20a_set_error_notifier(ch, @@ -1313,7 +1401,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) "fifo sched ctxsw timeout error:" "engine = %u, ch = %d", engine_id, id); gk20a_gr_debug_dump(g->dev); - gk20a_fifo_recover(g, BIT(engine_id), + gk20a_fifo_recover(g, BIT(engine_id), id, false, ch->timeout_debug_dump); ret = true; } else { @@ -1324,6 +1412,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) id); ret = false; } + gk20a_channel_put(ch); return ret; } @@ -1336,7 +1425,7 @@ err: static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) { - bool print_channel_reset_log = false, reset_engine = false; + bool print_channel_reset_log = false; struct device *dev = dev_from_gk20a(g); u32 handled = 0; @@ -1367,8 +1456,8 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) } if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) { - print_channel_reset_log = gk20a_fifo_handle_mmu_fault(g); - reset_engine = true; + print_channel_reset_log = + gk20a_fifo_handle_mmu_fault(g, 0, ~(u32)0, false); handled |= fifo_intr_0_mmu_fault_pending_f(); } @@ -1452,9 +1541,12 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev, == fifo_pbdma_status_id_type_chid_v()) { struct channel_gk20a *ch = &f->channel[id]; - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_PBDMA_ERROR); - gk20a_fifo_recover_ch(g, id, true); + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_PBDMA_ERROR); + gk20a_fifo_recover_ch(g, id, true); + gk20a_channel_put(ch); + } } else if (fifo_pbdma_status_id_type_v(status) == fifo_pbdma_status_id_type_tsgid_v()) { struct tsg_gk20a *tsg = &f->tsg[id]; @@ -1462,8 +1554,11 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev, mutex_lock(&tsg->ch_list_lock); list_for_each_entry(ch, &tsg->ch_list, ch_entry) { - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_PBDMA_ERROR); + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_PBDMA_ERROR); + gk20a_channel_put(ch); + } } mutex_unlock(&tsg->ch_list_lock); gk20a_fifo_recover_tsg(g, id, true); @@ -1559,6 +1654,8 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)); u32 ret = 0; + gk20a_dbg_fn("%d", id); + /* issue preempt */ if (is_tsg) gk20a_writel(g, fifo_preempt_r(), @@ -1569,6 +1666,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) fifo_preempt_chid_f(id) | fifo_preempt_type_channel_f()); + gk20a_dbg_fn("%d", id); /* wait for preempt */ ret = -EBUSY; do { @@ -1583,6 +1681,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) } while (time_before(jiffies, end_jiffies) || !tegra_platform_is_silicon()); + gk20a_dbg_fn("%d", id); if (ret) { if (is_tsg) { struct tsg_gk20a *tsg = &g->fifo.tsg[id]; @@ -1593,8 +1692,11 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) mutex_lock(&tsg->ch_list_lock); list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (!gk20a_channel_get(ch)) + continue; gk20a_set_error_notifier(ch, NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + gk20a_channel_put(ch); } mutex_unlock(&tsg->ch_list_lock); gk20a_fifo_recover_tsg(g, id, true); @@ -1604,9 +1706,12 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) 
gk20a_err(dev_from_gk20a(g), "preempt channel %d timeout\n", id); - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); - gk20a_fifo_recover_ch(g, id, true); + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + gk20a_fifo_recover_ch(g, id, true); + gk20a_channel_put(ch); + } } } @@ -1790,7 +1895,9 @@ static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id) (f->engine_info[i].runlist_id == runlist_id)) engines |= BIT(i); } - gk20a_fifo_recover(g, engines, true); + + if (engines) + gk20a_fifo_recover(g, engines, ~(u32)0, false, true); } static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id) @@ -1994,6 +2101,8 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 hw_chid, u32 mutex_ret; u32 ret = 0; + gk20a_dbg_fn(""); + runlist = &f->runlist_info[runlist_id]; mutex_lock(&runlist->mutex); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index dd320ae1..fdf843d2 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -3,7 +3,7 @@ * * GK20A graphics fifo (gr host) * - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -106,7 +106,9 @@ struct fifo_gk20a { u32 userd_entry_size; struct channel_gk20a *channel; - struct mutex ch_inuse_mutex; /* protect unused chid look up */ + /* zero-kref'd channels here */ + struct list_head free_chs; + struct mutex free_chs_mutex; struct tsg_gk20a *tsg; struct mutex tsg_inuse_mutex; @@ -130,7 +132,7 @@ struct fifo_gk20a { } intr; - u32 mmu_fault_engines; + u32 deferred_fault_engines; bool deferred_reset_pending; struct mutex deferred_reset_mutex; }; @@ -157,7 +159,12 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 hw_chid, int gk20a_fifo_suspend(struct gk20a *g); bool gk20a_fifo_mmu_fault_pending(struct gk20a *g); -void gk20a_fifo_recover(struct gk20a *g, u32 engine_ids, bool verbose); + +void gk20a_fifo_recover(struct gk20a *g, + u32 engine_ids, /* if zero, will be queried from HW */ + u32 hw_id, /* if ~0, will be queried from HW */ + bool hw_id_is_tsg, /* ignored if hw_id == ~0 */ + bool verbose); void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 9c201f32..498de7e7 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -1388,6 +1388,9 @@ static int gk20a_probe(struct platform_device *dev) return -ENOMEM; } + init_waitqueue_head(&gk20a->sw_irq_stall_last_handled_wq); + init_waitqueue_head(&gk20a->sw_irq_nonstall_last_handled_wq); + #ifdef CONFIG_PM_GENERIC_DOMAINS_OF gk20a_domain = container_of(dev_to_genpd(&dev->dev), struct gk20a_domain_data, gpd); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index a52d97f3..d8e3586f 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -538,6 +538,15 @@ struct gk20a { u32 max_ltc_count; u32 ltc_count; + atomic_t hw_irq_stall_count; + atomic_t hw_irq_nonstall_count; + + atomic_t sw_irq_stall_last_handled; + wait_queue_head_t 
sw_irq_stall_last_handled_wq; + + atomic_t sw_irq_nonstall_last_handled; + wait_queue_head_t sw_irq_nonstall_last_handled_wq; + struct devfreq *devfreq; struct gk20a_scale_profile *scale_profile; diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index b2fea5b8..edd4c6c8 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -5138,22 +5138,25 @@ static int gk20a_gr_handle_notify_pending(struct gk20a *g, * Also used by regops to translate current ctx to chid and tsgid. * For performance, we don't want to go through 128 channels every time. * curr_ctx should be the value read from gr_fecs_current_ctx_r(). - * A small tlb is used here to cache translation */ -static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx, - int *curr_tsgid) + * A small tlb is used here to cache translation. + * + * Returned channel must be freed with gk20a_channel_put() */ +static struct channel_gk20a *gk20a_gr_get_channel_from_ctx( + struct gk20a *g, u32 curr_ctx, int *curr_tsgid) { struct fifo_gk20a *f = &g->fifo; struct gr_gk20a *gr = &g->gr; u32 chid = -1; int tsgid = NVGPU_INVALID_TSG_ID; u32 i; + struct channel_gk20a *ret = NULL; /* when contexts are unloaded from GR, the valid bit is reset * but the instance pointer information remains intact. So the * valid bit must be checked to be absolutely certain that a * valid context is currently resident. */ if (!gr_fecs_current_ctx_valid_v(curr_ctx)) - return -1; + return NULL; spin_lock(&gr->ch_tlb_lock); @@ -5162,25 +5165,30 @@ static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx, if (gr->chid_tlb[i].curr_ctx == curr_ctx) { chid = gr->chid_tlb[i].hw_chid; tsgid = gr->chid_tlb[i].tsgid; + ret = gk20a_channel_get(&f->channel[chid]); goto unlock; } } /* slow path */ - for (chid = 0; chid < f->num_channels; chid++) - if (f->channel[chid].in_use) { - if ((u32)(gk20a_mem_phys(&f->channel[chid].inst_block) >> - ram_in_base_shift_v()) == + for (chid = 0; chid < f->num_channels; chid++) { + struct channel_gk20a *ch = &f->channel[chid]; + if (!gk20a_channel_get(ch)) + continue; + + if ((u32)(gk20a_mem_phys(&ch->inst_block) >> + ram_in_base_shift_v()) == gr_fecs_current_ctx_ptr_v(curr_ctx)) { - tsgid = f->channel[chid].tsgid; - break; - } + tsgid = ch->tsgid; + /* found it */ + ret = ch; + break; + } + gk20a_channel_put(ch); } - if (chid >= f->num_channels) { - chid = -1; + if (!ret) goto unlock; - } /* add to free tlb entry */ for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) { @@ -5205,7 +5213,7 @@ unlock: spin_unlock(&gr->ch_tlb_lock); if (curr_tsgid) *curr_tsgid = tsgid; - return chid; + return ret; } int gk20a_gr_lock_down_sm(struct gk20a *g, @@ -5399,6 +5407,7 @@ int gk20a_gr_isr(struct gk20a *g) u32 obj_table; int need_reset = 0; u32 gr_intr = gk20a_readl(g, gr_intr_r()); + struct channel_gk20a *ch = NULL; gk20a_dbg_fn(""); gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr); @@ -5424,13 +5433,13 @@ int gk20a_gr_isr(struct gk20a *g) gr_fe_object_table_r(isr_data.sub_chan)) : 0; isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table); - isr_data.chid = - gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx, NULL); - if (isr_data.chid == -1) { + ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, NULL); + if (!ch) { gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x", isr_data.curr_ctx); goto clean_up; } + isr_data.chid = ch->hw_chid; gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "channel %d: addr 0x%08x, " @@ -5512,8 +5521,6 @@ int gk20a_gr_isr(struct gk20a *g) if 
(gr_intr & gr_intr_exception_pending_f()) { u32 exception = gk20a_readl(g, gr_exception_r()); - struct fifo_gk20a *f = &g->fifo; - struct channel_gk20a *ch = &f->channel[isr_data.chid]; gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception); @@ -5572,9 +5579,20 @@ int gk20a_gr_isr(struct gk20a *g) } if (need_reset) - gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true); + gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), + ~(u32)0, false, true); clean_up: + if (gr_intr && !ch) { + /* Clear interrupts for unused channel. This is + probably an interrupt during gk20a_free_channel() */ + gk20a_err(dev_from_gk20a(g), + "unhandled gr interrupt 0x%08x for unreferenceable channel, clearing", + gr_intr); + gk20a_writel(g, gr_intr_r(), gr_intr); + gr_intr = 0; + } + gk20a_writel(g, gr_gpfifo_ctl_r(), grfifo_ctl | gr_gpfifo_ctl_access_f(1) | gr_gpfifo_ctl_semaphore_access_f(1)); @@ -5583,6 +5601,9 @@ clean_up: gk20a_err(dev_from_gk20a(g), "unhandled gr interrupt 0x%08x", gr_intr); + if (ch) + gk20a_channel_put(ch); + return 0; } @@ -6670,28 +6691,34 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) { - int curr_gr_chid, curr_gr_ctx, curr_gr_tsgid; + int curr_gr_ctx, curr_gr_tsgid; struct gk20a *g = ch->g; + struct channel_gk20a *curr_ch; + bool ret = false; curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); - curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx, - &curr_gr_tsgid); + curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx, + &curr_gr_tsgid); gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, - "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" - " ch->hw_chid=%d", curr_gr_chid, - curr_gr_tsgid, ch->tsgid, ch->hw_chid); - - if (curr_gr_chid == -1) + "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" + " ch->hw_chid=%d", + curr_ch ? 
curr_ch->hw_chid : -1, + curr_gr_tsgid, + ch->tsgid, + ch->hw_chid); + + if (!curr_ch) return false; - if (ch->hw_chid == curr_gr_chid) - return true; + if (ch->hw_chid == curr_ch->hw_chid) + ret = true; if (gk20a_is_channel_marked_as_tsg(ch) && (ch->tsgid == curr_gr_tsgid)) - return true; + ret = true; - return false; + gk20a_channel_put(curr_ch); + return ret; } int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, diff --git a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c index 06b00a25..0a773d10 100644 --- a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c @@ -40,6 +40,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g) /* flush previous write */ gk20a_readl(g, mc_intr_en_0_r()); + atomic_inc(&g->hw_irq_stall_count); + trace_mc_gk20a_intr_stall_done(g->dev->name); return IRQ_WAKE_THREAD; @@ -63,18 +65,22 @@ irqreturn_t mc_gk20a_isr_nonstall(struct gk20a *g) /* flush previous write */ gk20a_readl(g, mc_intr_en_1_r()); + atomic_inc(&g->hw_irq_nonstall_count); + return IRQ_WAKE_THREAD; } irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) { u32 mc_intr_0; + int hw_irq_count; gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); trace_mc_gk20a_intr_thread_stall(g->dev->name); mc_intr_0 = gk20a_readl(g, mc_intr_0_r()); + hw_irq_count = atomic_read(&g->hw_irq_stall_count); gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0); @@ -94,12 +100,17 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) if (mc_intr_0 & mc_intr_0_pbus_pending_f()) gk20a_pbus_isr(g); + /* sync handled irq counter before re-enabling interrupts */ + atomic_set(&g->sw_irq_stall_last_handled, hw_irq_count); + gk20a_writel(g, mc_intr_en_0_r(), mc_intr_en_0_inta_hardware_f()); /* flush previous write */ gk20a_readl(g, mc_intr_en_0_r()); + wake_up_all(&g->sw_irq_stall_last_handled_wq); + trace_mc_gk20a_intr_thread_stall_done(g->dev->name); return IRQ_HANDLED; @@ -108,10 +119,12 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) { u32 mc_intr_1; + int hw_irq_count; gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); mc_intr_1 = gk20a_readl(g, mc_intr_1_r()); + hw_irq_count = atomic_read(&g->hw_irq_nonstall_count); gk20a_dbg(gpu_dbg_intr, "non-stall intr %08x\n", mc_intr_1); @@ -125,12 +138,17 @@ irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) && g->ops.ce2.isr_nonstall) g->ops.ce2.isr_nonstall(g); + /* sync handled irq counter before re-enabling interrupts */ + atomic_set(&g->sw_irq_nonstall_last_handled, hw_irq_count); + gk20a_writel(g, mc_intr_en_1_r(), mc_intr_en_1_inta_hardware_f()); /* flush previous write */ gk20a_readl(g, mc_intr_en_1_r()); + wake_up_all(&g->sw_irq_stall_last_handled_wq); + return IRQ_HANDLED; } diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c index 68a31eca..23ff8677 100644 --- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c @@ -283,6 +283,9 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) init_runlist(g, f); + INIT_LIST_HEAD(&f->free_chs); + mutex_init(&f->free_chs_mutex); + for (chid = 0; chid < f->num_channels; chid++) { f->channel[chid].userd_cpu_va = f->userd.cpu_va + chid * f->userd_entry_size; @@ -294,7 +297,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) gk20a_init_channel_support(g, chid); } - mutex_init(&f->ch_inuse_mutex); f->deferred_reset_pending = false; mutex_init(&f->deferred_reset_mutex); diff --git a/include/trace/events/gk20a.h b/include/trace/events/gk20a.h 
index ad738f43..461ff6e8 100644
--- a/include/trace/events/gk20a.h
+++ b/include/trace/events/gk20a.h
@@ -140,12 +140,54 @@ DEFINE_EVENT(gk20a, gk20a_mm_g_elpg_flush_locked_done,
 	TP_ARGS(name)
 );
 
-TRACE_EVENT(gk20a_channel_update,
-	TP_PROTO(const void *channel),
+DECLARE_EVENT_CLASS(gk20a_channel,
+	TP_PROTO(int channel),
 	TP_ARGS(channel),
-	TP_STRUCT__entry(__field(const void *, channel)),
+	TP_STRUCT__entry(__field(int, channel)),
 	TP_fast_assign(__entry->channel = channel;),
-	TP_printk("channel=%p", __entry->channel)
+	TP_printk("ch id %d", __entry->channel)
+);
+DEFINE_EVENT(gk20a_channel, gk20a_channel_update,
+	TP_PROTO(int channel),
+	TP_ARGS(channel)
+);
+DEFINE_EVENT(gk20a_channel, gk20a_free_channel,
+	TP_PROTO(int channel),
+	TP_ARGS(channel)
+);
+DEFINE_EVENT(gk20a_channel, gk20a_open_new_channel,
+	TP_PROTO(int channel),
+	TP_ARGS(channel)
+);
+DEFINE_EVENT(gk20a_channel, gk20a_release_used_channel,
+	TP_PROTO(int channel),
+	TP_ARGS(channel)
+);
+
+DECLARE_EVENT_CLASS(gk20a_channel_getput,
+	TP_PROTO(int channel, const char *caller),
+	TP_ARGS(channel, caller),
+	TP_STRUCT__entry(
+		__field(int, channel)
+		__field(const char *, caller)
+	),
+	TP_fast_assign(
+		__entry->channel = channel;
+		__entry->caller = caller;
+	),
+	TP_printk("channel %d caller %s", __entry->channel, __entry->caller)
+);
+DEFINE_EVENT(gk20a_channel_getput, gk20a_channel_get,
+	TP_PROTO(int channel, const char *caller),
+	TP_ARGS(channel, caller)
+);
+DEFINE_EVENT(gk20a_channel_getput, gk20a_channel_put,
+	TP_PROTO(int channel, const char *caller),
+	TP_ARGS(channel, caller)
+);
+DEFINE_EVENT(gk20a_channel_getput, gk20a_channel_put_nofree,
+	TP_PROTO(int channel, const char *caller),
+	TP_ARGS(channel, caller)
 );
 
 TRACE_EVENT(gk20a_push_cmdbuf,
--
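
The core of this change is the get/put discipline around struct channel_gk20a:
_gk20a_channel_get()/_gk20a_channel_put(), the referenceable flag, and the wait
for the count to drain in gk20a_free_channel(). The userspace sketch below is
only an illustrative model of that life cycle, not driver code: struct chan,
chan_open(), chan_get(), chan_put() and chan_close() are names invented for the
example, a pthread mutex and condition variable stand in for the kernel
spinlock and wait queue, and the driver's two-stage wait (drain to the initial
reference, wait for deferred interrupts, then drain to zero) is collapsed into
a single wait.

/*
 * Illustrative userspace model of the per-channel refcounting scheme
 * (NOT driver code).  Build with:  cc -pthread refcount_sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct chan {
	pthread_mutex_t lock;        /* protects referenceable + ref_count */
	pthread_cond_t ref_dropped;  /* signalled on every put */
	bool referenceable;          /* like ch->referenceable */
	int ref_count;               /* like ch->ref_count */
};

static void chan_open(struct chan *ch)
{
	pthread_mutex_init(&ch->lock, NULL);
	pthread_cond_init(&ch->ref_dropped, NULL);
	ch->referenceable = true;
	ch->ref_count = 1;           /* initial ref, dropped in chan_close() */
}

/* Try to take a reference; NULL means the channel is closed or closing. */
static struct chan *chan_get(struct chan *ch)
{
	struct chan *ret = NULL;

	pthread_mutex_lock(&ch->lock);
	if (ch->referenceable) {
		ch->ref_count++;
		ret = ch;
	}
	pthread_mutex_unlock(&ch->lock);
	return ret;
}

static void chan_put(struct chan *ch)
{
	pthread_mutex_lock(&ch->lock);
	ch->ref_count--;
	pthread_cond_broadcast(&ch->ref_dropped);
	pthread_mutex_unlock(&ch->lock);
}

/* Deny new references, drop the initial one, wait until nobody uses ch. */
static void chan_close(struct chan *ch)
{
	pthread_mutex_lock(&ch->lock);
	ch->referenceable = false;
	ch->ref_count--;             /* the initial reference from chan_open() */
	while (ch->ref_count > 0)
		pthread_cond_wait(&ch->ref_dropped, &ch->lock);
	pthread_mutex_unlock(&ch->lock);
	/* now it is safe to recycle ch, e.g. push it onto a free list */
}

int main(void)
{
	struct chan ch;

	chan_open(&ch);
	if (chan_get(&ch)) {         /* a user takes and drops a reference */
		printf("got channel, refs=%d\n", ch.ref_count);
		chan_put(&ch);
	}
	chan_close(&ch);
	printf("gets fail after close: %s\n", chan_get(&ch) ? "no" : "yes");
	return 0;
}

As in the patch, a failed get tells the caller the channel is dead or being
closed and must not be touched; the initial reference is owned by the opener
and is only dropped on close.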
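
The patch also replaces the per-channel in_use flag with a free list
(f->free_chs protected by free_chs_mutex), so allocate_channel() becomes a list
pop and free_channel() a list push. A minimal standalone sketch of that
bookkeeping follows; chan_pool, pool_alloc() and pool_free() are invented
names, and a pthread mutex stands in for the kernel mutex.

/* Illustrative free-list channel tracking (NOT driver code). */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct chan {
	int hw_chid;
	struct chan *next_free;      /* link on the free list, like free_chs */
};

struct chan_pool {
	pthread_mutex_t lock;        /* like f->free_chs_mutex */
	struct chan *free_head;      /* like f->free_chs */
};

/* Pop a channel off the free list; NULL means "out of hw chids". */
static struct chan *pool_alloc(struct chan_pool *p)
{
	struct chan *ch;

	pthread_mutex_lock(&p->lock);
	ch = p->free_head;
	if (ch)
		p->free_head = ch->next_free;
	pthread_mutex_unlock(&p->lock);
	return ch;
}

/* Return a fully quiesced channel; adding at the head mirrors the patch's
 * "add to head to increase visibility of timing-related bugs". */
static void pool_free(struct chan_pool *p, struct chan *ch)
{
	pthread_mutex_lock(&p->lock);
	ch->next_free = p->free_head;
	p->free_head = ch;
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct chan channels[4];
	struct chan_pool pool;
	struct chan *ch;
	int i;

	pool.free_head = NULL;
	pthread_mutex_init(&pool.lock, NULL);

	for (i = 0; i < 4; i++) {    /* like gk20a_init_channel_support() */
		channels[i].hw_chid = i;
		pool_free(&pool, &channels[i]);
	}

	ch = pool_alloc(&pool);
	printf("allocated chid %d\n", ch ? ch->hw_chid : -1);
	if (ch)
		pool_free(&pool, ch);
	return 0;
}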
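
Finally, gk20a_wait_for_deferred_interrupts() snapshots the hardware interrupt
counters and waits until the software "last handled" counters catch up,
comparing them with cyclic_delta() so that counter wraparound does not break
the test. The standalone fragment below demonstrates only that comparison; the
counter values and the caught_up() helper are made up, and it performs the
subtraction on unsigned values to keep the wraparound well defined (the
in-kernel helper subtracts plain ints).

/* Wraparound-tolerant counter comparison demo (NOT driver code). */
#include <stdio.h>

/* Delta of cyclic counters a and b; positive if a is ahead of b. */
static int cyclic_delta(unsigned int a, unsigned int b)
{
	return (int)(a - b);  /* unsigned subtraction wraps, then reinterpret */
}

/* The wait condition: software handling has caught up with the snapshot. */
static int caught_up(unsigned int hw_snapshot, unsigned int sw_handled)
{
	return cyclic_delta(hw_snapshot, sw_handled) <= 0;
}

int main(void)
{
	unsigned int hw_snapshot = 0xfffffffeu;  /* counter close to wrapping */
	unsigned int sw_handled  = 0xfffffffcu;  /* two interrupts behind */

	printf("caught up: %d\n", caught_up(hw_snapshot, sw_handled)); /* 0 */

	sw_handled += 5;  /* handles five more interrupts, wrapping past 0 */
	printf("caught up: %d\n", caught_up(hw_snapshot, sw_handled)); /* 1 */
	return 0;
}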