Diffstat (limited to 'drivers')

 drivers/gpu/nvgpu/gk20a/cde_gk20a.c          |   4
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c      | 302
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h      |  32
 drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c |  58
 drivers/gpu/nvgpu/gk20a/debug_gk20a.c        |  34
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c         | 247
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h         |  15
 drivers/gpu/nvgpu/gk20a/gk20a.c              |   3
 drivers/gpu/nvgpu/gk20a/gk20a.h              |   9
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c           |  93
 drivers/gpu/nvgpu/gk20a/mc_gk20a.c           |  18
 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c           |   4
 12 files changed, 635 insertions(+), 184 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 4a3076b5..b4fdfb44 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * Color decompression engine support
  *
- * Copyright (c) 2014, NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -74,7 +74,7 @@ __must_hold(&cde_app->mutex)
 	trace_gk20a_cde_remove_ctx(cde_ctx);
 
 	/* free the channel */
-	gk20a_free_channel(cde_ctx->ch, true);
+	gk20a_channel_close(ch);
 
 	/* ..then release mapped memory */
 	gk20a_deinit_cde_img(cde_ctx);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index c12f196d..5a71e874 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -42,8 +42,8 @@
 
 #define NVMAP_HANDLE_PARAM_SIZE 1
 
-static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f);
-static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
+static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f);
+static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
 
 static void free_priv_cmdbuf(struct channel_gk20a *c,
 			     struct priv_cmd_entry *e);
@@ -61,29 +61,33 @@ static int channel_gk20a_update_runlist(struct channel_gk20a *c,
 					bool add);
 static void gk20a_free_error_notifiers(struct channel_gk20a *ch);
 
-static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f)
+/* allocate GPU channel */
+static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
 {
 	struct channel_gk20a *ch = NULL;
-	int chid;
 
-	mutex_lock(&f->ch_inuse_mutex);
-	for (chid = 0; chid < f->num_channels; chid++) {
-		if (!f->channel[chid].in_use) {
-			f->channel[chid].in_use = true;
-			ch = &f->channel[chid];
-			break;
-		}
-	}
-	mutex_unlock(&f->ch_inuse_mutex);
+	mutex_lock(&f->free_chs_mutex);
+	if (!list_empty(&f->free_chs)) {
+		ch = list_first_entry(&f->free_chs, struct channel_gk20a,
+				      free_chs);
+		list_del(&ch->free_chs);
+		WARN_ON(atomic_read(&ch->ref_count));
+		WARN_ON(ch->referenceable);
+	}
+	mutex_unlock(&f->free_chs_mutex);
 
 	return ch;
 }
 
-static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c)
+static void free_channel(struct fifo_gk20a *f,
+			 struct channel_gk20a *ch)
 {
-	mutex_lock(&f->ch_inuse_mutex);
-	f->channel[c->hw_chid].in_use = false;
-	mutex_unlock(&f->ch_inuse_mutex);
+	trace_gk20a_release_used_channel(ch->hw_chid);
+	/* refcount is zero here and channel is in a freed/dead state */
+	mutex_lock(&f->free_chs_mutex);
+	/* add to head to increase visibility of timing-related bugs */
+	list_add(&ch->free_chs, &f->free_chs);
+	mutex_unlock(&f->free_chs_mutex);
 }
 
 int channel_gk20a_commit_va(struct channel_gk20a *c)
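[Note] The free list above recycles at the head on purpose: the most recently freed channel is handed out again first, so any stale, non-refcounted pointer to it misbehaves quickly instead of lying dormant. This is a minimal, standalone sketch of that LIFO free-list idea, not code from the patch; the "item" type and function names are hypothetical.

	#include <linux/list.h>
	#include <linux/mutex.h>

	struct item { struct list_head link; };

	static LIST_HEAD(free_items);
	static DEFINE_MUTEX(free_items_mutex);

	static struct item *item_alloc(void)
	{
		struct item *it = NULL;

		mutex_lock(&free_items_mutex);
		if (!list_empty(&free_items)) {
			it = list_first_entry(&free_items, struct item, link);
			list_del(&it->link);
		}
		mutex_unlock(&free_items_mutex);
		return it;
	}

	static void item_free(struct item *it)
	{
		mutex_lock(&free_items_mutex);
		/* head, not tail: LIFO reuse surfaces timing bugs sooner */
		list_add(&it->link, &free_items);
		mutex_unlock(&free_items_mutex);
	}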
@@ -361,6 +365,11 @@ void gk20a_channel_abort(struct channel_gk20a *ch)
 	struct channel_gk20a_job *job, *n;
 	bool released_job_semaphore = false;
 
+	gk20a_dbg_fn("");
+
+	/* make sure new kickoffs are prevented */
+	ch->has_timedout = true;
+
 	/* ensure no fences are pending */
 	mutex_lock(&ch->submit_lock);
 	if (ch->sync)
@@ -416,6 +425,8 @@ void gk20a_disable_channel(struct channel_gk20a *ch,
 			   bool finish,
 			   unsigned long finish_timeout)
 {
+	gk20a_dbg_fn("");
+
 	if (finish) {
 		int err = gk20a_channel_finish(ch, finish_timeout);
 		WARN_ON(err);
@@ -627,8 +638,9 @@ void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error)
 			(u32)(nsec >> 32);
 		ch->error_notifier->info32 = error;
 		ch->error_notifier->status = 0xffff;
+
 		gk20a_err(dev_from_gk20a(ch->g),
-			"error notifier set to %d for ch %d\n", error, ch->hw_chid);
+			"error notifier set to %d for ch %d", error, ch->hw_chid);
 	}
 }
 
@@ -643,7 +655,53 @@ static void gk20a_free_error_notifiers(struct channel_gk20a *ch)
 	}
 }
 
-void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
+/* Returns delta of cyclic integers a and b. If a is ahead of b, delta
+ * is positive */
+static int cyclic_delta(int a, int b)
+{
+	return a - b;
+}
+
+static void gk20a_wait_for_deferred_interrupts(struct gk20a *g)
+{
+	int stall_irq_threshold = atomic_read(&g->hw_irq_stall_count);
+	int nonstall_irq_threshold = atomic_read(&g->hw_irq_nonstall_count);
+
+	/* wait until all stalling irqs are handled */
+	wait_event(g->sw_irq_stall_last_handled_wq,
+		   cyclic_delta(stall_irq_threshold,
+				atomic_read(&g->sw_irq_stall_last_handled))
+		   <= 0);
+
+	/* wait until all non-stalling irqs are handled */
+	wait_event(g->sw_irq_nonstall_last_handled_wq,
+		   cyclic_delta(nonstall_irq_threshold,
+				atomic_read(&g->sw_irq_nonstall_last_handled))
+		   <= 0);
+}
+
+static void gk20a_wait_until_counter_is_N(
+	struct channel_gk20a *ch, atomic_t *counter, int wait_value,
+	wait_queue_head_t *wq, const char *caller, const char *counter_name)
+{
+	while (true) {
+		if (wait_event_timeout(
+			    *wq,
+			    atomic_read(counter) == wait_value,
+			    msecs_to_jiffies(5000)) > 0)
+			break;
+
+		gk20a_warn(dev_from_gk20a(ch->g),
+			   "%s: channel %d, still waiting, %s left: %d, waiting for: %d",
+			   caller, ch->hw_chid, counter_name,
+			   atomic_read(counter), wait_value);
+	}
+}
+
+
+
+/* call ONLY when no references to the channel exist: after the last put */
+static void gk20a_free_channel(struct channel_gk20a *ch)
 {
 	struct gk20a *g = ch->g;
 	struct fifo_gk20a *f = &g->fifo;
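[Note] cyclic_delta() compares interrupt counters that increment forever and eventually wrap; comparing the *difference* rather than the raw values keeps the test correct across the wrap point (the kernel is built with wrapping integer semantics). Below is a standalone illustration of the same idea, not part of the patch; it uses explicit unsigned arithmetic so the wraparound is well defined in portable C.

	#include <limits.h>
	#include <stdio.h>

	/* portable variant of cyclic_delta(): wraps via unsigned subtraction */
	static int cyclic_delta(int a, int b)
	{
		return (int)((unsigned int)a - (unsigned int)b);
	}

	int main(void)
	{
		int snapshot = INT_MAX;                            /* about to wrap */
		int counter = (int)((unsigned int)snapshot + 2u);  /* wrapped: negative */

		/* counter is numerically smaller than snapshot, yet "ahead" by 2 */
		printf("%d\n", cyclic_delta(counter, snapshot));   /* prints 2 */
		return 0;
	}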
@@ -654,13 +712,50 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
 
 	gk20a_dbg_fn("");
 
+	WARN_ON(ch->g == NULL);
+
+	trace_gk20a_free_channel(ch->hw_chid);
+
+	/* prevent new kickoffs */
+	ch->has_timedout = true;
+	wmb();
+
+	/* wait until there's only our ref to the channel */
+	gk20a_wait_until_counter_is_N(
+		ch, &ch->ref_count, 1, &ch->ref_count_dec_wq,
+		__func__, "references");
+
+	/* wait until all pending interrupts for recently completed
+	 * jobs are handled */
+	gk20a_wait_for_deferred_interrupts(g);
+
+	/* prevent new refs */
+	spin_lock(&ch->ref_obtain_lock);
+	if (!ch->referenceable) {
+		spin_unlock(&ch->ref_obtain_lock);
+		gk20a_err(dev_from_gk20a(ch->g),
+			  "Extra %s() called to channel %u",
+			  __func__, ch->hw_chid);
+		return;
+	}
+	ch->referenceable = false;
+	spin_unlock(&ch->ref_obtain_lock);
+
+	/* matches with the initial reference in gk20a_open_new_channel() */
+	atomic_dec(&ch->ref_count);
+
+	/* wait until no more refs to the channel */
+	gk20a_wait_until_counter_is_N(
+		ch, &ch->ref_count, 0, &ch->ref_count_dec_wq,
+		__func__, "references");
+
 	/* if engine reset was deferred, perform it now */
 	mutex_lock(&f->deferred_reset_mutex);
 	if (g->fifo.deferred_reset_pending) {
 		gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
 			  " deferred, running now");
-		gk20a_fifo_reset_engine(g, g->fifo.mmu_fault_engines);
-		g->fifo.mmu_fault_engines = 0;
+		gk20a_fifo_reset_engine(g, g->fifo.deferred_fault_engines);
+		g->fifo.deferred_fault_engines = 0;
 		g->fifo.deferred_reset_pending = false;
 	}
 	mutex_unlock(&f->deferred_reset_mutex);
@@ -674,7 +769,7 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
 	gk20a_dbg_info("freeing bound channel context, timeout=%ld",
 			timeout);
 
-	gk20a_disable_channel(ch, finish && !ch->has_timedout, timeout);
+	gk20a_disable_channel(ch, !ch->has_timedout, timeout);
 
 	gk20a_free_error_notifiers(ch);
 
@@ -714,6 +809,10 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
 	spin_unlock(&ch->update_fn_lock);
 	cancel_work_sync(&ch->update_fn_work);
 
+	/* make sure we don't have deferred interrupts pending that
+	 * could still touch the channel */
+	gk20a_wait_for_deferred_interrupts(g);
+
 unbind:
 	if (gk20a_is_channel_marked_as_tsg(ch))
 		gk20a_tsg_unbind_channel(ch);
@@ -743,8 +842,66 @@ unbind:
 	mutex_unlock(&ch->dbg_s_lock);
 
 release:
+	/* make sure we catch accesses of unopened channels in case
+	 * there's non-refcounted channel pointers hanging around */
+	ch->g = NULL;
+	wmb();
+
 	/* ALWAYS last */
-	release_used_channel(f, ch);
+	free_channel(f, ch);
+}
+
+/* Try to get a reference to the channel. Return nonzero on success. If fails,
+ * the channel is dead or being freed elsewhere and you must not touch it.
+ *
+ * Always when a channel_gk20a pointer is seen and about to be used, a
+ * reference must be held to it - either by you or the caller, which should be
+ * documented well or otherwise clearly seen. This usually boils down to the
+ * file from ioctls directly, or an explicit get in exception handlers when the
+ * channel is found by a hw_chid.
+ *
+ * Most global functions in this file require a reference to be held by the
+ * caller.
+ */
+struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch,
+					 const char *caller) {
+	struct channel_gk20a *ret;
+
+	spin_lock(&ch->ref_obtain_lock);
+
+	if (likely(ch->referenceable)) {
+		atomic_inc(&ch->ref_count);
+		ret = ch;
+	} else
+		ret = NULL;
+
+	spin_unlock(&ch->ref_obtain_lock);
+
+	if (ret)
+		trace_gk20a_channel_get(ch->hw_chid, caller);
+
+	return ret;
+}
+
+void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller)
+{
+	trace_gk20a_channel_put(ch->hw_chid, caller);
+	atomic_dec(&ch->ref_count);
+	wake_up_all(&ch->ref_count_dec_wq);
+
+	/* More puts than gets. Channel is probably going to get
+	 * stuck. */
+	WARN_ON(atomic_read(&ch->ref_count) < 0);
+
+	/* Also, more puts than gets. ref_count can go to 0 only if
+	 * the channel is closing. Channel is probably going to get
+	 * stuck. */
+	WARN_ON(atomic_read(&ch->ref_count) == 0 && ch->referenceable);
+}
+
+void gk20a_channel_close(struct channel_gk20a *ch)
+{
+	gk20a_free_channel(ch);
 }
 
 int gk20a_channel_release(struct inode *inode, struct file *filp)
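[Note] The comment block above states the contract the rest of the patch applies everywhere: get before touching, put when done. A minimal sketch of a typical caller follows; it assumes the driver headers for channel_gk20a and the get/put macros, and example_poke_channel() itself is a hypothetical name, not code from the patch.

	static void example_poke_channel(struct gk20a *g, int hw_chid)
	{
		struct channel_gk20a *ch =
			gk20a_channel_get(&g->fifo.channel[hw_chid]);

		if (!ch)
			return; /* dead or being freed; must not be touched */

		gk20a_channel_event(ch); /* safe: we hold a reference */

		gk20a_channel_put(ch);
	}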
@@ -758,14 +915,14 @@ int gk20a_channel_release(struct inode *inode, struct file *filp)
 
 	trace_gk20a_channel_release(dev_name(&g->dev->dev));
 
-	err = gk20a_busy(ch->g->dev);
+	err = gk20a_busy(g->dev);
 	if (err) {
 		gk20a_err(dev_from_gk20a(g), "failed to release channel %d",
 			ch->hw_chid);
 		return err;
 	}
-	gk20a_free_channel(ch, true);
-	gk20a_idle(ch->g->dev);
+	gk20a_channel_close(ch);
+	gk20a_idle(g->dev);
 
 	filp->private_data = NULL;
 	return 0;
@@ -808,22 +965,31 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
 	struct fifo_gk20a *f = &g->fifo;
 	struct channel_gk20a *ch;
 
-	ch = acquire_unused_channel(f);
+	gk20a_dbg_fn("");
+
+	ch = allocate_channel(f);
 	if (ch == NULL) {
 		/* TBD: we want to make this virtualizable */
 		gk20a_err(dev_from_gk20a(g), "out of hw chids");
 		return NULL;
 	}
 
+	trace_gk20a_open_new_channel(ch->hw_chid);
+
+	BUG_ON(ch->g);
 	ch->g = g;
 
 	if (g->ops.fifo.alloc_inst(g, ch)) {
-		ch->in_use = false;
+		ch->g = NULL;
+		free_channel(f, ch);
 		gk20a_err(dev_from_gk20a(g),
 			"failed to open gk20a channel, out of inst mem");
-
 		return NULL;
 	}
+
+	/* now the channel is in a limbo out of the free list but not marked as
+	 * alive and used (i.e. get-able) yet */
+
 	ch->pid = current->pid;
 
 	/* By default, channel is regular (non-TSG) channel */
@@ -854,6 +1020,13 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
 	spin_lock_init(&ch->update_fn_lock);
 	INIT_WORK(&ch->update_fn_work, gk20a_channel_update_runcb_fn);
 
+	/* Mark the channel alive, get-able, with 1 initial use
+	 * references. The initial reference will be decreased in
+	 * gk20a_free_channel() */
+	ch->referenceable = true;
+	atomic_set(&ch->ref_count, 1);
+	wmb();
+
 	return ch;
 }
 
@@ -1379,7 +1552,7 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 	struct mapped_buffer_node **mapped_buffers = NULL;
 	int err = 0, num_mapped_buffers;
 
-	/* job needs reference to this vm */
+	/* job needs reference to this vm (released in channel_update) */
 	gk20a_vm_get(vm);
 
 	err = gk20a_vm_get_buffers(vm, &mapped_buffers, &num_mapped_buffers);
@@ -1395,14 +1568,21 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 		return -ENOMEM;
 	}
 
-	job->num_mapped_buffers = num_mapped_buffers;
-	job->mapped_buffers = mapped_buffers;
-	job->pre_fence = gk20a_fence_get(pre_fence);
-	job->post_fence = gk20a_fence_get(post_fence);
+	/* put() is done in gk20a_channel_update() when the job is done */
+	c = gk20a_channel_get(c);
 
-	mutex_lock(&c->jobs_lock);
-	list_add_tail(&job->list, &c->jobs);
-	mutex_unlock(&c->jobs_lock);
+	if (c) {
+		job->num_mapped_buffers = num_mapped_buffers;
+		job->mapped_buffers = mapped_buffers;
+		job->pre_fence = gk20a_fence_get(pre_fence);
+		job->post_fence = gk20a_fence_get(post_fence);
+
+		mutex_lock(&c->jobs_lock);
+		list_add_tail(&job->list, &c->jobs);
+		mutex_unlock(&c->jobs_lock);
+	} else {
+		return -ETIMEDOUT;
+	}
 
 	return 0;
 }
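[Note] The pattern here is one channel reference per in-flight job: taken in add_job, dropped in gk20a_channel_update() once the post-fence expires, so the channel cannot reach its free path while completions are still outstanding. A condensed sketch of the pairing, assuming the driver types; submit_job()/complete_job() are hypothetical wrappers, not functions from the patch.

	static int submit_job(struct channel_gk20a *c, struct channel_gk20a_job *job)
	{
		c = gk20a_channel_get(c);	/* one ref per in-flight job */
		if (!c)
			return -ETIMEDOUT;	/* channel already dying */

		mutex_lock(&c->jobs_lock);
		list_add_tail(&job->list, &c->jobs);
		mutex_unlock(&c->jobs_lock);
		return 0;
	}

	static void complete_job(struct channel_gk20a *c, struct channel_gk20a_job *job)
	{
		list_del_init(&job->list);
		kfree(job);
		gk20a_channel_put(c);		/* balances the get in submit_job() */
	}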
@@ -1412,13 +1592,15 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 	struct vm_gk20a *vm = c->vm;
 	struct channel_gk20a_job *job, *n;
 
-	trace_gk20a_channel_update(c);
+	trace_gk20a_channel_update(c->hw_chid);
 
 	wake_up(&c->submit_wq);
 
 	mutex_lock(&c->submit_lock);
 	mutex_lock(&c->jobs_lock);
 	list_for_each_entry_safe(job, n, &c->jobs, list) {
+		struct gk20a *g = c->g;
+
 		bool completed = gk20a_fence_is_expired(job->post_fence);
 		if (!completed)
 			break;
@@ -1434,12 +1616,15 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 		gk20a_fence_put(job->pre_fence);
 		gk20a_fence_put(job->post_fence);
 
-		/* job is done. release its reference to vm */
+		/* job is done. release its vm reference (taken in add_job) */
 		gk20a_vm_put(vm);
+		/* another bookkeeping taken in add_job. caller must hold a ref
+		 * so this wouldn't get freed here. */
+		gk20a_channel_put(c);
 
 		list_del_init(&job->list);
 		kfree(job);
-		gk20a_idle(c->g->dev);
+		gk20a_idle(g->dev);
 	}
 
 	/*
@@ -1719,10 +1904,13 @@ clean_up:
 int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 {
 	struct channel_gk20a *c = g->fifo.channel+chid;
-	c->g = g;
-	c->in_use = false;
+	c->g = NULL;
 	c->hw_chid = chid;
 	c->bound = false;
+	spin_lock_init(&c->ref_obtain_lock);
+	atomic_set(&c->ref_count, 0);
+	c->referenceable = false;
+	init_waitqueue_head(&c->ref_count_dec_wq);
 	mutex_init(&c->ioctl_lock);
 	mutex_init(&c->jobs_lock);
 	mutex_init(&c->submit_lock);
@@ -1733,6 +1921,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 #endif
 	INIT_LIST_HEAD(&c->dbg_s_list);
 	mutex_init(&c->dbg_s_lock);
+	list_add(&c->free_chs, &g->fifo.free_chs);
 
 	return 0;
 }
@@ -2066,8 +2255,7 @@ int gk20a_channel_suspend(struct gk20a *g)
 
 	for (chid = 0; chid < f->num_channels; chid++) {
 		struct channel_gk20a *ch = &f->channel[chid];
-		if (ch->in_use) {
-
+		if (gk20a_channel_get(ch)) {
 			gk20a_dbg_info("suspend channel %d", chid);
 			/* disable channel */
 			g->ops.fifo.disable_channel(ch);
@@ -2079,6 +2267,8 @@ int gk20a_channel_suspend(struct gk20a *g)
 			flush_work(&ch->update_fn_work);
 
 			channels_in_use = true;
+
+			gk20a_channel_put(ch);
 		}
 	}
 
@@ -2086,8 +2276,10 @@ int gk20a_channel_suspend(struct gk20a *g)
 		g->ops.fifo.update_runlist(g, 0, ~0, false, true);
 
 		for (chid = 0; chid < f->num_channels; chid++) {
-			if (f->channel[chid].in_use)
+			if (gk20a_channel_get(&f->channel[chid])) {
 				g->ops.fifo.unbind_channel(&f->channel[chid]);
+				gk20a_channel_put(&f->channel[chid]);
+			}
 		}
 	}
 
@@ -2095,8 +2287,6 @@ int gk20a_channel_suspend(struct gk20a *g)
 	return 0;
 }
 
-/* in this context the "channel" is the host1x channel which
- * maps to *all* gk20a channels */
 int gk20a_channel_resume(struct gk20a *g)
 {
 	struct fifo_gk20a *f = &g->fifo;
@@ -2106,10 +2296,11 @@ int gk20a_channel_resume(struct gk20a *g)
 	gk20a_dbg_fn("");
 
 	for (chid = 0; chid < f->num_channels; chid++) {
-		if (f->channel[chid].in_use) {
+		if (gk20a_channel_get(&f->channel[chid])) {
 			gk20a_dbg_info("resume channel %d", chid);
 			g->ops.fifo.bind_channel(&f->channel[chid]);
 			channels_in_use = true;
+			gk20a_channel_put(&f->channel[chid]);
 		}
 	}
 
@@ -2129,10 +2320,11 @@ void gk20a_channel_semaphore_wakeup(struct gk20a *g)
 
 	for (chid = 0; chid < f->num_channels; chid++) {
 		struct channel_gk20a *c = g->fifo.channel+chid;
-		if (c->in_use) {
+		if (gk20a_channel_get(c)) {
 			gk20a_channel_event(c);
 			wake_up_interruptible_all(&c->semaphore_wq);
 			gk20a_channel_update(c, 0);
+			gk20a_channel_put(c);
 		}
 	}
 }
@@ -2225,10 +2417,18 @@ long gk20a_channel_ioctl(struct file *filp,
 		return -EFAULT;
 	}
 
+	/* take a ref or return timeout if channel refs can't be taken */
+	ch = gk20a_channel_get(ch);
+	if (!ch)
+		return -ETIMEDOUT;
+
 	/* protect our sanity for threaded userspace - most of the channel is
 	 * not thread safe */
 	mutex_lock(&ch->ioctl_lock);
 
+	/* this ioctl call keeps a ref to the file which keeps a ref to the
+	 * channel */
+
 	switch (cmd) {
 	case NVGPU_IOCTL_CHANNEL_OPEN:
 		err = gk20a_channel_open_ioctl(ch->g,
@@ -2449,9 +2649,11 @@ long gk20a_channel_ioctl(struct file *filp,
 	if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
 		err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd));
 
-	gk20a_dbg_fn("end");
-
 	mutex_unlock(&ch->ioctl_lock);
 
+	gk20a_channel_put(ch);
+
+	gk20a_dbg_fn("end");
+
 	return err;
 }
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index f022fe36..2ea5b4be 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -19,12 +19,13 @@
 #define CHANNEL_GK20A_H
 
 #include <linux/log2.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
 #include <linux/mutex.h>
-#include <uapi/linux/nvgpu.h>
 #include <linux/poll.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <uapi/linux/nvgpu.h>
 
 struct gk20a;
 struct gr_gk20a;
@@ -77,8 +78,15 @@ struct channel_gk20a_poll_events {
 
 /* this is the priv element of struct nvhost_channel */
 struct channel_gk20a {
-	struct gk20a *g;
-	bool in_use;
+	struct gk20a *g; /* set only when channel is active */
+
+	struct list_head free_chs;
+
+	spinlock_t ref_obtain_lock;
+	bool referenceable;
+	atomic_t ref_count;
+	wait_queue_head_t ref_count_dec_wq;
+
 	int hw_chid;
 	bool bound;
 	bool first_init;
@@ -171,7 +179,10 @@ static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
 }
 int channel_gk20a_commit_va(struct channel_gk20a *c);
 int gk20a_init_channel_support(struct gk20a *, u32 chid);
-void gk20a_free_channel(struct channel_gk20a *ch, bool finish);
+
+/* must be inside gk20a_busy()..gk20a_idle() */
+void gk20a_channel_close(struct channel_gk20a *ch);
+
 bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		u32 timeout_delta_ms);
 void gk20a_disable_channel(struct channel_gk20a *ch,
@@ -202,6 +213,15 @@ void gk20a_channel_event(struct channel_gk20a *ch);
 
 void gk20a_init_channel(struct gpu_ops *gops);
 
+/* returns ch if reference was obtained */
+struct channel_gk20a *__must_check _gk20a_channel_get(struct channel_gk20a *ch,
+						      const char *caller);
+#define gk20a_channel_get(ch) _gk20a_channel_get(ch, __func__)
+
+
+void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller);
+#define gk20a_channel_put(ch) _gk20a_channel_put(ch, __func__)
+
 int gk20a_wait_channel_idle(struct channel_gk20a *ch);
 struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g);
 struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
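[Note] Wrapping the real function in a macro that passes __func__ is a common kernel trick: every get/put site is attributed to its caller in the trace stream without callers having to name themselves. A standalone sketch of the same idiom follows; the "resource" names are hypothetical and only the macro technique is taken from the patch.

	#include <stdio.h>

	static void _resource_get(const char *caller)
	{
		printf("get from %s\n", caller); /* stand-in for a tracepoint */
	}
	#define resource_get() _resource_get(__func__)

	void some_driver_entry_point(void)
	{
		resource_get(); /* traces as "some_driver_entry_point" */
	}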
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 8cc852c7..7a707fbd 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -154,8 +154,23 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
 
 static void gk20a_channel_syncpt_update(void *priv, int nr_completed)
 {
-	struct channel_gk20a *ch20a = priv;
-	gk20a_channel_update(ch20a, nr_completed);
+	struct channel_gk20a *ch = priv;
+	struct gk20a *g = ch->g;
+
+	/* need busy for possible channel deletion */
+	if (gk20a_busy(ch->g->dev)) {
+		gk20a_err(dev_from_gk20a(ch->g),
+			  "failed to busy while syncpt update");
+		/* Last gk20a_idle()s are in channel_update, so we shouldn't
+		 * get here. If we do, the channel is badly broken now */
+		return;
+	}
+
+	/* note: channel_get() is in __gk20a_channel_syncpt_incr() */
+	gk20a_channel_update(ch, nr_completed);
+	gk20a_channel_put(ch);
+
+	gk20a_idle(g->dev);
 }
 
 static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
@@ -209,14 +224,37 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
 	thresh = nvhost_syncpt_incr_max_ext(sp->host1x_pdev, sp->id, 2);
 
 	if (register_irq) {
-		err = nvhost_intr_register_notifier(sp->host1x_pdev,
-				sp->id, thresh,
-				gk20a_channel_syncpt_update, c);
-
-		/* Adding interrupt action should never fail. A proper error
-		 * handling here would require us to decrement the syncpt max
-		 * back to its original value. */
-		WARN(err, "failed to set submit complete interrupt");
+		err = gk20a_busy(c->g->dev);
+		if (err)
+			gk20a_err(dev_from_gk20a(c->g),
+				"failed to add syncpt interrupt notifier for channel %d",
+				c->hw_chid);
+		else {
+			struct channel_gk20a *referenced = gk20a_channel_get(c);
+
+			WARN_ON(!referenced);
+			gk20a_idle(c->g->dev);
+
+			if (referenced) {
+				/* note: channel_put() is in
+				 * gk20a_channel_syncpt_update() */
+
+				err = nvhost_intr_register_notifier(
+					sp->host1x_pdev,
+					sp->id, thresh,
+					gk20a_channel_syncpt_update, c);
+				if (err)
+					gk20a_channel_put(referenced);
+
+				/* Adding interrupt action should
+				 * never fail. A proper error handling
+				 * here would require us to decrement
+				 * the syncpt max back to its original
+				 * value. */
+				WARN(err,
+				     "failed to set submit complete interrupt");
+			}
+		}
 	}
 
 	*fence = gk20a_fence_from_syncpt(sp->host1x_pdev, sp->id, thresh,
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
index 0f1c31dd..bda0dab0 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
@@ -36,6 +36,7 @@ static struct platform_device *gk20a_device;
 
 struct ch_state {
 	int pid;
+	int refs;
 	u8 inst_block[0];
};
 
@@ -118,9 +119,10 @@ static void gk20a_debug_show_channel(struct gk20a *g,
 	syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w());
 	syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w());
 
-	gk20a_debug_output(o, "%d-%s, pid %d: ", hw_chid,
+	gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid,
 			g->dev->name,
-			ch_state->pid);
+			ch_state->pid,
+			ch_state->refs);
 	gk20a_debug_output(o, "%s in use %s %s\n",
 			ccsr_channel_enable_v(channel) ? "" : "not",
 			ccsr_chan_status_str[status],
@@ -231,16 +233,30 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o)
 	}
 
 	for (chid = 0; chid < f->num_channels; chid++) {
-		if (f->channel[chid].in_use)
-			ch_state[chid] = kmalloc(sizeof(struct ch_state) + ram_in_alloc_size_v(), GFP_KERNEL);
+		struct channel_gk20a *ch = &f->channel[chid];
+		if (gk20a_channel_get(ch)) {
+			ch_state[chid] =
+				kmalloc(sizeof(struct ch_state) +
+					ram_in_alloc_size_v(), GFP_KERNEL);
+			/* ref taken stays to below loop with
+			 * successful allocs */
+			if (!ch_state[chid])
+				gk20a_channel_put(ch);
+		}
 	}
 
 	for (chid = 0; chid < f->num_channels; chid++) {
-		if (ch_state[chid] && f->channel[chid].inst_block.cpu_va) {
-			ch_state[chid]->pid = f->channel[chid].pid;
-			memcpy(&ch_state[chid]->inst_block[0],
-				f->channel[chid].inst_block.cpu_va,
-				ram_in_alloc_size_v());
+		struct channel_gk20a *ch = &f->channel[chid];
+		if (ch_state[chid]) {
+			if (ch->inst_block.cpu_va) {
+				ch_state[chid]->pid = ch->pid;
+				ch_state[chid]->refs =
+					atomic_read(&ch->ref_count);
+				memcpy(&ch_state[chid]->inst_block[0],
+				       ch->inst_block.cpu_va,
+				       ram_in_alloc_size_v());
+			}
+			gk20a_channel_put(ch);
 		}
 	}
 	for (chid = 0; chid < f->num_channels; chid++) {
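[Note] The dump uses a two-pass snapshot: pass 1 pins every live channel (dropping the pin immediately if the allocation fails, since there will be nothing to fill in), pass 2 copies the state and unpins. A condensed sketch of that shape, assuming the driver types; the "snap" struct and snapshot_channels() name are hypothetical.

	struct snap { int pid; int refs; };

	static void snapshot_channels(struct fifo_gk20a *f, struct snap **state)
	{
		int chid;

		for (chid = 0; chid < f->num_channels; chid++)
			if (gk20a_channel_get(&f->channel[chid])) {
				state[chid] = kmalloc(sizeof(**state), GFP_KERNEL);
				if (!state[chid])	/* nothing to fill later */
					gk20a_channel_put(&f->channel[chid]);
			}

		for (chid = 0; chid < f->num_channels; chid++)
			if (state[chid]) {
				state[chid]->pid = f->channel[chid].pid;
				state[chid]->refs =
					atomic_read(&f->channel[chid].ref_count);
				gk20a_channel_put(&f->channel[chid]);
			}
	}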
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 56b954a9..4ef310b2 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -515,6 +515,9 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
 
 	init_runlist(g, f);
 
+	INIT_LIST_HEAD(&f->free_chs);
+	mutex_init(&f->free_chs_mutex);
+
 	for (chid = 0; chid < f->num_channels; chid++) {
 		f->channel[chid].userd_cpu_va =
 			f->userd.cpu_va + chid * f->userd_entry_size;
@@ -527,7 +530,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
 		gk20a_init_channel_support(g, chid);
 		gk20a_init_tsg_support(g, chid);
 	}
-	mutex_init(&f->ch_inuse_mutex);
 	mutex_init(&f->tsg_inuse_mutex);
 
 	f->remove_support = gk20a_remove_fifo_support;
@@ -637,6 +639,7 @@ int gk20a_init_fifo_support(struct gk20a *g)
 	return err;
 }
 
+/* return with a reference to the channel, caller must put it back */
 static struct channel_gk20a *
 channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr)
 {
@@ -644,10 +647,16 @@ channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr)
 	if (unlikely(!f->channel))
 		return NULL;
 	for (ci = 0; ci < f->num_channels; ci++) {
-		struct channel_gk20a *c = f->channel+ci;
-		if (c->inst_block.cpu_va &&
-		    (inst_ptr == gk20a_mem_phys(&c->inst_block)))
-			return f->channel+ci;
+		struct channel_gk20a *ch = gk20a_channel_get(&f->channel[ci]);
+		/* only alive channels are searched */
+		if (!ch)
+			continue;
+
+		if (ch->inst_block.cpu_va &&
+		    (inst_ptr == gk20a_mem_phys(&ch->inst_block)))
+			return ch;
+
+		gk20a_channel_put(ch);
 	}
 	return NULL;
 }
@@ -803,6 +812,7 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
 	return true;
 }
 
+/* caller must hold a channel reference */
 static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
 		struct channel_gk20a *ch)
 {
@@ -854,14 +864,38 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 		"TSG %d generated a mmu fault", tsg->tsgid);
 
 	mutex_lock(&tsg->ch_list_lock);
-	list_for_each_entry(ch, &tsg->ch_list, ch_entry)
-		ret = gk20a_fifo_set_ctx_mmu_error(g, ch);
+	list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+		if (gk20a_channel_get(ch)) {
+			if (!gk20a_fifo_set_ctx_mmu_error(g, ch))
+				ret = false;
+			gk20a_channel_put(ch);
+		}
+	}
 	mutex_unlock(&tsg->ch_list_lock);
 
 	return ret;
 }
 
-static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
+static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
+{
+	struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
+	struct channel_gk20a *ch;
+
+	mutex_lock(&tsg->ch_list_lock);
+	list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+		if (gk20a_channel_get(ch)) {
+			gk20a_channel_abort(ch);
+			gk20a_channel_put(ch);
+		}
+	}
+	mutex_unlock(&tsg->ch_list_lock);
+}
+
+static bool gk20a_fifo_handle_mmu_fault(
+	struct gk20a *g,
+	u32 mmu_fault_engines, /* queried from HW if 0 */
+	u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0 */
+	bool id_is_tsg)
 {
 	bool fake_fault;
 	unsigned long fault_id;
@@ -894,10 +928,8 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
 		gr_gpfifo_ctl_semaphore_access_f(0));
 
-	/* If we have recovery in progress, MMU fault id is invalid */
-	if (g->fifo.mmu_fault_engines) {
-		fault_id = g->fifo.mmu_fault_engines;
-		g->fifo.mmu_fault_engines = 0;
+	if (mmu_fault_engines) {
+		fault_id = mmu_fault_engines;
 		fake_fault = true;
 	} else {
 		fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
@@ -914,6 +946,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		struct fifo_mmu_fault_info_gk20a f;
 		struct channel_gk20a *ch = NULL;
 		struct tsg_gk20a *tsg = NULL;
+		struct channel_gk20a *referenced_channel = 0;
 		/* read and parse engine status */
 		u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
 		u32 ctx_status = fifo_engine_status_ctx_status_v(status);
@@ -953,22 +986,34 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		/* get the channel/TSG */
 		if (fake_fault) {
 			/* use next_id if context load is failing */
-			u32 id = (ctx_status ==
-				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
-				fifo_engine_status_next_id_v(status) :
-				fifo_engine_status_id_v(status);
-			u32 type = (ctx_status ==
-				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
-				fifo_engine_status_next_id_type_v(status) :
-				fifo_engine_status_id_type_v(status);
+			u32 id, type;
+
+			if (hw_id == ~(u32)0) {
+				id = (ctx_status ==
+					fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+					fifo_engine_status_next_id_v(status) :
+					fifo_engine_status_id_v(status);
+				type = (ctx_status ==
+					fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+					fifo_engine_status_next_id_type_v(status) :
+					fifo_engine_status_id_type_v(status);
+			} else {
+				id = hw_id;
+				type = id_is_tsg ?
+					fifo_engine_status_id_type_tsgid_v() :
+					fifo_engine_status_id_type_chid_v();
+			}
 
 			if (type == fifo_engine_status_id_type_tsgid_v())
 				tsg = &g->fifo.tsg[id];
-			else if (type == fifo_engine_status_id_type_chid_v())
+			else if (type == fifo_engine_status_id_type_chid_v()) {
 				ch = &g->fifo.channel[id];
+				referenced_channel = gk20a_channel_get(ch);
+			}
 		} else {
 			/* read channel based on instruction pointer */
 			ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr);
+			referenced_channel = ch;
 		}
 
 		if (ch && gk20a_is_channel_marked_as_tsg(ch))
@@ -977,7 +1022,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		/* check if engine reset should be deferred */
 		if ((ch || tsg) && gk20a_fifo_should_defer_engine_reset(g,
 				engine_id, &f, fake_fault)) {
-			g->fifo.mmu_fault_engines = fault_id;
+			g->fifo.deferred_fault_engines = fault_id;
 
 			/* handled during channel free */
 			g->fifo.deferred_reset_pending = true;
@@ -988,19 +1033,31 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		 * syncpoints */
 
 		if (tsg) {
-			struct channel_gk20a *ch = NULL;
 			if (!g->fifo.deferred_reset_pending)
 				verbose =
 					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
-			mutex_lock(&tsg->ch_list_lock);
-			list_for_each_entry(ch, &tsg->ch_list, ch_entry)
-				gk20a_channel_abort(ch);
-			mutex_unlock(&tsg->ch_list_lock);
+
+			gk20a_fifo_abort_tsg(g, ch->tsgid);
+
+			/* put back the ref taken early above */
+			if (referenced_channel) {
+				gk20a_channel_put(ch);
+			} else {
+				gk20a_err(dev_from_gk20a(g),
+					"mmu error in freed tsg channel %d on tsgid %d",
+					ch->hw_chid, ch->tsgid);
+			}
 		} else if (ch) {
-			if (!g->fifo.deferred_reset_pending)
-				verbose =
-					gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
-			gk20a_channel_abort(ch);
+			if (referenced_channel) {
+				if (!g->fifo.deferred_reset_pending)
+					verbose = gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+				gk20a_channel_abort(ch);
+				gk20a_channel_put(ch);
+			} else {
+				gk20a_err(dev_from_gk20a(g),
+					"mmu error in freed channel %d",
+					ch->hw_chid);
+			}
 		} else if (f.inst_ptr ==
 				gk20a_mem_phys(&g->mm.bar1.inst_block)) {
 			gk20a_err(dev_from_gk20a(g), "mmu fault from bar1");
@@ -1133,46 +1190,69 @@ static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg)
 
 void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose)
 {
-	u32 engines = gk20a_fifo_engines_on_id(g, hw_chid, false);
+	u32 engines;
+
+	/* stop context switching to prevent engine assignments from
+	   changing until channel is recovered */
+	mutex_lock(&g->dbg_sessions_lock);
+	gr_gk20a_disable_ctxsw(g);
+
+	engines = gk20a_fifo_engines_on_id(g, hw_chid, false);
+
 	if (engines)
-		gk20a_fifo_recover(g, engines, verbose);
+		gk20a_fifo_recover(g, engines, hw_chid, false, verbose);
 	else {
-		struct channel_gk20a *ch =
-			g->fifo.channel + hw_chid;
+		struct channel_gk20a *ch = &g->fifo.channel[hw_chid];
 
-		gk20a_channel_abort(ch);
+		if (gk20a_channel_get(ch)) {
+			gk20a_channel_abort(ch);
 
-		if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch))
-			gk20a_debug_dump(g->dev);
+			if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch))
+				gk20a_debug_dump(g->dev);
+
+			gk20a_channel_put(ch);
+		}
 	}
+
+	gr_gk20a_enable_ctxsw(g);
+	mutex_unlock(&g->dbg_sessions_lock);
 }
 
 void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 {
-	u32 engines = gk20a_fifo_engines_on_id(g, tsgid, true);
+	u32 engines;
+
+	/* stop context switching to prevent engine assignments from
+	   changing until TSG is recovered */
+	mutex_lock(&g->dbg_sessions_lock);
+	gr_gk20a_disable_ctxsw(g);
+
+	engines = gk20a_fifo_engines_on_id(g, tsgid, true);
+
 	if (engines)
-		gk20a_fifo_recover(g, engines, verbose);
+		gk20a_fifo_recover(g, engines, tsgid, true, verbose);
 	else {
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
-		struct channel_gk20a *ch;
 
 		if (gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg))
 			gk20a_debug_dump(g->dev);
 
-		mutex_lock(&tsg->ch_list_lock);
-		list_for_each_entry(ch, &tsg->ch_list, ch_entry)
-			gk20a_channel_abort(ch);
-		mutex_unlock(&tsg->ch_list_lock);
+		gk20a_fifo_abort_tsg(g, tsgid);
 	}
+
+	gr_gk20a_enable_ctxsw(g);
+	mutex_unlock(&g->dbg_sessions_lock);
 }
 
 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
-			bool verbose)
+			u32 hw_id, bool id_is_tsg,
+			bool verbose)
 {
 	unsigned long engine_id, i;
 	unsigned long _engine_ids = __engine_ids;
 	unsigned long engine_ids = 0;
 	u32 val;
+	u32 mmu_fault_engines = 0;
 
 	if (verbose)
 		gk20a_debug_dump(g->dev);
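[Note] Both recovery paths now bracket the engine query and the teardown with a disable/enable of context switching under dbg_sessions_lock, so the engine-to-channel assignment cannot shift between "which engines run this id" and the actual recovery. A skeletal restatement of just that bracketing, assuming the driver functions named in the hunk; recover_locked() is a hypothetical name.

	void recover_locked(struct gk20a *g, u32 hw_chid, bool verbose)
	{
		u32 engines;

		mutex_lock(&g->dbg_sessions_lock);
		gr_gk20a_disable_ctxsw(g);	/* engine<->channel map is now stable */

		engines = gk20a_fifo_engines_on_id(g, hw_chid, false);
		if (engines)
			gk20a_fifo_recover(g, engines, hw_chid, false, verbose);

		gr_gk20a_enable_ctxsw(g);
		mutex_unlock(&g->dbg_sessions_lock);
	}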
@@ -1181,7 +1261,6 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 	g->ops.ltc.flush(g);
 
 	/* store faulted engines in advance */
-	g->fifo.mmu_fault_engines = 0;
 	for_each_set_bit(engine_id, &_engine_ids, 32) {
 		u32 ref_type;
 		u32 ref_id;
@@ -1196,11 +1275,10 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 			gk20a_fifo_get_faulty_id_type(g, i, &id, &type);
 			if (ref_type == type && ref_id == id) {
 				engine_ids |= BIT(i);
-				g->fifo.mmu_fault_engines |=
+				mmu_fault_engines |=
 					BIT(gk20a_engine_id_to_mmu_id(i));
 			}
 		}
-
 	}
 
 	/*
1206 | /* | 1284 | /* |
@@ -1214,7 +1292,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | |||
1214 | fifo_intr_0_sched_error_reset_f()); | 1292 | fifo_intr_0_sched_error_reset_f()); |
1215 | 1293 | ||
1216 | g->ops.fifo.trigger_mmu_fault(g, engine_ids); | 1294 | g->ops.fifo.trigger_mmu_fault(g, engine_ids); |
1217 | gk20a_fifo_handle_mmu_fault(g); | 1295 | gk20a_fifo_handle_mmu_fault(g, engine_ids, hw_id, id_is_tsg); |
1218 | 1296 | ||
1219 | val = gk20a_readl(g, fifo_intr_en_0_r()); | 1297 | val = gk20a_readl(g, fifo_intr_en_0_r()); |
1220 | val |= fifo_intr_en_0_mmu_fault_f(1) | 1298 | val |= fifo_intr_en_0_mmu_fault_f(1) |
@@ -1222,25 +1300,32 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | |||
1222 | gk20a_writel(g, fifo_intr_en_0_r(), val); | 1300 | gk20a_writel(g, fifo_intr_en_0_r(), val); |
1223 | } | 1301 | } |
1224 | 1302 | ||
1303 | /* force reset channel and tsg (if it's part of one) */ | ||
1225 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) | 1304 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) |
1226 | { | 1305 | { |
1227 | struct tsg_gk20a *tsg = NULL; | 1306 | struct tsg_gk20a *tsg = NULL; |
1228 | struct channel_gk20a *ch_tsg = NULL; | 1307 | struct channel_gk20a *ch_tsg = NULL; |
1308 | struct gk20a *g = ch->g; | ||
1229 | 1309 | ||
1230 | if (gk20a_is_channel_marked_as_tsg(ch)) { | 1310 | if (gk20a_is_channel_marked_as_tsg(ch)) { |
1231 | tsg = &ch->g->fifo.tsg[ch->hw_chid]; | 1311 | tsg = &g->fifo.tsg[ch->hw_chid]; |
1232 | 1312 | ||
1233 | mutex_lock(&tsg->ch_list_lock); | 1313 | mutex_lock(&tsg->ch_list_lock); |
1314 | |||
1234 | list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { | 1315 | list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { |
1235 | gk20a_set_error_notifier(ch_tsg, | 1316 | if (gk20a_channel_get(ch_tsg)) { |
1236 | NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); | 1317 | gk20a_set_error_notifier(ch_tsg, |
1318 | NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); | ||
1319 | gk20a_channel_put(ch_tsg); | ||
1320 | } | ||
1237 | } | 1321 | } |
1322 | |||
1238 | mutex_unlock(&tsg->ch_list_lock); | 1323 | mutex_unlock(&tsg->ch_list_lock); |
1239 | gk20a_fifo_recover_tsg(ch->g, ch->tsgid, verbose); | 1324 | gk20a_fifo_recover_tsg(g, ch->tsgid, verbose); |
1240 | } else { | 1325 | } else { |
1241 | gk20a_set_error_notifier(ch, | 1326 | gk20a_set_error_notifier(ch, |
1242 | NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); | 1327 | NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); |
1243 | gk20a_fifo_recover_ch(ch->g, ch->hw_chid, verbose); | 1328 | gk20a_fifo_recover_ch(g, ch->hw_chid, verbose); |
1244 | } | 1329 | } |
1245 | 1330 | ||
1246 | return 0; | 1331 | return 0; |
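
The hunk above shows the general shape of the new referencing discipline: code that touches a channel it does not own must first take a reference, and a failed gk20a_channel_get() means the channel is mid-teardown and is simply skipped rather than dereferenced. A minimal sketch of the pattern, using the driver's own types and helpers:

/* Notify every live channel in a TSG; channels whose refcount has
 * already dropped to zero are being freed and are skipped. */
static void example_notify_tsg(struct tsg_gk20a *tsg, u32 error)
{
        struct channel_gk20a *ch;

        mutex_lock(&tsg->ch_list_lock);
        list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
                if (!gk20a_channel_get(ch))
                        continue;       /* channel is being torn down */
                gk20a_set_error_notifier(ch, error);
                gk20a_channel_put(ch);
        }
        mutex_unlock(&tsg->ch_list_lock);
}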
@@ -1300,11 +1385,14 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | |||
1300 | struct channel_gk20a *ch = &f->channel[id]; | 1385 | struct channel_gk20a *ch = &f->channel[id]; |
1301 | 1386 | ||
1302 | if (non_chid) { | 1387 | if (non_chid) { |
1303 | gk20a_fifo_recover(g, BIT(engine_id), true); | 1388 | gk20a_fifo_recover(g, BIT(engine_id), id, true, true); |
1304 | ret = true; | 1389 | ret = true; |
1305 | goto err; | 1390 | goto err; |
1306 | } | 1391 | } |
1307 | 1392 | ||
1393 | if (!gk20a_channel_get(ch)) | ||
1394 | goto err; | ||
1395 | |||
1308 | if (gk20a_channel_update_and_check_timeout(ch, | 1396 | if (gk20a_channel_update_and_check_timeout(ch, |
1309 | GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { | 1397 | GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { |
1310 | gk20a_set_error_notifier(ch, | 1398 | gk20a_set_error_notifier(ch, |
@@ -1313,7 +1401,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | |||
1313 | "fifo sched ctxsw timeout error:" | 1401 | "fifo sched ctxsw timeout error:" |
1314 | "engine = %u, ch = %d", engine_id, id); | 1402 | "engine = %u, ch = %d", engine_id, id); |
1315 | gk20a_gr_debug_dump(g->dev); | 1403 | gk20a_gr_debug_dump(g->dev); |
1316 | gk20a_fifo_recover(g, BIT(engine_id), | 1404 | gk20a_fifo_recover(g, BIT(engine_id), id, false, |
1317 | ch->timeout_debug_dump); | 1405 | ch->timeout_debug_dump); |
1318 | ret = true; | 1406 | ret = true; |
1319 | } else { | 1407 | } else { |
@@ -1324,6 +1412,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | |||
1324 | id); | 1412 | id); |
1325 | ret = false; | 1413 | ret = false; |
1326 | } | 1414 | } |
1415 | gk20a_channel_put(ch); | ||
1327 | return ret; | 1416 | return ret; |
1328 | } | 1417 | } |
1329 | 1418 | ||
@@ -1336,7 +1425,7 @@ err: | |||
1336 | 1425 | ||
1337 | static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) | 1426 | static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) |
1338 | { | 1427 | { |
1339 | bool print_channel_reset_log = false, reset_engine = false; | 1428 | bool print_channel_reset_log = false; |
1340 | struct device *dev = dev_from_gk20a(g); | 1429 | struct device *dev = dev_from_gk20a(g); |
1341 | u32 handled = 0; | 1430 | u32 handled = 0; |
1342 | 1431 | ||
@@ -1367,8 +1456,8 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) | |||
1367 | } | 1456 | } |
1368 | 1457 | ||
1369 | if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) { | 1458 | if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) { |
1370 | print_channel_reset_log = gk20a_fifo_handle_mmu_fault(g); | 1459 | print_channel_reset_log = |
1371 | reset_engine = true; | 1460 | gk20a_fifo_handle_mmu_fault(g, 0, ~(u32)0, false); |
1372 | handled |= fifo_intr_0_mmu_fault_pending_f(); | 1461 | handled |= fifo_intr_0_mmu_fault_pending_f(); |
1373 | } | 1462 | } |
1374 | 1463 | ||
@@ -1452,9 +1541,12 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev, | |||
1452 | == fifo_pbdma_status_id_type_chid_v()) { | 1541 | == fifo_pbdma_status_id_type_chid_v()) { |
1453 | struct channel_gk20a *ch = &f->channel[id]; | 1542 | struct channel_gk20a *ch = &f->channel[id]; |
1454 | 1543 | ||
1455 | gk20a_set_error_notifier(ch, | 1544 | if (gk20a_channel_get(ch)) { |
1456 | NVGPU_CHANNEL_PBDMA_ERROR); | 1545 | gk20a_set_error_notifier(ch, |
1457 | gk20a_fifo_recover_ch(g, id, true); | 1546 | NVGPU_CHANNEL_PBDMA_ERROR); |
1547 | gk20a_fifo_recover_ch(g, id, true); | ||
1548 | gk20a_channel_put(ch); | ||
1549 | } | ||
1458 | } else if (fifo_pbdma_status_id_type_v(status) | 1550 | } else if (fifo_pbdma_status_id_type_v(status) |
1459 | == fifo_pbdma_status_id_type_tsgid_v()) { | 1551 | == fifo_pbdma_status_id_type_tsgid_v()) { |
1460 | struct tsg_gk20a *tsg = &f->tsg[id]; | 1552 | struct tsg_gk20a *tsg = &f->tsg[id]; |
@@ -1462,8 +1554,11 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev, | |||
1462 | 1554 | ||
1463 | mutex_lock(&tsg->ch_list_lock); | 1555 | mutex_lock(&tsg->ch_list_lock); |
1464 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | 1556 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { |
1465 | gk20a_set_error_notifier(ch, | 1557 | if (gk20a_channel_get(ch)) { |
1466 | NVGPU_CHANNEL_PBDMA_ERROR); | 1558 | gk20a_set_error_notifier(ch, |
1559 | NVGPU_CHANNEL_PBDMA_ERROR); | ||
1560 | gk20a_channel_put(ch); | ||
1561 | } | ||
1467 | } | 1562 | } |
1468 | mutex_unlock(&tsg->ch_list_lock); | 1563 | mutex_unlock(&tsg->ch_list_lock); |
1469 | gk20a_fifo_recover_tsg(g, id, true); | 1564 | gk20a_fifo_recover_tsg(g, id, true); |
@@ -1559,6 +1654,8 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) | |||
1559 | + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)); | 1654 | + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)); |
1560 | u32 ret = 0; | 1655 | u32 ret = 0; |
1561 | 1656 | ||
1657 | gk20a_dbg_fn("%d", id); | ||
1658 | |||
1562 | /* issue preempt */ | 1659 | /* issue preempt */ |
1563 | if (is_tsg) | 1660 | if (is_tsg) |
1564 | gk20a_writel(g, fifo_preempt_r(), | 1661 | gk20a_writel(g, fifo_preempt_r(), |
@@ -1569,6 +1666,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) | |||
1569 | fifo_preempt_chid_f(id) | | 1666 | fifo_preempt_chid_f(id) | |
1570 | fifo_preempt_type_channel_f()); | 1667 | fifo_preempt_type_channel_f()); |
1571 | 1668 | ||
1669 | gk20a_dbg_fn("%d", id); | ||
1572 | /* wait for preempt */ | 1670 | /* wait for preempt */ |
1573 | ret = -EBUSY; | 1671 | ret = -EBUSY; |
1574 | do { | 1672 | do { |
@@ -1583,6 +1681,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) | |||
1583 | } while (time_before(jiffies, end_jiffies) || | 1681 | } while (time_before(jiffies, end_jiffies) || |
1584 | !tegra_platform_is_silicon()); | 1682 | !tegra_platform_is_silicon()); |
1585 | 1683 | ||
1684 | gk20a_dbg_fn("%d", id); | ||
1586 | if (ret) { | 1685 | if (ret) { |
1587 | if (is_tsg) { | 1686 | if (is_tsg) { |
1588 | struct tsg_gk20a *tsg = &g->fifo.tsg[id]; | 1687 | struct tsg_gk20a *tsg = &g->fifo.tsg[id]; |
@@ -1593,8 +1692,11 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) | |||
1593 | 1692 | ||
1594 | mutex_lock(&tsg->ch_list_lock); | 1693 | mutex_lock(&tsg->ch_list_lock); |
1595 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | 1694 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { |
1695 | if (!gk20a_channel_get(ch)) | ||
1696 | continue; | ||
1596 | gk20a_set_error_notifier(ch, | 1697 | gk20a_set_error_notifier(ch, |
1597 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | 1698 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); |
1699 | gk20a_channel_put(ch); | ||
1598 | } | 1700 | } |
1599 | mutex_unlock(&tsg->ch_list_lock); | 1701 | mutex_unlock(&tsg->ch_list_lock); |
1600 | gk20a_fifo_recover_tsg(g, id, true); | 1702 | gk20a_fifo_recover_tsg(g, id, true); |
@@ -1604,9 +1706,12 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) | |||
1604 | gk20a_err(dev_from_gk20a(g), | 1706 | gk20a_err(dev_from_gk20a(g), |
1605 | "preempt channel %d timeout\n", id); | 1707 | "preempt channel %d timeout\n", id); |
1606 | 1708 | ||
1607 | gk20a_set_error_notifier(ch, | 1709 | if (gk20a_channel_get(ch)) { |
1608 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | 1710 | gk20a_set_error_notifier(ch, |
1609 | gk20a_fifo_recover_ch(g, id, true); | 1711 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); |
1712 | gk20a_fifo_recover_ch(g, id, true); | ||
1713 | gk20a_channel_put(ch); | ||
1714 | } | ||
1610 | } | 1715 | } |
1611 | } | 1716 | } |
1612 | 1717 | ||
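
The preempt wait above is the usual jiffies-bounded register poll (with an escape via !tegra_platform_is_silicon() so pre-silicon platforms wait indefinitely). A generic sketch of the loop's form, where example_done() is a hypothetical stand-in for the fifo_preempt_r() pending check:

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/types.h>

/* Poll until 'example_done' reports completion or the deadline passes. */
static int example_poll_until(bool (*example_done)(void *), void *data,
                              unsigned int timeout_ms)
{
        unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout_ms);

        do {
                if (example_done(data))
                        return 0;
                usleep_range(20, 50);   /* brief back-off between reads */
        } while (time_before(jiffies, end_jiffies));

        return -EBUSY;
}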
@@ -1790,7 +1895,9 @@ static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id) | |||
1790 | (f->engine_info[i].runlist_id == runlist_id)) | 1895 | (f->engine_info[i].runlist_id == runlist_id)) |
1791 | engines |= BIT(i); | 1896 | engines |= BIT(i); |
1792 | } | 1897 | } |
1793 | gk20a_fifo_recover(g, engines, true); | 1898 | |
1899 | if (engines) | ||
1900 | gk20a_fifo_recover(g, engines, ~(u32)0, false, true); | ||
1794 | } | 1901 | } |
1795 | 1902 | ||
1796 | static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id) | 1903 | static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id) |
@@ -1994,6 +2101,8 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 hw_chid, | |||
1994 | u32 mutex_ret; | 2101 | u32 mutex_ret; |
1995 | u32 ret = 0; | 2102 | u32 ret = 0; |
1996 | 2103 | ||
2104 | gk20a_dbg_fn(""); | ||
2105 | |||
1997 | runlist = &f->runlist_info[runlist_id]; | 2106 | runlist = &f->runlist_info[runlist_id]; |
1998 | 2107 | ||
1999 | mutex_lock(&runlist->mutex); | 2108 | mutex_lock(&runlist->mutex); |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index dd320ae1..fdf843d2 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * GK20A graphics fifo (gr host) | 4 | * GK20A graphics fifo (gr host) |
5 | * | 5 | * |
6 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. | 6 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. |
7 | * | 7 | * |
8 | * This program is free software; you can redistribute it and/or modify it | 8 | * This program is free software; you can redistribute it and/or modify it |
9 | * under the terms and conditions of the GNU General Public License, | 9 | * under the terms and conditions of the GNU General Public License, |
@@ -106,7 +106,9 @@ struct fifo_gk20a { | |||
106 | u32 userd_entry_size; | 106 | u32 userd_entry_size; |
107 | 107 | ||
108 | struct channel_gk20a *channel; | 108 | struct channel_gk20a *channel; |
109 | struct mutex ch_inuse_mutex; /* protect unused chid look up */ | 109 | /* zero-kref'd channels here */ |
110 | struct list_head free_chs; | ||
111 | struct mutex free_chs_mutex; | ||
110 | 112 | ||
111 | struct tsg_gk20a *tsg; | 113 | struct tsg_gk20a *tsg; |
112 | struct mutex tsg_inuse_mutex; | 114 | struct mutex tsg_inuse_mutex; |
@@ -130,7 +132,7 @@ struct fifo_gk20a { | |||
130 | 132 | ||
131 | } intr; | 133 | } intr; |
132 | 134 | ||
133 | u32 mmu_fault_engines; | 135 | u32 deferred_fault_engines; |
134 | bool deferred_reset_pending; | 136 | bool deferred_reset_pending; |
135 | struct mutex deferred_reset_mutex; | 137 | struct mutex deferred_reset_mutex; |
136 | }; | 138 | }; |
@@ -157,7 +159,12 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 hw_chid, | |||
157 | int gk20a_fifo_suspend(struct gk20a *g); | 159 | int gk20a_fifo_suspend(struct gk20a *g); |
158 | 160 | ||
159 | bool gk20a_fifo_mmu_fault_pending(struct gk20a *g); | 161 | bool gk20a_fifo_mmu_fault_pending(struct gk20a *g); |
160 | void gk20a_fifo_recover(struct gk20a *g, u32 engine_ids, bool verbose); | 162 | |
163 | void gk20a_fifo_recover(struct gk20a *g, | ||
164 | u32 engine_ids, /* if zero, will be queried from HW */ | ||
165 | u32 hw_id, /* if ~0, will be queried from HW */ | ||
166 | bool hw_id_is_tsg, /* ignored if hw_id == ~0 */ | ||
167 | bool verbose); | ||
161 | void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); | 168 | void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); |
162 | void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); | 169 | void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); |
163 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); | 170 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); |
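
For reference, this is how the widened prototype reads at call sites; the first two forms appear in the hunks above, the third exercises the query-from-HW defaults described in the parameter comments. Illustrative only:

static void example_recover_calls(struct gk20a *g, u32 engine_id, u32 id,
                                  struct channel_gk20a *ch)
{
        /* victim channel known (sched-error ctxsw timeout path) */
        gk20a_fifo_recover(g, BIT(engine_id), id, false,
                           ch->timeout_debug_dump);

        /* faulted engine known, victim unknown: ~0 means query the HW */
        gk20a_fifo_recover(g, BIT(engine_id), ~(u32)0, false, true);

        /* nothing known up front: engines and victim both from HW */
        gk20a_fifo_recover(g, 0, ~(u32)0, false, true);
}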
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 9c201f32..498de7e7 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c | |||
@@ -1388,6 +1388,9 @@ static int gk20a_probe(struct platform_device *dev) | |||
1388 | return -ENOMEM; | 1388 | return -ENOMEM; |
1389 | } | 1389 | } |
1390 | 1390 | ||
1391 | init_waitqueue_head(&gk20a->sw_irq_stall_last_handled_wq); | ||
1392 | init_waitqueue_head(&gk20a->sw_irq_nonstall_last_handled_wq); | ||
1393 | |||
1391 | #ifdef CONFIG_PM_GENERIC_DOMAINS_OF | 1394 | #ifdef CONFIG_PM_GENERIC_DOMAINS_OF |
1392 | gk20a_domain = container_of(dev_to_genpd(&dev->dev), | 1395 | gk20a_domain = container_of(dev_to_genpd(&dev->dev), |
1393 | struct gk20a_domain_data, gpd); | 1396 | struct gk20a_domain_data, gpd); |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index a52d97f3..d8e3586f 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h | |||
@@ -538,6 +538,15 @@ struct gk20a { | |||
538 | u32 max_ltc_count; | 538 | u32 max_ltc_count; |
539 | u32 ltc_count; | 539 | u32 ltc_count; |
540 | 540 | ||
541 | atomic_t hw_irq_stall_count; | ||
542 | atomic_t hw_irq_nonstall_count; | ||
543 | |||
544 | atomic_t sw_irq_stall_last_handled; | ||
545 | wait_queue_head_t sw_irq_stall_last_handled_wq; | ||
546 | |||
547 | atomic_t sw_irq_nonstall_last_handled; | ||
548 | wait_queue_head_t sw_irq_nonstall_last_handled_wq; | ||
549 | |||
541 | struct devfreq *devfreq; | 550 | struct devfreq *devfreq; |
542 | 551 | ||
543 | struct gk20a_scale_profile *scale_profile; | 552 | struct gk20a_scale_profile *scale_profile; |
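
Together with the mc_gk20a.c changes below, these counter/waitqueue pairs allow blocking until every interrupt raised so far has been fully handled by the threaded ISRs. A minimal sketch of such a barrier, assuming only the fields above; the helper name is hypothetical:

#include <linux/atomic.h>
#include <linux/wait.h>

static void example_wait_for_deferred_interrupts(struct gk20a *g)
{
        /* snapshot how many hard IRQs have fired so far */
        int stall_irqs = atomic_read(&g->hw_irq_stall_count);
        int nonstall_irqs = atomic_read(&g->hw_irq_nonstall_count);

        /* then wait until the threaded handlers have caught up */
        wait_event(g->sw_irq_stall_last_handled_wq,
                   atomic_read(&g->sw_irq_stall_last_handled) >= stall_irqs);
        wait_event(g->sw_irq_nonstall_last_handled_wq,
                   atomic_read(&g->sw_irq_nonstall_last_handled) >= nonstall_irqs);
}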
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index b2fea5b8..edd4c6c8 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c | |||
@@ -5138,22 +5138,25 @@ static int gk20a_gr_handle_notify_pending(struct gk20a *g, | |||
5138 | * Also used by regops to translate current ctx to chid and tsgid. | 5138 | * Also used by regops to translate current ctx to chid and tsgid. |
5139 | * For performance, we don't want to go through 128 channels every time. | 5139 | * For performance, we don't want to go through 128 channels every time. |
5140 | * curr_ctx should be the value read from gr_fecs_current_ctx_r(). | 5140 | * curr_ctx should be the value read from gr_fecs_current_ctx_r(). |
5141 | * A small tlb is used here to cache translation */ | 5141 | * A small tlb is used here to cache translation. |
5142 | static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx, | 5142 | * |
5143 | int *curr_tsgid) | 5143 | * Returned channel must be released with gk20a_channel_put() */ |
5144 | static struct channel_gk20a *gk20a_gr_get_channel_from_ctx( | ||
5145 | struct gk20a *g, u32 curr_ctx, int *curr_tsgid) | ||
5144 | { | 5146 | { |
5145 | struct fifo_gk20a *f = &g->fifo; | 5147 | struct fifo_gk20a *f = &g->fifo; |
5146 | struct gr_gk20a *gr = &g->gr; | 5148 | struct gr_gk20a *gr = &g->gr; |
5147 | u32 chid = -1; | 5149 | u32 chid = -1; |
5148 | int tsgid = NVGPU_INVALID_TSG_ID; | 5150 | int tsgid = NVGPU_INVALID_TSG_ID; |
5149 | u32 i; | 5151 | u32 i; |
5152 | struct channel_gk20a *ret = NULL; | ||
5150 | 5153 | ||
5151 | /* when contexts are unloaded from GR, the valid bit is reset | 5154 | /* when contexts are unloaded from GR, the valid bit is reset |
5152 | * but the instance pointer information remains intact. So the | 5155 | * but the instance pointer information remains intact. So the |
5153 | * valid bit must be checked to be absolutely certain that a | 5156 | * valid bit must be checked to be absolutely certain that a |
5154 | * valid context is currently resident. */ | 5157 | * valid context is currently resident. */ |
5155 | if (!gr_fecs_current_ctx_valid_v(curr_ctx)) | 5158 | if (!gr_fecs_current_ctx_valid_v(curr_ctx)) |
5156 | return -1; | 5159 | return NULL; |
5157 | 5160 | ||
5158 | spin_lock(&gr->ch_tlb_lock); | 5161 | spin_lock(&gr->ch_tlb_lock); |
5159 | 5162 | ||
@@ -5162,25 +5165,30 @@ static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx, | |||
5162 | if (gr->chid_tlb[i].curr_ctx == curr_ctx) { | 5165 | if (gr->chid_tlb[i].curr_ctx == curr_ctx) { |
5163 | chid = gr->chid_tlb[i].hw_chid; | 5166 | chid = gr->chid_tlb[i].hw_chid; |
5164 | tsgid = gr->chid_tlb[i].tsgid; | 5167 | tsgid = gr->chid_tlb[i].tsgid; |
5168 | ret = gk20a_channel_get(&f->channel[chid]); | ||
5165 | goto unlock; | 5169 | goto unlock; |
5166 | } | 5170 | } |
5167 | } | 5171 | } |
5168 | 5172 | ||
5169 | /* slow path */ | 5173 | /* slow path */ |
5170 | for (chid = 0; chid < f->num_channels; chid++) | 5174 | for (chid = 0; chid < f->num_channels; chid++) { |
5171 | if (f->channel[chid].in_use) { | 5175 | struct channel_gk20a *ch = &f->channel[chid]; |
5172 | if ((u32)(gk20a_mem_phys(&f->channel[chid].inst_block) >> | 5176 | if (!gk20a_channel_get(ch)) |
5173 | ram_in_base_shift_v()) == | 5177 | continue; |
5178 | |||
5179 | if ((u32)(gk20a_mem_phys(&ch->inst_block) >> | ||
5180 | ram_in_base_shift_v()) == | ||
5174 | gr_fecs_current_ctx_ptr_v(curr_ctx)) { | 5181 | gr_fecs_current_ctx_ptr_v(curr_ctx)) { |
5175 | tsgid = f->channel[chid].tsgid; | 5182 | tsgid = ch->tsgid; |
5176 | break; | 5183 | /* found it */ |
5177 | } | 5184 | ret = ch; |
5185 | break; | ||
5186 | } | ||
5187 | gk20a_channel_put(ch); | ||
5178 | } | 5188 | } |
5179 | 5189 | ||
5180 | if (chid >= f->num_channels) { | 5190 | if (!ret) |
5181 | chid = -1; | ||
5182 | goto unlock; | 5191 | goto unlock; |
5183 | } | ||
5184 | 5192 | ||
5185 | /* add to free tlb entry */ | 5193 | /* add to free tlb entry */ |
5186 | for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) { | 5194 | for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) { |
@@ -5205,7 +5213,7 @@ unlock: | |||
5205 | spin_unlock(&gr->ch_tlb_lock); | 5213 | spin_unlock(&gr->ch_tlb_lock); |
5206 | if (curr_tsgid) | 5214 | if (curr_tsgid) |
5207 | *curr_tsgid = tsgid; | 5215 | *curr_tsgid = tsgid; |
5208 | return chid; | 5216 | return ret; |
5209 | } | 5217 | } |
5210 | 5218 | ||
5211 | int gk20a_gr_lock_down_sm(struct gk20a *g, | 5219 | int gk20a_gr_lock_down_sm(struct gk20a *g, |
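
Callers of the reworked lookup now receive a referenced channel rather than a bare chid, and must drop that reference themselves. A usage sketch under that assumption:

static void example_query_resident_channel(struct gk20a *g)
{
        struct channel_gk20a *ch;
        int tsgid;
        u32 curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());

        ch = gk20a_gr_get_channel_from_ctx(g, curr_ctx, &tsgid);
        if (!ch)
                return;         /* no valid, referenceable context resident */

        /* ... inspect ch->hw_chid and tsgid here ... */

        gk20a_channel_put(ch);  /* mandatory: drop the lookup's reference */
}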
@@ -5399,6 +5407,7 @@ int gk20a_gr_isr(struct gk20a *g) | |||
5399 | u32 obj_table; | 5407 | u32 obj_table; |
5400 | int need_reset = 0; | 5408 | int need_reset = 0; |
5401 | u32 gr_intr = gk20a_readl(g, gr_intr_r()); | 5409 | u32 gr_intr = gk20a_readl(g, gr_intr_r()); |
5410 | struct channel_gk20a *ch = NULL; | ||
5402 | 5411 | ||
5403 | gk20a_dbg_fn(""); | 5412 | gk20a_dbg_fn(""); |
5404 | gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr); | 5413 | gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr); |
@@ -5424,13 +5433,13 @@ int gk20a_gr_isr(struct gk20a *g) | |||
5424 | gr_fe_object_table_r(isr_data.sub_chan)) : 0; | 5433 | gr_fe_object_table_r(isr_data.sub_chan)) : 0; |
5425 | isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table); | 5434 | isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table); |
5426 | 5435 | ||
5427 | isr_data.chid = | 5436 | ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, NULL); |
5428 | gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx, NULL); | 5437 | if (!ch) { |
5429 | if (isr_data.chid == -1) { | ||
5430 | gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x", | 5438 | gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x", |
5431 | isr_data.curr_ctx); | 5439 | isr_data.curr_ctx); |
5432 | goto clean_up; | 5440 | goto clean_up; |
5433 | } | 5441 | } |
5442 | isr_data.chid = ch->hw_chid; | ||
5434 | 5443 | ||
5435 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, | 5444 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, |
5436 | "channel %d: addr 0x%08x, " | 5445 | "channel %d: addr 0x%08x, " |
@@ -5512,8 +5521,6 @@ int gk20a_gr_isr(struct gk20a *g) | |||
5512 | 5521 | ||
5513 | if (gr_intr & gr_intr_exception_pending_f()) { | 5522 | if (gr_intr & gr_intr_exception_pending_f()) { |
5514 | u32 exception = gk20a_readl(g, gr_exception_r()); | 5523 | u32 exception = gk20a_readl(g, gr_exception_r()); |
5515 | struct fifo_gk20a *f = &g->fifo; | ||
5516 | struct channel_gk20a *ch = &f->channel[isr_data.chid]; | ||
5517 | 5524 | ||
5518 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception); | 5525 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception); |
5519 | 5526 | ||
@@ -5572,9 +5579,20 @@ int gk20a_gr_isr(struct gk20a *g) | |||
5572 | } | 5579 | } |
5573 | 5580 | ||
5574 | if (need_reset) | 5581 | if (need_reset) |
5575 | gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true); | 5582 | gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), |
5583 | ~(u32)0, false, true); | ||
5576 | 5584 | ||
5577 | clean_up: | 5585 | clean_up: |
5586 | if (gr_intr && !ch) { | ||
5587 | /* Clear interrupts for unused channel. This is | ||
5588 | probably an interrupt during gk20a_free_channel() */ | ||
5589 | gk20a_err(dev_from_gk20a(g), | ||
5590 | "unhandled gr interrupt 0x%08x for unreferenceable channel, clearing", | ||
5591 | gr_intr); | ||
5592 | gk20a_writel(g, gr_intr_r(), gr_intr); | ||
5593 | gr_intr = 0; | ||
5594 | } | ||
5595 | |||
5578 | gk20a_writel(g, gr_gpfifo_ctl_r(), | 5596 | gk20a_writel(g, gr_gpfifo_ctl_r(), |
5579 | grfifo_ctl | gr_gpfifo_ctl_access_f(1) | | 5597 | grfifo_ctl | gr_gpfifo_ctl_access_f(1) | |
5580 | gr_gpfifo_ctl_semaphore_access_f(1)); | 5598 | gr_gpfifo_ctl_semaphore_access_f(1)); |
@@ -5583,6 +5601,9 @@ clean_up: | |||
5583 | gk20a_err(dev_from_gk20a(g), | 5601 | gk20a_err(dev_from_gk20a(g), |
5584 | "unhandled gr interrupt 0x%08x", gr_intr); | 5602 | "unhandled gr interrupt 0x%08x", gr_intr); |
5585 | 5603 | ||
5604 | if (ch) | ||
5605 | gk20a_channel_put(ch); | ||
5606 | |||
5586 | return 0; | 5607 | return 0; |
5587 | } | 5608 | } |
5588 | 5609 | ||
@@ -6670,28 +6691,34 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, | |||
6670 | 6691 | ||
6671 | bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) | 6692 | bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) |
6672 | { | 6693 | { |
6673 | int curr_gr_chid, curr_gr_ctx, curr_gr_tsgid; | 6694 | int curr_gr_ctx, curr_gr_tsgid; |
6674 | struct gk20a *g = ch->g; | 6695 | struct gk20a *g = ch->g; |
6696 | struct channel_gk20a *curr_ch; | ||
6697 | bool ret = false; | ||
6675 | 6698 | ||
6676 | curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); | 6699 | curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); |
6677 | curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx, | 6700 | curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx, |
6678 | &curr_gr_tsgid); | 6701 | &curr_gr_tsgid); |
6679 | 6702 | ||
6680 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, | 6703 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, |
6681 | "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" | 6704 | "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" |
6682 | " ch->hw_chid=%d", curr_gr_chid, | 6705 | " ch->hw_chid=%d", |
6683 | curr_gr_tsgid, ch->tsgid, ch->hw_chid); | 6706 | curr_ch ? curr_ch->hw_chid : -1, |
6684 | 6707 | curr_gr_tsgid, | |
6685 | if (curr_gr_chid == -1) | 6708 | ch->tsgid, |
6709 | ch->hw_chid); | ||
6710 | |||
6711 | if (!curr_ch) | ||
6686 | return false; | 6712 | return false; |
6687 | 6713 | ||
6688 | if (ch->hw_chid == curr_gr_chid) | 6714 | if (ch->hw_chid == curr_ch->hw_chid) |
6689 | return true; | 6715 | ret = true; |
6690 | 6716 | ||
6691 | if (gk20a_is_channel_marked_as_tsg(ch) && (ch->tsgid == curr_gr_tsgid)) | 6717 | if (gk20a_is_channel_marked_as_tsg(ch) && (ch->tsgid == curr_gr_tsgid)) |
6692 | return true; | 6718 | ret = true; |
6693 | 6719 | ||
6694 | return false; | 6720 | gk20a_channel_put(curr_ch); |
6721 | return ret; | ||
6695 | } | 6722 | } |
6696 | 6723 | ||
6697 | int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, | 6724 | int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, |
diff --git a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c index 06b00a25..0a773d10 100644 --- a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c | |||
@@ -40,6 +40,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g) | |||
40 | /* flush previous write */ | 40 | /* flush previous write */ |
41 | gk20a_readl(g, mc_intr_en_0_r()); | 41 | gk20a_readl(g, mc_intr_en_0_r()); |
42 | 42 | ||
43 | atomic_inc(&g->hw_irq_stall_count); | ||
44 | |||
43 | trace_mc_gk20a_intr_stall_done(g->dev->name); | 45 | trace_mc_gk20a_intr_stall_done(g->dev->name); |
44 | 46 | ||
45 | return IRQ_WAKE_THREAD; | 47 | return IRQ_WAKE_THREAD; |
@@ -63,18 +65,22 @@ irqreturn_t mc_gk20a_isr_nonstall(struct gk20a *g) | |||
63 | /* flush previous write */ | 65 | /* flush previous write */ |
64 | gk20a_readl(g, mc_intr_en_1_r()); | 66 | gk20a_readl(g, mc_intr_en_1_r()); |
65 | 67 | ||
68 | atomic_inc(&g->hw_irq_nonstall_count); | ||
69 | |||
66 | return IRQ_WAKE_THREAD; | 70 | return IRQ_WAKE_THREAD; |
67 | } | 71 | } |
68 | 72 | ||
69 | irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) | 73 | irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) |
70 | { | 74 | { |
71 | u32 mc_intr_0; | 75 | u32 mc_intr_0; |
76 | int hw_irq_count; | ||
72 | 77 | ||
73 | gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); | 78 | gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); |
74 | 79 | ||
75 | trace_mc_gk20a_intr_thread_stall(g->dev->name); | 80 | trace_mc_gk20a_intr_thread_stall(g->dev->name); |
76 | 81 | ||
77 | mc_intr_0 = gk20a_readl(g, mc_intr_0_r()); | 82 | mc_intr_0 = gk20a_readl(g, mc_intr_0_r()); |
83 | hw_irq_count = atomic_read(&g->hw_irq_stall_count); | ||
78 | 84 | ||
79 | gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0); | 85 | gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0); |
80 | 86 | ||
@@ -94,12 +100,17 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) | |||
94 | if (mc_intr_0 & mc_intr_0_pbus_pending_f()) | 100 | if (mc_intr_0 & mc_intr_0_pbus_pending_f()) |
95 | gk20a_pbus_isr(g); | 101 | gk20a_pbus_isr(g); |
96 | 102 | ||
103 | /* sync handled irq counter before re-enabling interrupts */ | ||
104 | atomic_set(&g->sw_irq_stall_last_handled, hw_irq_count); | ||
105 | |||
97 | gk20a_writel(g, mc_intr_en_0_r(), | 106 | gk20a_writel(g, mc_intr_en_0_r(), |
98 | mc_intr_en_0_inta_hardware_f()); | 107 | mc_intr_en_0_inta_hardware_f()); |
99 | 108 | ||
100 | /* flush previous write */ | 109 | /* flush previous write */ |
101 | gk20a_readl(g, mc_intr_en_0_r()); | 110 | gk20a_readl(g, mc_intr_en_0_r()); |
102 | 111 | ||
112 | wake_up_all(&g->sw_irq_stall_last_handled_wq); | ||
113 | |||
103 | trace_mc_gk20a_intr_thread_stall_done(g->dev->name); | 114 | trace_mc_gk20a_intr_thread_stall_done(g->dev->name); |
104 | 115 | ||
105 | return IRQ_HANDLED; | 116 | return IRQ_HANDLED; |
@@ -108,10 +119,12 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) | |||
108 | irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) | 119 | irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) |
109 | { | 120 | { |
110 | u32 mc_intr_1; | 121 | u32 mc_intr_1; |
122 | int hw_irq_count; | ||
111 | 123 | ||
112 | gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); | 124 | gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); |
113 | 125 | ||
114 | mc_intr_1 = gk20a_readl(g, mc_intr_1_r()); | 126 | mc_intr_1 = gk20a_readl(g, mc_intr_1_r()); |
127 | hw_irq_count = atomic_read(&g->hw_irq_nonstall_count); | ||
115 | 128 | ||
116 | gk20a_dbg(gpu_dbg_intr, "non-stall intr %08x\n", mc_intr_1); | 129 | gk20a_dbg(gpu_dbg_intr, "non-stall intr %08x\n", mc_intr_1); |
117 | 130 | ||
@@ -125,12 +138,17 @@ irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) | |||
125 | && g->ops.ce2.isr_nonstall) | 138 | && g->ops.ce2.isr_nonstall) |
126 | g->ops.ce2.isr_nonstall(g); | 139 | g->ops.ce2.isr_nonstall(g); |
127 | 140 | ||
141 | /* sync handled irq counter before re-enabling interrupts */ | ||
142 | atomic_set(&g->sw_irq_nonstall_last_handled, hw_irq_count); | ||
143 | |||
128 | gk20a_writel(g, mc_intr_en_1_r(), | 144 | gk20a_writel(g, mc_intr_en_1_r(), |
129 | mc_intr_en_1_inta_hardware_f()); | 145 | mc_intr_en_1_inta_hardware_f()); |
130 | 146 | ||
131 | /* flush previous write */ | 147 | /* flush previous write */ |
132 | gk20a_readl(g, mc_intr_en_1_r()); | 148 | gk20a_readl(g, mc_intr_en_1_r()); |
133 | 149 | ||
150 | wake_up_all(&g->sw_irq_nonstall_last_handled_wq); | ||
151 | |||
134 | return IRQ_HANDLED; | 152 | return IRQ_HANDLED; |
135 | } | 153 | } |
136 | 154 | ||
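
The ordering in the two handlers above is what makes the handled-IRQ counters trustworthy: the hard ISR bumps the hardware count before waking the thread, and the thread snapshots that count before reading the interrupt status, so the published last-handled value never runs ahead of work actually done. A condensed sketch of the pairing (stall side only, hypothetical function names):

#include <linux/atomic.h>
#include <linux/interrupt.h>

static irqreturn_t example_hard_isr(struct gk20a *g)
{
        atomic_inc(&g->hw_irq_stall_count);     /* count before deferring */
        return IRQ_WAKE_THREAD;
}

static irqreturn_t example_threaded_isr(struct gk20a *g)
{
        int seen = atomic_read(&g->hw_irq_stall_count); /* snapshot early */

        /* ... read mc_intr_0_r() and dispatch per-unit handlers ... */

        atomic_set(&g->sw_irq_stall_last_handled, seen); /* publish */
        wake_up_all(&g->sw_irq_stall_last_handled_wq);
        return IRQ_HANDLED;
}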
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c index 68a31eca..23ff8677 100644 --- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c | |||
@@ -283,6 +283,9 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) | |||
283 | 283 | ||
284 | init_runlist(g, f); | 284 | init_runlist(g, f); |
285 | 285 | ||
286 | INIT_LIST_HEAD(&f->free_chs); | ||
287 | mutex_init(&f->free_chs_mutex); | ||
288 | |||
286 | for (chid = 0; chid < f->num_channels; chid++) { | 289 | for (chid = 0; chid < f->num_channels; chid++) { |
287 | f->channel[chid].userd_cpu_va = | 290 | f->channel[chid].userd_cpu_va = |
288 | f->userd.cpu_va + chid * f->userd_entry_size; | 291 | f->userd.cpu_va + chid * f->userd_entry_size; |
@@ -294,7 +297,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) | |||
294 | 297 | ||
295 | gk20a_init_channel_support(g, chid); | 298 | gk20a_init_channel_support(g, chid); |
296 | } | 299 | } |
297 | mutex_init(&f->ch_inuse_mutex); | ||
298 | 300 | ||
299 | f->deferred_reset_pending = false; | 301 | f->deferred_reset_pending = false; |
300 | mutex_init(&f->deferred_reset_mutex); | 302 | mutex_init(&f->deferred_reset_mutex); |
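
With ch_inuse_mutex gone, unreferenced channels are expected to sit on f->free_chs, and an allocator pops them under free_chs_mutex. A hypothetical sketch of that allocator; the free_chs list member on channel_gk20a is an assumption here, since its declaration is not part of these hunks:

#include <linux/list.h>
#include <linux/mutex.h>

static struct channel_gk20a *example_alloc_channel(struct fifo_gk20a *f)
{
        struct channel_gk20a *ch = NULL;

        mutex_lock(&f->free_chs_mutex);
        if (!list_empty(&f->free_chs)) {
                /* 'free_chs' as the channel's list member is assumed */
                ch = list_first_entry(&f->free_chs,
                                      struct channel_gk20a, free_chs);
                list_del(&ch->free_chs);
        }
        mutex_unlock(&f->free_chs_mutex);

        return ch;      /* NULL when no channels are available */
}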