11 files changed, 241 insertions, 226 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 2c2850c6..6eecebf5 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -177,7 +177,7 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 }
 static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
-                                u32 timeslice_period, bool interleave)
+                                u32 timeslice_period)
 {
        void *inst_ptr;
        int shift = 0, value = 0;
@@ -205,30 +205,6 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
                gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
                ccsr_channel_enable_set_true_f());
-        if (c->interleave != interleave) {
-                mutex_lock(&c->g->interleave_lock);
-                c->interleave = interleave;
-                if (interleave)
-                        if (c->g->num_interleaved_channels >=
-                                        MAX_INTERLEAVED_CHANNELS) {
-                                gk20a_err(dev_from_gk20a(c->g),
-                                        "Change of priority would exceed runlist length, only changing timeslice\n");
-                                c->interleave = false;
-                        } else
-                                c->g->num_interleaved_channels += 1;
-                else
-                        c->g->num_interleaved_channels -= 1;
-                mutex_unlock(&c->g->interleave_lock);
-                gk20a_dbg_info("Set channel %d to interleave %d",
-                        c->hw_chid, c->interleave);
-                gk20a_fifo_set_channel_priority(
-                                c->g, 0, c->hw_chid, c->interleave);
-                c->g->ops.fifo.update_runlist(
-                                c->g, 0, ~0, true, false);
-        }
        return 0;
 }
@@ -711,6 +687,32 @@ static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
        return 0;
 }
+/*
+ * Set the runlist interleave level of a bare (non-TSG) channel.
+ *
+ * The level is handed to the set_runlist_interleave fifo op for this
+ * channel's hw_chid; when that succeeds, the update_runlist fifo op is
+ * called for runlist 0.
+ *
+ * Returns -EINVAL if the channel is marked as TSG or if level is not one of
+ * NVGPU_RUNLIST_INTERLEAVE_LEVEL_{LOW,MEDIUM,HIGH}; otherwise the first
+ * non-zero result of the two fifo ops, or 0.
+ */
+static int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+                                                u32 level)
+{
+        struct gk20a *g = ch->g;
+        int err;
+        if (gk20a_is_channel_marked_as_tsg(ch)) {
+                gk20a_err(dev_from_gk20a(g), "invalid operation for TSG!\n");
+                return -EINVAL;
+        }
+        /* only the three defined interleave levels are accepted */
+        if (level != NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW &&
+            level != NVGPU_RUNLIST_INTERLEAVE_LEVEL_MEDIUM &&
+            level != NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH)
+                return -EINVAL;
+        err = g->ops.fifo.set_runlist_interleave(g, ch->hw_chid,
+                                                false, 0, level);
+        if (err)
+                return err;
+        /* the runlist is only touched once the new level was recorded */
+        return g->ops.fifo.update_runlist(g, 0, ~0, true, true);
+}
 static int gk20a_init_error_notifier(struct channel_gk20a *ch,
                struct nvgpu_set_error_notifier *args)
 {
@@ -899,17 +901,6 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
        }
        mutex_unlock(&f->deferred_reset_mutex);
-        if (ch->interleave) {
-                ch->interleave = false;
-                gk20a_fifo_set_channel_priority(
-                                ch->g, 0, ch->hw_chid, ch->interleave);
-                mutex_lock(&f->g->interleave_lock);
-                WARN_ON(f->g->num_interleaved_channels == 0);
-                f->g->num_interleaved_channels -= 1;
-                mutex_unlock(&f->g->interleave_lock);
-        }
        if (!ch->bound)
                goto release;
@@ -1154,11 +1145,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
        ch->has_timedout = false;
        ch->wdt_enabled = true;
        ch->obj_class = 0;
-        ch->interleave = false;
        ch->clean_up.scheduled = false;
-        gk20a_fifo_set_channel_priority(
+        ch->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
-                        ch->g, 0, ch->hw_chid, ch->interleave);
        /* The channel is *not* runnable at this point. It still needs to have
         * an address space bound and allocate a gpfifo and grctx. */
@@ -2613,7 +2601,6 @@ unsigned int gk20a_channel_poll(struct file *filep, poll_table *wait)
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 {
        u32 timeslice_timeout;
-        bool interleave = false;
        if (gk20a_is_channel_marked_as_tsg(ch)) {
                gk20a_err(dev_from_gk20a(ch->g),
@@ -2630,8 +2617,6 @@ int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
                timeslice_timeout = ch->g->timeslice_medium_priority_us;
                break;
        case NVGPU_PRIORITY_HIGH:
-                if (ch->g->interleave_high_priority)
-                        interleave = true;
                timeslice_timeout = ch->g->timeslice_high_priority_us;
                break;
        default:
@@ -2640,7 +2625,7 @@ int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
        }
        return channel_gk20a_set_schedule_params(ch,
-                        timeslice_timeout, interleave);
+                        timeslice_timeout);
 }
 static int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
@@ -3045,6 +3030,18 @@ long gk20a_channel_ioctl(struct file *filp,
                err = gk20a_channel_set_wdt_status(ch,
                                (struct nvgpu_channel_wdt_args *)buf);
                break;
+        case NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE:
+                err = gk20a_busy(dev);
+                if (err) {
+                        dev_err(&dev->dev,
+                                "%s: failed to host gk20a for ioctl cmd: 0x%x",
+                                __func__, cmd);
+                        break;
+                }
+                err = gk20a_channel_set_runlist_interleave(ch,
+                        ((struct nvgpu_runlist_interleave_args *)buf)->level);
+                gk20a_idle(dev);
+                break;
        default:
                dev_dbg(&dev->dev, "unrecognized ioctl cmd: 0x%x", cmd);
                err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 4aea9d19..3f5a657a 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -188,8 +188,7 @@ struct channel_gk20a {
        spinlock_t update_fn_lock; /* make access to the two above atomic */
        struct work_struct update_fn_work;
-        /* true if channel is interleaved with lower priority channels */
+        u32 interleave_level;
-        bool interleave;
 };
 static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 769960af..28cc3086 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -303,12 +303,6 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
        if (!runlist->active_tsgs)
                goto clean_up_runlist_info;
-        runlist->high_prio_channels =
-                kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
-                        GFP_KERNEL);
-        if (!runlist->high_prio_channels)
-                goto clean_up_runlist_info;
        runlist_size  = ram_rl_entry_size_v() * f->num_runlist_entries;
        for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
                int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
@@ -337,9 +331,6 @@ clean_up_runlist_info:
        kfree(runlist->active_tsgs);
        runlist->active_tsgs = NULL;
-        kfree(runlist->high_prio_channels);
-        runlist->high_prio_channels = NULL;
        kfree(f->runlist_info);
        f->runlist_info = NULL;
@@ -2162,32 +2153,153 @@ static inline u32 gk20a_get_tsg_runlist_entry_0(struct tsg_gk20a *tsg)
        return runlist_entry_0;
 }
-/* add all active high priority channels */
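+/*
+ * Recursively construct a runlist with interleaved bare channels and TSGs.
+ *
+ * Writes the entries of cur_level at runlist_entry, inserting the entries of
+ * the levels above before each of them (only before the first one when
+ * interleave_enabled is false). *entries_left is decremented for every entry
+ * written. Returns the position just past the last entry written, or NULL if
+ * *entries_left reaches zero while entries still have to be added.
+ */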
-static inline u32 gk20a_fifo_runlist_add_high_prio_entries(
+static u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
-                struct fifo_gk20a *f,
+                                struct fifo_runlist_info_gk20a *runlist,
-                struct fifo_runlist_info_gk20a *runlist,
+                                u32 cur_level,
-                u32 *runlist_entry)
+                                u32 *runlist_entry,
+                                bool interleave_enabled,
+                                bool prev_empty,
+                                u32 *entries_left)
 {
-        struct channel_gk20a *ch = NULL;
+        bool last_level = cur_level == NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH;
-        unsigned long high_prio_chid;
+        struct channel_gk20a *ch;
-        u32 count = 0;
+        bool skip_next = false;
+        u32 chid, tsgid, count = 0;
+        gk20a_dbg_fn("");
-        for_each_set_bit(high_prio_chid,
+        /* for each bare channel, CH, on this level, insert all higher-level
-                        runlist->high_prio_channels, f->num_channels) {
+           channels and TSGs before inserting CH. */
-                ch = &f->channel[high_prio_chid];
+        for_each_set_bit(chid, runlist->active_channels, f->num_channels) {
+                ch = &f->channel[chid];
+                if (ch->interleave_level != cur_level)
+                        continue;
+                if (gk20a_is_channel_marked_as_tsg(ch))
+                        continue;
+                if (!last_level && !skip_next) {
+                        runlist_entry = gk20a_runlist_construct_locked(f,
+                                                        runlist,
+                                                        cur_level + 1,
+                                                        runlist_entry,
+                                                        interleave_enabled,
+                                                        false,
+                                                        entries_left);
+                        /* if interleaving is disabled, higher-level channels
+                           and TSGs only need to be inserted once */
+                        if (!interleave_enabled)
+                                skip_next = true;
+                }
+                if (!(*entries_left))
+                        return NULL;
+                gk20a_dbg_info("add channel %d to runlist", chid);
+                runlist_entry[0] = ram_rl_entry_chid_f(chid);
+                runlist_entry[1] = 0;
+                runlist_entry += 2;
+                count++;
+                (*entries_left)--;
+        }
-                if (!gk20a_is_channel_marked_as_tsg(ch) &&
+        /* for each TSG, T, on this level, insert all higher-level channels
-                     test_bit(high_prio_chid, runlist->active_channels) == 1) {
+           and TSGs before inserting T. */
-                        gk20a_dbg_info("add high prio channel %lu to runlist",
+        for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
-                                        high_prio_chid);
+                struct tsg_gk20a *tsg = &f->tsg[tsgid];
-                        runlist_entry[0] = ram_rl_entry_chid_f(high_prio_chid);
+                if (tsg->interleave_level != cur_level)
+                        continue;
+                if (!last_level && !skip_next) {
+                        runlist_entry = gk20a_runlist_construct_locked(f,
+                                                        runlist,
+                                                        cur_level + 1,
+                                                        runlist_entry,
+                                                        interleave_enabled,
+                                                        false,
+                                                        entries_left);
+                        if (!interleave_enabled)
+                                skip_next = true;
+                }
+                if (!(*entries_left))
+                        return NULL;
+                /* add TSG entry */
+                gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
+                runlist_entry[0] = gk20a_get_tsg_runlist_entry_0(tsg);
+                runlist_entry[1] = 0;
+                runlist_entry += 2;
+                count++;
+                (*entries_left)--;
+                mutex_lock(&tsg->ch_list_lock);
+                /* add runnable channels bound to this TSG */
+                list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+                        if (!test_bit(ch->hw_chid,
+                                      runlist->active_channels))
+                                continue;
+                        if (!(*entries_left)) {
+                                mutex_unlock(&tsg->ch_list_lock);
+                                return NULL;
+                        }
+                        gk20a_dbg_info("add channel %d to runlist",
+                                ch->hw_chid);
+                        runlist_entry[0] = ram_rl_entry_chid_f(ch->hw_chid);
                        runlist_entry[1] = 0;
                        runlist_entry += 2;
                        count++;
+                        (*entries_left)--;
                }
+                mutex_unlock(&tsg->ch_list_lock);
        }
-        return count;
+        /* append entries from higher level if this level is empty */
+        if (!count && !last_level)
+                runlist_entry = gk20a_runlist_construct_locked(f,
+                                                        runlist,
+                                                        cur_level + 1,
+                                                        runlist_entry,
+                                                        interleave_enabled,
+                                                        true,
+                                                        entries_left);
+        /*
+         * if previous and this level have entries, append
+         * entries from higher level.
+         *
+         * ex. dropping from MEDIUM to LOW, need to insert HIGH
+         */
+        if (interleave_enabled && count && !prev_empty && !last_level)
+                runlist_entry = gk20a_runlist_construct_locked(f,
+                                                        runlist,
+                                                        cur_level + 1,
+                                                        runlist_entry,
+                                                        interleave_enabled,
+                                                        false,
+                                                        entries_left);
+        return runlist_entry;
+}
+/*
+ * Record the runlist interleave level of a channel or a TSG.
+ *
+ * @g:          GPU instance
+ * @id:         hw channel id, or TSG id when @is_tsg is true
+ * @is_tsg:     selects the TSG table instead of the channel table
+ * @runlist_id: not used by this implementation
+ * @new_level:  one of NVGPU_RUNLIST_INTERLEAVE_LEVEL_*
+ *
+ * Only the software state is updated here; the caller is expected to
+ * trigger a runlist update afterwards for the level to take effect.
+ *
+ * Returns 0 on success, -EINVAL if @id or @new_level is out of range.
+ */
+int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
+                                u32 id,
+                                bool is_tsg,
+                                u32 runlist_id,
+                                u32 new_level)
+{
+        struct fifo_gk20a *f = &g->fifo;
+        gk20a_dbg_fn("");
+        /* the runlist builder walks channel and TSG ids below num_channels */
+        if (id >= f->num_channels)
+                return -EINVAL;
+        /* an unknown level would never be visited by the runlist builder,
+           silently dropping the entry from the runlist */
+        if (new_level >= NVGPU_RUNLIST_INTERLEAVE_NUM_LEVELS)
+                return -EINVAL;
+        if (is_tsg)
+                f->tsg[id].interleave_level = new_level;
+        else
+                f->channel[id].interleave_level = new_level;
+        return 0;
 }
 static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
@@ -2198,14 +2310,11 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
        struct fifo_gk20a *f = &g->fifo;
        struct fifo_runlist_info_gk20a *runlist = NULL;
        u32 *runlist_entry_base = NULL;
-        u32 *runlist_entry = NULL;
        u64 runlist_iova;
        u32 old_buf, new_buf;
-        u32 chid, tsgid;
        struct channel_gk20a *ch = NULL;
        struct tsg_gk20a *tsg = NULL;
        u32 count = 0;
-        u32 count_channels_in_tsg;
        runlist = &f->runlist_info[runlist_id];
        /* valid channel, add/remove it from active list.
@@ -2254,91 +2363,23 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
        if (hw_chid != ~0 || /* add/remove a valid channel */
            add /* resume to add all channels back */) {
-                runlist_entry = runlist_entry_base;
+                u32 max_entries = f->num_runlist_entries;
+                u32 *runlist_end;
-                /* Runlist manipulation:
-                   Insert an entry of all high priority channels inbetween
-                   all lower priority channels. This ensure that the maximum
-                   delay a runnable high priority channel has to wait is one
-                   medium timeslice + any context switching overhead +
-                   wait on other high priority channels.
-                   add non-TSG channels first */
-                for_each_set_bit(chid,
-                        runlist->active_channels, f->num_channels) {
-                        ch = &f->channel[chid];
-                        if (!gk20a_is_channel_marked_as_tsg(ch) &&
-                                !ch->interleave) {
-                                u32 added;
-                                gk20a_dbg_info("add normal prio channel %d to runlist",
-                                        chid);
-                                runlist_entry[0] = ram_rl_entry_chid_f(chid);
-                                runlist_entry[1] = 0;
-                                runlist_entry += 2;
-                                count++;
-                                added = gk20a_fifo_runlist_add_high_prio_entries(
-                                                f,
-                                                runlist,
-                                                runlist_entry);
-                                count += added;
-                                runlist_entry += 2 * added;
-                        }
-                }
-                /* if there were no lower priority channels, then just
+                runlist_end = gk20a_runlist_construct_locked(f,
-                 * add the high priority channels once. */
+                                                runlist,
-                if (count == 0) {
+                                                0,
-                        count = gk20a_fifo_runlist_add_high_prio_entries(
+                                                runlist_entry_base,
-                                        f,
+                                                g->runlist_interleave,
-                                        runlist,
+                                                true,
-                                        runlist_entry);
+                                                &max_entries);
-                        runlist_entry += 2 * count;
+                if (!runlist_end) {
+                        ret = -E2BIG;
+                        goto clean_up;
                }
-                /* now add TSG entries and channels bound to TSG */
+                count = (runlist_end - runlist_entry_base) / 2;
-                mutex_lock(&f->tsg_inuse_mutex);
+                WARN_ON(count > f->num_runlist_entries);
-                for_each_set_bit(tsgid,
-                                runlist->active_tsgs, f->num_channels) {
-                        u32 added;
-                        tsg = &f->tsg[tsgid];
-                        /* add TSG entry */
-                        gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
-                        runlist_entry[0] = gk20a_get_tsg_runlist_entry_0(tsg);
-                        runlist_entry[1] = 0;
-                        runlist_entry += 2;
-                        count++;
-                        /* add runnable channels bound to this TSG */
-                        count_channels_in_tsg = 0;
-                        mutex_lock(&tsg->ch_list_lock);
-                        list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
-                                if (!test_bit(ch->hw_chid,
-                                                runlist->active_channels))
-                                        continue;
-                                gk20a_dbg_info("add channel %d to runlist",
-                                        ch->hw_chid);
-                                runlist_entry[0] =
-                                        ram_rl_entry_chid_f(ch->hw_chid);
-                                runlist_entry[1] = 0;
-                                runlist_entry += 2;
-                                count++;
-                                count_channels_in_tsg++;
-                        }
-                        mutex_unlock(&tsg->ch_list_lock);
-                        WARN_ON(tsg->num_active_channels !=
-                                count_channels_in_tsg);
-                        added = gk20a_fifo_runlist_add_high_prio_entries(
-                                        f,
-                                        runlist,
-                                        runlist_entry);
-                        count += added;
-                        runlist_entry += 2 * added;
-                }
-                mutex_unlock(&f->tsg_inuse_mutex);
        } else  /* suspend to remove all channels */
                count = 0;
@@ -2493,42 +2534,6 @@ u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g)
        return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f();
 }
-int gk20a_fifo_set_channel_priority(
-                struct gk20a *g,
-                u32 runlist_id,
-                u32 hw_chid,
-                bool interleave)
-{
-        struct fifo_runlist_info_gk20a *runlist = NULL;
-        struct fifo_gk20a *f = &g->fifo;
-        struct channel_gk20a *ch = NULL;
-        if (hw_chid >= f->num_channels)
-                return -EINVAL;
-        if (runlist_id >= f->max_runlists)
-                return -EINVAL;
-        ch = &f->channel[hw_chid];
-        gk20a_dbg_fn("");
-        runlist = &f->runlist_info[runlist_id];
-        mutex_lock(&runlist->mutex);
-        if (ch->interleave)
-                set_bit(hw_chid, runlist->high_prio_channels);
-        else
-                clear_bit(hw_chid, runlist->high_prio_channels);
-        gk20a_dbg_fn("done");
-        mutex_unlock(&runlist->mutex);
-        return 0;
-}
 struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
                u32 hw_chid)
 {
@@ -2545,4 +2550,5 @@ void gk20a_init_fifo(struct gpu_ops *gops)
        gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle;
        gops->fifo.get_num_fifos = gk20a_fifo_get_num_fifos;
        gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature;
+        gops->fifo.set_runlist_interleave = gk20a_fifo_set_runlist_interleave;
 }
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index ee4e7328..0979bf2b 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -31,7 +31,6 @@
 struct fifo_runlist_info_gk20a {
        unsigned long *active_channels;
        unsigned long *active_tsgs;
-        unsigned long *high_prio_channels;
        /* Each engine has its own SW and HW runlist buffer.*/
        struct mem_desc mem[MAX_RUNLIST_BUFFERS];
        u32  cur_buffer;
@@ -184,8 +183,6 @@ void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
 int gk20a_fifo_wait_engine_idle(struct gk20a *g);
 u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
 u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
-int gk20a_fifo_set_channel_priority(struct gk20a *g, u32 runlist_id,
-                u32 hw_chid, bool interleave);
 u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
                int *__id, bool *__is_tsg);
 bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
@@ -198,4 +195,9 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
                u32 hw_chid);
 void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg);
+int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
+                                u32 id,
+                                bool is_tsg,
+                                u32 runlist_id,
+                                u32 new_level);
 #endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index fa2c61e1..0fee58e8 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -672,9 +672,6 @@ static int gk20a_init_support(struct platform_device *dev)
        mutex_init(&g->ch_wdt_lock);
        mutex_init(&g->poweroff_lock);
-        mutex_init(&g->interleave_lock);
-        g->num_interleaved_channels = 0;
        g->remove_support = gk20a_remove_support;
        return 0;
@@ -1439,14 +1436,11 @@ static int gk20a_probe(struct platform_device *dev)
        if (tegra_platform_is_silicon())
                gk20a->timeouts_enabled = true;
-        gk20a->interleave_high_priority = true;
+        gk20a->runlist_interleave = true;
        gk20a->timeslice_low_priority_us = 1300;
        gk20a->timeslice_medium_priority_us = 2600;
-        if (gk20a->interleave_high_priority)
+        gk20a->timeslice_high_priority_us = 5200;
-                gk20a->timeslice_high_priority_us = 3000;
-        else
-                gk20a->timeslice_high_priority_us = 5200;
        /* Set up initial power settings. For non-slicon platforms, disable *
         * power features and for silicon platforms, read from platform data */
@@ -1527,11 +1521,11 @@ static int gk20a_probe(struct platform_device *dev)
                                        platform->debugfs,
                                        &gk20a->timeslice_high_priority_us);
-        gk20a->debugfs_interleave_high_priority =
+        gk20a->debugfs_runlist_interleave =
-                        debugfs_create_bool("interleave_high_priority",
+                        debugfs_create_bool("runlist_interleave",
                                        S_IRUGO|S_IWUSR,
                                        platform->debugfs,
-                                        &gk20a->interleave_high_priority);
+                                        &gk20a->runlist_interleave);
        gr_gk20a_debugfs_init(gk20a);
        gk20a_pmu_debugfs_init(dev);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index afdbeef7..faccf04a 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -54,8 +54,6 @@ struct acr_gm20b;
    32 ns is the resolution of ptimer. */
 #define PTIMER_REF_FREQ_HZ                      31250000
-#define MAX_INTERLEAVED_CHANNELS                32
 struct cooling_device_gk20a {
        struct thermal_cooling_device *gk20a_cooling_dev;
        unsigned int gk20a_freq_state;
@@ -268,6 +266,9 @@ struct gpu_ops {
                u32 (*get_num_fifos)(struct gk20a *g);
                u32 (*get_pbdma_signature)(struct gk20a *g);
                int (*channel_set_priority)(struct channel_gk20a *ch, u32 priority);
+                int (*set_runlist_interleave)(struct gk20a *g, u32 id,
+                                        bool is_tsg, u32 runlist_id,
+                                        u32 new_level);
        } fifo;
        struct pmu_v {
                /*used for change of enum zbc update cmd id from ver 0 to ver1*/
@@ -536,10 +537,7 @@ struct gk20a {
        u32 timeslice_low_priority_us;
        u32 timeslice_medium_priority_us;
        u32 timeslice_high_priority_us;
-        u32 interleave_high_priority;
+        u32 runlist_interleave;
-        struct mutex interleave_lock;
-        u32 num_interleaved_channels;
        bool slcg_enabled;
        bool blcg_enabled;
@@ -564,7 +562,7 @@ struct gk20a {
        struct dentry *debugfs_timeslice_low_priority_us;
        struct dentry *debugfs_timeslice_medium_priority_us;
        struct dentry *debugfs_timeslice_high_priority_us;
-        struct dentry *debugfs_interleave_high_priority;
+        struct dentry *debugfs_runlist_interleave;
 #endif
        struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 4421744c..b41cca08 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -228,6 +228,7 @@ int gk20a_tsg_open(struct gk20a *g, struct file *filp)
        tsg->tsg_gr_ctx = NULL;
        tsg->vm = NULL;
+        tsg->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
        filp->private_data = tsg;
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index bcc4d0c4..7e0a75d1 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -49,6 +49,8 @@ struct tsg_gk20a {
        struct gr_ctx_desc *tsg_gr_ctx;
        struct vm_gk20a *vm;
+        u32 interleave_level;
 };
 int gk20a_enable_tsg(struct tsg_gk20a *tsg);
diff --git a/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c b/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
index d1deffb9..3fded03c 100644
--- a/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
@@ -1,7 +1,7 @@
 /*
 * GM20B Fifo
 *
- * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -121,4 +121,5 @@ void gm20b_init_fifo(struct gpu_ops *gops)
        gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle;
        gops->fifo.get_num_fifos = gm20b_fifo_get_num_fifos;
        gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature;
+        gops->fifo.set_runlist_interleave = gk20a_fifo_set_runlist_interleave;
 }
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index e776e97c..b4bb7f38 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -1,7 +1,7 @@
 /*
 * Virtualized GPU Fifo
 *
- * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -194,12 +194,6 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
        if (!runlist->active_channels)
                goto clean_up_runlist_info;
-        runlist->high_prio_channels =
-                kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
-                        GFP_KERNEL);
-        if (!runlist->high_prio_channels)
-                goto clean_up_runlist_info;
        runlist_size  = sizeof(u16) * f->num_channels;
        for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
                int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
@@ -222,9 +216,6 @@ clean_up_runlist:
                gk20a_gmmu_free(g, &runlist->mem[i]);
 clean_up_runlist_info:
-        kfree(runlist->high_prio_channels);
-        runlist->high_prio_channels = NULL;
        kfree(runlist->active_channels);
        runlist->active_channels = NULL;
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 442a84ac..0787d4e4 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -834,6 +834,28 @@ struct nvgpu_channel_wdt_args {
 #define NVGPU_IOCTL_CHANNEL_DISABLE_WDT         1
 #define NVGPU_IOCTL_CHANNEL_ENABLE_WDT          2
+/*
+ * Interleaving channels in a runlist is an approach to improve
+ * GPU scheduling by allowing certain channels to appear multiple
+ * times on the runlist. With L and M denoting the number of low- and
+ * medium-level entries, an entry of each level appears as follows:
+ *
+ * low (L)   : appears once
+ * medium (M): if L > 0, appears L times
+ *             else, appears once
+ * high (H)  : if L > 0, appears (M + 1) x L times
+ *             else if M > 0, appears M times
+ *             else, appears once
+ */
+struct nvgpu_runlist_interleave_args {
+        __u32 level;    /* in: one of NVGPU_RUNLIST_INTERLEAVE_LEVEL_*; other values get -EINVAL */
+        __u32 reserved; /* NOTE(review): not read by the ioctl handler in this patch; presumably padding - confirm */
+};
+#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW      0
+#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_MEDIUM   1
+#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH     2
+#define NVGPU_RUNLIST_INTERLEAVE_NUM_LEVELS     3
 #define NVGPU_IOCTL_CHANNEL_SET_NVMAP_FD        \
        _IOW(NVGPU_IOCTL_MAGIC, 5, struct nvgpu_set_nvmap_fd_args)
 #define NVGPU_IOCTL_CHANNEL_SET_TIMEOUT \
@@ -876,9 +898,11 @@ struct nvgpu_channel_wdt_args {
        _IOWR(NVGPU_IOCTL_MAGIC, 118, struct nvgpu_cycle_stats_snapshot_args)
 #define NVGPU_IOCTL_CHANNEL_WDT \
        _IOW(NVGPU_IOCTL_MAGIC, 119, struct nvgpu_channel_wdt_args)
+#define NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE \
+        _IOW(NVGPU_IOCTL_MAGIC, 120, struct nvgpu_runlist_interleave_args)
 #define NVGPU_IOCTL_CHANNEL_LAST        \
-        _IOC_NR(NVGPU_IOCTL_CHANNEL_WDT)
+        _IOC_NR(NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE)
 #define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_submit_gpfifo_args)
 /*