-rw-r--r--  drivers/gpu/nvgpu/Kconfig                         |  10
-rw-r--r--  drivers/gpu/nvgpu/Makefile                        |  14
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.c                |   6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c           | 158
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h           |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c      |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c              |  96
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c       | 586
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h       |  41
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c        | 763
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h        |  20
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c              | 328
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h              |   8
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c                   |  73
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                   |  49
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c             |  26
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c                |  26
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.c               |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h     | 190
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h            |   6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h          |  10
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.c               |  14
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c                |  76
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h                |   7
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a.h          |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c    |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c               |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/tsg_gk20a.c               |   1
-rw-r--r--  drivers/gpu/nvgpu/gk20a/tsg_gk20a.h               |   2
-rw-r--r--  drivers/gpu/nvgpu/gm20b/acr_gm20b.c               |   2
-rw-r--r--  drivers/gpu/nvgpu/gm20b/fifo_gm20b.c              |   4
-rw-r--r--  drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h            |   6
-rw-r--r--  drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h          |  10
-rw-r--r--  drivers/gpu/nvgpu/gm20b/ltc_gm20b.c               |   3
-rw-r--r--  drivers/gpu/nvgpu/gm20b/therm_gm20b.c             |   1
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c          |  21
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h          |  20
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fifo_vgpu.c                |  63
-rw-r--r--  drivers/gpu/nvgpu/vgpu/vgpu.c                     |   2
-rw-r--r--  include/linux/tegra_vgpu.h                        |  20
-rw-r--r--  include/trace/events/gk20a.h                      |   6
-rw-r--r--  include/uapi/linux/nvgpu.h                        | 154
42 files changed, 2537 insertions(+), 303 deletions(-)
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index d0e25aa2..94173976 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -54,6 +54,16 @@ config GK20A_CYCLE_STATS
 	help
 	  Say Y here to enable the cycle stats debugging features.
 
+config GK20A_CTXSW_TRACE
+	bool "Support GK20A Context Switch tracing"
+	depends on GK20A
+	default n
+	help
+	  Enable support for the GK20A Context Switch Tracing. In this mode,
+	  FECS collects timestamps for contexts loaded on GR engine. This
+	  allows tracking context switches on GR engine, as well as
+	  identifying processes that submitted work.
+
 config TEGRA_GK20A
 	bool "Enable the GK20A GPU on Tegra"
 	depends on TEGRA_GRHOST || TEGRA_HOST1X
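The option above gates the two new files added below (ctxsw_trace_gk20a.c and fecs_trace_gk20a.c). For orientation, here is a minimal sketch of how a userspace tool might drive the resulting ctxsw-trace device. The device node path is an assumption; the ioctl names and the nvgpu_ctxsw_trace_entry record come from the uapi additions in this patch (include/uapi/linux/nvgpu.h, not shown in this excerpt):

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        #include <linux/nvgpu.h>

        int main(void)
        {
                struct nvgpu_ctxsw_trace_entry ents[32];
                ssize_t n;
                int i, fd;

                fd = open("/dev/nvhost-ctxsw-gpu", O_RDONLY); /* node name is an assumption */
                if (fd < 0)
                        return 1;

                ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_ENABLE);    /* start FECS tracing */
                n = read(fd, ents, sizeof(ents));             /* read() returns whole records */
                for (i = 0; i < (int)(n / sizeof(ents[0])); i++)
                        printf("tag=%x pid=%lld ts=%llx\n", ents[i].tag,
                                (long long)ents[i].pid, (long long)ents[i].timestamp);
                ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_DISABLE);
                close(fd);
                return 0;
        }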
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index f6b3a673..df660eb7 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -8,9 +8,9 @@ ccflags-y += -Werror
 ccflags-y += -Wno-error=cpp
 
 ifeq ($(CONFIG_ARCH_TEGRA_18x_SOC),y)
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include/uapi
+ccflags-y += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
+ccflags-y += -I$(srctree)/../kernel-t18x/include
+ccflags-y += -I$(srctree)/../kernel-t18x/include/uapi
 endif
 
 obj-$(CONFIG_GK20A) := nvgpu.o
@@ -46,6 +46,8 @@ nvgpu-y := \
 	gk20a/cde_gk20a.o \
 	gk20a/platform_gk20a_generic.o \
 	gk20a/tsg_gk20a.o \
+	gk20a/ctxsw_trace_gk20a.o \
+	gk20a/fecs_trace_gk20a.o \
 	gk20a/mc_gk20a.o \
 	gm20b/hal_gm20b.o \
 	gm20b/ltc_gm20b.o \
@@ -64,7 +66,6 @@ nvgpu-y := \
 	gm20b/debug_gm20b.o \
 	gm20b/cde_gm20b.o \
 	gm20b/therm_gm20b.o
-
 nvgpu-$(CONFIG_TEGRA_GK20A) += gk20a/platform_gk20a_tegra.o
 nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o
 
@@ -78,6 +79,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
 	vgpu/debug_vgpu.o \
 	vgpu/vgpu.o \
 	vgpu/dbg_vgpu.o \
+	vgpu/fecs_trace_vgpu.o \
 	vgpu/gk20a/vgpu_hal_gk20a.o \
 	vgpu/gk20a/vgpu_gr_gk20a.o \
 	vgpu/gm20b/vgpu_hal_gm20b.o \
@@ -94,7 +96,5 @@ nvgpu-$(CONFIG_GK20A_CYCLE_STATS) += \
 	gk20a/css_gr_gk20a.o
 
 ifeq ($(CONFIG_ARCH_TEGRA_18x_SOC),y)
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include
-obj-$(CONFIG_GK20A) += ../../../../kernel-t18x/drivers/gpu/nvgpu/
+include ../kernel-t18x/drivers/gpu/nvgpu/Makefile
 endif
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
index b6b38541..0571ca1f 100644
--- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -279,13 +279,15 @@ static int gk20a_as_ioctl_get_va_regions(
 
 	for (i = 0; i < write_entries; ++i) {
 		struct nvgpu_as_va_region region;
+		struct gk20a_allocator *vma = vm->fixed.init ?
+			&vm->fixed : &vm->vma[i];
 
 		memset(&region, 0, sizeof(struct nvgpu_as_va_region));
 
 		region.page_size = vm->gmmu_page_sizes[i];
-		region.offset = vm->vma[i].base;
+		region.offset = vma->base;
 		/* No __aeabi_uldivmod() on some platforms... */
-		region.pages = (vm->vma[i].end - vm->vma[i].start) >>
+		region.pages = (vma->end - vma->start) >>
 			ilog2(region.page_size);
 
 		if (copy_to_user(user_region_ptr + i, &region, sizeof(region)))
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 1f63bbd8..20976992 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -28,6 +28,7 @@
 #include <linux/vmalloc.h>
 
 #include "debug_gk20a.h"
+#include "ctxsw_trace_gk20a.h"
 
 #include "gk20a.h"
 #include "dbg_gpu_gk20a.h"
@@ -44,6 +45,9 @@
 
 #define NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT	64	/* channels */
 
+#define NVGPU_CHANNEL_MIN_TIMESLICE_US 1000
+#define NVGPU_CHANNEL_MAX_TIMESLICE_US 50000
+
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f);
 static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
 
@@ -177,7 +181,7 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 }
 
 static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
-				u32 timeslice_period, bool interleave)
+				u32 timeslice_period)
 {
 	void *inst_ptr;
 	int shift = 0, value = 0;
@@ -205,30 +209,6 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
 		gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
 		ccsr_channel_enable_set_true_f());
 
-	if (c->interleave != interleave) {
-		mutex_lock(&c->g->interleave_lock);
-		c->interleave = interleave;
-		if (interleave)
-			if (c->g->num_interleaved_channels >=
-					MAX_INTERLEAVED_CHANNELS) {
-				gk20a_err(dev_from_gk20a(c->g),
-					"Change of priority would exceed runlist length, only changing timeslice\n");
-				c->interleave = false;
-			} else
-				c->g->num_interleaved_channels += 1;
-		else
-			c->g->num_interleaved_channels -= 1;
-
-		mutex_unlock(&c->g->interleave_lock);
-		gk20a_dbg_info("Set channel %d to interleave %d",
-			c->hw_chid, c->interleave);
-
-		gk20a_fifo_set_channel_priority(
-			c->g, 0, c->hw_chid, c->interleave);
-		c->g->ops.fifo.update_runlist(
-			c->g, 0, ~0, true, false);
-	}
-
 	return 0;
 }
 
@@ -238,6 +218,12 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c)
 	u64 timeout;
 	int val_len;
 
+	val = pbdma_acquire_retry_man_2_f() |
+		pbdma_acquire_retry_exp_2_f();
+
+	if (!c->g->timeouts_enabled)
+		return val;
+
 	timeout = gk20a_get_channel_watchdog_timeout(c);
 	do_div(timeout, 2); /* set acquire timeout to half of channel wdt */
 	timeout *= 1000000UL; /* ms -> ns */
@@ -256,11 +242,10 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c)
 		man = timeout;
 	}
 
-	val = pbdma_acquire_retry_man_2_f() |
-		pbdma_acquire_retry_exp_2_f() |
-		pbdma_acquire_timeout_exp_f(exp) |
+	val |= pbdma_acquire_timeout_exp_f(exp) |
 		pbdma_acquire_timeout_man_f(man) |
 		pbdma_acquire_timeout_en_enable_f();
+
 	return val;
 }
 
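The reworked function now programs the retry fields unconditionally and adds the timeout fields only when timeouts are enabled. The timeout itself is encoded as a mantissa/exponent pair so a 64-bit nanosecond value fits in a few register bits; here is a standalone sketch of that style of encoding (the 16-bit mantissa width is an assumption for illustration, not taken from this hunk):

        /* Encode timeout_ns as (man, exp) such that timeout_ns ~= man << exp. */
        static void encode_acquire_timeout(unsigned long long timeout_ns,
                        unsigned int *man, unsigned int *exp)
        {
                *exp = 0;
                while (timeout_ns > 0xffffULL) { /* shrink until it fits the mantissa */
                        timeout_ns >>= 1;
                        (*exp)++;
                }
                *man = (unsigned int)timeout_ns; /* hw waits roughly man << exp ns */
        }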
@@ -711,11 +696,39 @@ static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
 	return 0;
 }
 
-static int gk20a_init_error_notifier(struct channel_gk20a *ch,
-		struct nvgpu_set_error_notifier *args) {
-	void *va;
+static int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+		u32 level)
+{
+	struct gk20a *g = ch->g;
+	int ret;
+
+	if (gk20a_is_channel_marked_as_tsg(ch)) {
+		gk20a_err(dev_from_gk20a(g), "invalid operation for TSG!\n");
+		return -EINVAL;
+	}
+
+	switch (level) {
+	case NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW:
+	case NVGPU_RUNLIST_INTERLEAVE_LEVEL_MEDIUM:
+	case NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH:
+		ret = g->ops.fifo.set_runlist_interleave(g, ch->hw_chid,
+					false, 0, level);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
 
+	return ret ? ret : g->ops.fifo.update_runlist(g, 0, ~0, true, true);
+}
+
+static int gk20a_init_error_notifier(struct channel_gk20a *ch,
+		struct nvgpu_set_error_notifier *args)
+{
+	struct device *dev = dev_from_gk20a(ch->g);
 	struct dma_buf *dmabuf;
+	void *va;
+	u64 end = args->offset + sizeof(struct nvgpu_notification);
 
 	if (!args->mem) {
 		pr_err("gk20a_init_error_notifier: invalid memory handle\n");
@@ -731,6 +744,13 @@ static int gk20a_init_error_notifier(struct channel_gk20a *ch,
 		pr_err("Invalid handle: %d\n", args->mem);
 		return -EINVAL;
 	}
+
+	if (end > dmabuf->size || end < sizeof(struct nvgpu_notification)) {
+		dma_buf_put(dmabuf);
+		gk20a_err(dev, "gk20a_init_error_notifier: invalid offset\n");
+		return -EINVAL;
+	}
+
 	/* map handle */
 	va = dma_buf_vmap(dmabuf);
 	if (!va) {
@@ -890,17 +910,6 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 	}
 	mutex_unlock(&f->deferred_reset_mutex);
 
-	if (ch->interleave) {
-		ch->interleave = false;
-		gk20a_fifo_set_channel_priority(
-			ch->g, 0, ch->hw_chid, ch->interleave);
-
-		mutex_lock(&f->g->interleave_lock);
-		WARN_ON(f->g->num_interleaved_channels == 0);
-		f->g->num_interleaved_channels -= 1;
-		mutex_unlock(&f->g->interleave_lock);
-	}
-
 	if (!ch->bound)
 		goto release;
 
@@ -912,6 +921,9 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 
 	gk20a_free_error_notifiers(ch);
 
+	if (g->ops.fecs_trace.unbind_channel)
+		g->ops.fecs_trace.unbind_channel(g, ch);
+
 	/* release channel ctx */
 	g->ops.gr.free_channel_ctx(ch);
 
@@ -1145,11 +1157,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
 	ch->has_timedout = false;
 	ch->wdt_enabled = true;
 	ch->obj_class = 0;
-	ch->interleave = false;
 	ch->clean_up.scheduled = false;
-	gk20a_fifo_set_channel_priority(
-		ch->g, 0, ch->hw_chid, ch->interleave);
-
+	ch->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
 
 	/* The channel is *not* runnable at this point. It still needs to have
 	 * an address space bound and allocate a gpfifo and grctx. */
@@ -1697,6 +1706,10 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
 	/* Need global lock since multiple channels can timeout at a time */
 	mutex_lock(&g->ch_wdt_lock);
 
+	gk20a_debug_dump(g->dev);
+	gk20a_gr_debug_dump(g->dev);
+
+
 	/* Get timed out job and reset the timer */
 	mutex_lock(&ch->timeout.lock);
 	job = ch->timeout.job;
@@ -2399,6 +2412,7 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 	u32 offset;
 	unsigned long timeout;
 	int remain, ret = 0;
+	u64 end;
 
 	gk20a_dbg_fn("");
 
@@ -2414,6 +2428,7 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 	case NVGPU_WAIT_TYPE_NOTIFIER:
 		id = args->condition.notifier.dmabuf_fd;
 		offset = args->condition.notifier.offset;
+		end = offset + sizeof(struct notification);
 
 		dmabuf = dma_buf_get(id);
 		if (IS_ERR(dmabuf)) {
@@ -2422,6 +2437,12 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 			return -EINVAL;
 		}
 
+		if (end > dmabuf->size || end < sizeof(struct notification)) {
+			dma_buf_put(dmabuf);
+			gk20a_err(d, "invalid notifier offset\n");
+			return -EINVAL;
+		}
+
 		notif = dma_buf_vmap(dmabuf);
 		if (!notif) {
 			gk20a_err(d, "failed to map notifier memory");
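Both new range checks (here and in gk20a_init_error_notifier above) use the same two-sided test, and the second comparison is what catches unsigned wraparound. A small sketch of the invariant being enforced:

        #include <stdbool.h>
        #include <stdint.h>

        /* Accept offset iff [offset, offset + rec_size) lies inside the buffer.
         * end < rec_size can only be true if offset + rec_size wrapped around,
         * e.g. offset = ~0ULL - 2 with rec_size = 16 gives end = 13. */
        static bool notifier_range_ok(uint64_t offset, uint64_t rec_size,
                        uint64_t buf_size)
        {
                uint64_t end = offset + rec_size;

                return end <= buf_size && end >= rec_size;
        }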
@@ -2596,7 +2617,6 @@ unsigned int gk20a_channel_poll(struct file *filep, poll_table *wait)
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 {
 	u32 timeslice_timeout;
-	bool interleave = false;
 
 	if (gk20a_is_channel_marked_as_tsg(ch)) {
 		gk20a_err(dev_from_gk20a(ch->g),
@@ -2613,8 +2633,6 @@ int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 		timeslice_timeout = ch->g->timeslice_medium_priority_us;
 		break;
 	case NVGPU_PRIORITY_HIGH:
-		if (ch->g->interleave_high_priority)
-			interleave = true;
 		timeslice_timeout = ch->g->timeslice_high_priority_us;
 		break;
 	default:
@@ -2623,7 +2641,22 @@ int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 	}
 
 	return channel_gk20a_set_schedule_params(ch,
-			timeslice_timeout, interleave);
+			timeslice_timeout);
+}
+
+int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice)
+{
+	if (gk20a_is_channel_marked_as_tsg(ch)) {
+		gk20a_err(dev_from_gk20a(ch->g),
+			"invalid operation for TSG!\n");
+		return -EINVAL;
+	}
+
+	if (timeslice < NVGPU_CHANNEL_MIN_TIMESLICE_US ||
+		timeslice > NVGPU_CHANNEL_MAX_TIMESLICE_US)
+		return -EINVAL;
+
+	return channel_gk20a_set_schedule_params(ch, timeslice);
 }
 
 static int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
@@ -2778,6 +2811,7 @@ void gk20a_init_channel(struct gpu_ops *gops)
 	gops->fifo.free_inst = channel_gk20a_free_inst;
 	gops->fifo.setup_ramfc = channel_gk20a_setup_ramfc;
 	gops->fifo.channel_set_priority = gk20a_channel_set_priority;
+	gops->fifo.channel_set_timeslice = gk20a_channel_set_timeslice;
 }
 
 long gk20a_channel_ioctl(struct file *filp,
@@ -3028,6 +3062,30 @@ long gk20a_channel_ioctl(struct file *filp,
 		err = gk20a_channel_set_wdt_status(ch,
 				(struct nvgpu_channel_wdt_args *)buf);
 		break;
+	case NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE:
+		err = gk20a_busy(dev);
+		if (err) {
+			dev_err(&dev->dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+		err = gk20a_channel_set_runlist_interleave(ch,
+			((struct nvgpu_runlist_interleave_args *)buf)->level);
+		gk20a_idle(dev);
+		break;
+	case NVGPU_IOCTL_CHANNEL_SET_TIMESLICE:
+		err = gk20a_busy(dev);
+		if (err) {
+			dev_err(&dev->dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+		err = ch->g->ops.fifo.channel_set_timeslice(ch,
+			((struct nvgpu_timeslice_args *)buf)->timeslice_us);
+		gk20a_idle(dev);
+		break;
 	default:
 		dev_dbg(&dev->dev, "unrecognized ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
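From userspace, the two new channel ioctls are plain ioctl() calls on the channel fd. A minimal sketch using the argument structs referenced above (field names as used in this hunk; a 2 ms timeslice sits inside the 1000..50000 us range enforced by gk20a_channel_set_timeslice):

        #include <sys/ioctl.h>
        #include <linux/nvgpu.h>

        static int tune_channel(int channel_fd)
        {
                struct nvgpu_timeslice_args ts = { .timeslice_us = 2000 };
                struct nvgpu_runlist_interleave_args il = {
                        .level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH,
                };

                if (ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_SET_TIMESLICE, &ts))
                        return -1;
                return ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE, &il);
        }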
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 4aea9d19..e3fbba3e 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -188,8 +188,7 @@ struct channel_gk20a {
 	spinlock_t update_fn_lock; /* make access to the two above atomic */
 	struct work_struct update_fn_work;
 
-	/* true if channel is interleaved with lower priority channels */
-	bool interleave;
+	u32 interleave_level;
 };
 
 static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
@@ -276,5 +275,6 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 		int timeslice_period,
 		int *__timeslice_timeout, int *__timeslice_scale);
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority);
+int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice);
 
 #endif /* CHANNEL_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 8ff53d17..87f0bf74 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -115,8 +115,10 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
 	}
 
 	num_wait_cmds = nvhost_sync_num_pts(sync_fence);
-	if (num_wait_cmds == 0)
+	if (num_wait_cmds == 0) {
+		sync_fence_put(sync_fence);
 		return 0;
+	}
 
 	err = gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, &wait_cmd);
 	if (err) {
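This hunk closes a reference leak: the fence reference taken when the fd was looked up earlier in the function was never dropped on the zero-wait early return, so sync_fence_put() now balances it before returning.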
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
index 9ed5fef3..b2ae224f 100644
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -29,6 +29,7 @@
 #include "hw_gr_gk20a.h"
 #include "hw_fb_gk20a.h"
 #include "hw_proj_gk20a.h"
+#include "hw_timer_gk20a.h"
 
 int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 {
@@ -272,6 +273,8 @@ static int nvgpu_gpu_ioctl_inval_icache(
 	struct nvgpu_dbg_gpu_reg_op ops;
 
 	ch = gk20a_get_channel_from_file(args->channel_fd);
+	if (!ch)
+		return -EINVAL;
 
 	ops.op = REGOP(READ_32);
 	ops.type = REGOP(TYPE_GR_CTX);
@@ -528,6 +531,94 @@ static int gk20a_ctrl_get_buffer_info(
 			&args->out.id, &args->out.length);
 }
 
+static inline u64 get_cpu_timestamp_tsc(void)
+{
+	return ((u64) get_cycles());
+}
+
+static inline u64 get_cpu_timestamp_jiffies(void)
+{
+	return (get_jiffies_64() - INITIAL_JIFFIES);
+}
+
+static inline u64 get_cpu_timestamp_timeofday(void)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	return timeval_to_jiffies(&tv);
+}
+
+static inline int get_timestamps_zipper(struct gk20a *g,
+		u64 (*get_cpu_timestamp)(void),
+		struct nvgpu_gpu_get_cpu_time_correlation_info_args *args)
+{
+	int err = 0;
+	int i = 0;
+	u32 gpu_timestamp_hi_new = 0;
+	u32 gpu_timestamp_hi_old = 0;
+
+	if (gk20a_busy(g->dev)) {
+		gk20a_err(dev_from_gk20a(g), "GPU not powered on\n");
+		return -EINVAL;
+	}
+
+	/* get zipper reads of gpu and cpu counter values */
+	gpu_timestamp_hi_old = gk20a_readl(g, timer_time_1_r());
+	for (i = 0; i < args->count; i++) {
+		u32 gpu_timestamp_lo = 0;
+		u32 gpu_timestamp_hi = 0;
+
+		gpu_timestamp_lo = gk20a_readl(g, timer_time_0_r());
+		args->samples[i].cpu_timestamp = get_cpu_timestamp();
+		rmb(); /* maintain zipper read order */
+		gpu_timestamp_hi_new = gk20a_readl(g, timer_time_1_r());
+
+		/* pick the appropriate gpu counter hi bits */
+		gpu_timestamp_hi = (gpu_timestamp_lo & (1L << 31)) ?
+			gpu_timestamp_hi_old : gpu_timestamp_hi_new;
+
+		args->samples[i].gpu_timestamp =
+			((u64)gpu_timestamp_hi << 32) | (u64)gpu_timestamp_lo;
+
+		gpu_timestamp_hi_old = gpu_timestamp_hi_new;
+	}
+
+	gk20a_idle(g->dev);
+	return err;
+}
+
+static int nvgpu_gpu_get_cpu_time_correlation_info(
+	struct gk20a *g,
+	struct nvgpu_gpu_get_cpu_time_correlation_info_args *args)
+{
+	int err = 0;
+	u64 (*get_cpu_timestamp)(void) = NULL;
+
+	if (args->count > NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_MAX_COUNT)
+		return -EINVAL;
+
+	switch (args->source_id) {
+	case NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TSC:
+		get_cpu_timestamp = get_cpu_timestamp_tsc;
+		break;
+	case NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_JIFFIES:
+		get_cpu_timestamp = get_cpu_timestamp_jiffies;
+		break;
+	case NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TIMEOFDAY:
+		get_cpu_timestamp = get_cpu_timestamp_timeofday;
+		break;
+	default:
+		gk20a_err(dev_from_gk20a(g), "invalid cpu clock source id\n");
+		return -EINVAL;
+	}
+
+	err = get_timestamps_zipper(g, get_cpu_timestamp, args);
+	return err;
+}
+
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct platform_device *dev = filp->private_data;
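With the sampling loop above, each args->samples[i] pairs a CPU timestamp with a GPU PTIMER read taken back-to-back. A sketch of user-side post-processing (averaging the per-pair deltas is an assumed strategy; the patch only supplies the raw pairs):

        /* Estimate a constant offset such that cpu_time ~= gpu_time + offset. */
        static long long estimate_clock_offset(
                const struct nvgpu_gpu_get_cpu_time_correlation_info_args *args)
        {
                long long acc = 0;
                unsigned int i;

                for (i = 0; i < args->count; i++)
                        acc += (long long)(args->samples[i].cpu_timestamp -
                                           args->samples[i].gpu_timestamp);

                return args->count ? acc / (long long)args->count : 0;
        }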
@@ -760,6 +851,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_get_buffer_info_args *)buf);
 		break;
 
+	case NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO:
+		err = nvgpu_gpu_get_cpu_time_correlation_info(g,
+			(struct nvgpu_gpu_get_cpu_time_correlation_info_args *)buf);
+		break;
+
 	default:
 		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
new file mode 100644
index 00000000..9e7c04ad
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/barrier.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/circ_buf.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/nvgpu.h>
+#include <linux/hashtable.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <uapi/linux/nvgpu.h>
+#include "ctxsw_trace_gk20a.h"
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "hw_ctxsw_prog_gk20a.h"
+#include "hw_gr_gk20a.h"
+
+#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE	(128*PAGE_SIZE)
+
+/* Userland-facing FIFO (one global + eventually one per VM) */
+struct gk20a_ctxsw_dev {
+	struct gk20a *g;
+
+	struct nvgpu_ctxsw_ring_header *hdr;
+	struct nvgpu_ctxsw_trace_entry *ents;
+	struct nvgpu_ctxsw_trace_filter filter;
+	bool write_enabled;
+	wait_queue_head_t readout_wq;
+	size_t size;
+
+	atomic_t vma_ref;
+
+	struct mutex lock;
+};
+
+
+struct gk20a_ctxsw_trace {
+	struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS];
+};
+
+static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return (hdr->write_idx == hdr->read_idx);
+}
+
+static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx;
+}
+
+static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return (hdr->write_idx - hdr->read_idx) % hdr->num_ents;
+}
+
+static inline int ring_space(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return (hdr->read_idx - hdr->write_idx - 1) % hdr->num_ents;
+}
+
+ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size,
+	loff_t *off)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
+	struct nvgpu_ctxsw_trace_entry __user *entry =
+		(struct nvgpu_ctxsw_trace_entry *) buf;
+	size_t copied = 0;
+	int err;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
+		"filp=%p buf=%p size=%zu", filp, buf, size);
+
+	mutex_lock(&dev->lock);
+	while (ring_is_empty(hdr)) {
+		mutex_unlock(&dev->lock);
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+		err = wait_event_interruptible(dev->readout_wq,
+			!ring_is_empty(hdr));
+		if (err)
+			return err;
+		mutex_lock(&dev->lock);
+	}
+
+	while (size >= sizeof(struct nvgpu_ctxsw_trace_entry)) {
+		if (ring_is_empty(hdr))
+			break;
+
+		if (copy_to_user(entry, &dev->ents[hdr->read_idx],
+			sizeof(*entry))) {
+			mutex_unlock(&dev->lock);
+			return -EFAULT;
+		}
+
+		hdr->read_idx++;
+		if (hdr->read_idx >= hdr->num_ents)
+			hdr->read_idx = 0;
+
+		entry++;
+		copied += sizeof(*entry);
+		size -= sizeof(*entry);
+	}
+
+	gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied,
+		hdr->read_idx);
+
+	*off = hdr->read_idx;
+	mutex_unlock(&dev->lock);
+
+	return copied;
+}
+
+static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev)
+{
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled");
+	dev->write_enabled = true;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev)
+{
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled");
+	dev->write_enabled = false;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ring_alloc(struct gk20a_ctxsw_dev *dev,
+	size_t size)
+{
+	struct nvgpu_ctxsw_ring_header *hdr;
+
+	if (atomic_read(&dev->vma_ref))
+		return -EBUSY;
+
+	if ((dev->write_enabled) || (atomic_read(&dev->vma_ref)))
+		return -EBUSY;
+
+	size = roundup(size, PAGE_SIZE);
+	hdr = vmalloc_user(size);
+	if (!hdr)
+		return -ENOMEM;
+
+	if (dev->hdr)
+		vfree(dev->hdr);
+
+	dev->hdr = hdr;
+	dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1);
+	dev->size = size;
+
+	hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC;
+	hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION;
+	hdr->num_ents = (size - sizeof(struct nvgpu_ctxsw_ring_header))
+		/ sizeof(struct nvgpu_ctxsw_trace_entry);
+	hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry);
+	hdr->drop_count = 0;
+	hdr->read_idx = 0;
+	hdr->write_idx = 0;
+	hdr->write_seqno = 0;
+
+	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d",
+		dev->size, dev->hdr, dev->ents, hdr->num_ents);
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev,
+	struct nvgpu_ctxsw_ring_setup_args *args)
+{
+	size_t size = args->size;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size);
+
+	if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE)
+		return -EINVAL;
+
+	return gk20a_ctxsw_dev_ring_alloc(dev, size);
+}
+
+static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev,
+	struct nvgpu_ctxsw_trace_filter_args *args)
+{
+	dev->filter = args->filter;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev,
+	struct nvgpu_ctxsw_trace_filter_args *args)
+{
+	args->filter = dev->filter;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev)
+{
+	struct gk20a *g = dev->g;
+	int err;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
+
+	err = gk20a_busy(g->dev);
+	if (err)
+		return err;
+
+	if (g->ops.fecs_trace.flush)
+		err = g->ops.fecs_trace.flush(g);
+
+	if (likely(!err))
+		err = g->ops.fecs_trace.poll(g);
+
+	gk20a_idle(g->dev);
+	return err;
+}
+
+int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp)
+{
+	struct gk20a *g;
+	struct gk20a_ctxsw_trace *trace;
+	struct gk20a_ctxsw_dev *dev;
+	int err;
+	size_t size;
+	u32 n;
+
+	/* only one VM for now */
+	const int vmid = 0;
+
+	g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = gk20a_busy(g->dev);
+	if (err)
+		return err;
+
+	trace = g->ctxsw_trace;
+	if (!trace) {
+		err = -ENODEV;
+		goto idle;
+	}
+
+	/* Allow only one user for this device */
+	dev = &trace->devs[vmid];
+	mutex_lock(&dev->lock);
+	if (dev->hdr) {
+		err = -EBUSY;
+		goto done;
+	}
+
+	/* By default, allocate ring buffer big enough to accommodate
+	 * FECS records with default event filter */
+
+	/* enable all traces by default */
+	NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter);
+
+	/* compute max number of entries generated with this filter */
+	n = g->ops.fecs_trace.max_entries(g, &dev->filter);
+
+	size = sizeof(struct nvgpu_ctxsw_ring_header) +
+			n * sizeof(struct nvgpu_ctxsw_trace_entry);
+	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu",
+		size, n, sizeof(struct nvgpu_ctxsw_trace_entry));
+
+	err = gk20a_ctxsw_dev_ring_alloc(dev, size);
+	if (!err) {
+		filp->private_data = dev;
+		gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu",
+			filp, dev, size);
+	}
+
+	err = g->ops.fecs_trace.enable(g);
+
+done:
+	mutex_unlock(&dev->lock);
+
+idle:
+	gk20a_idle(g->dev);
+
+	return err;
+}
+
+int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct gk20a *g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev);
+
+	mutex_lock(&dev->lock);
+	dev->write_enabled = false;
+	if (dev->hdr) {
+		vfree(dev->hdr);
+		dev->hdr = NULL;
+	}
+
+	g->ops.fecs_trace.disable(g);
+
+	mutex_unlock(&dev->lock);
+
+	return 0;
+}
+
+long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd,
+	unsigned long arg)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct gk20a *g = dev->g;
+	u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE];
+	int err = 0;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd));
+
+	if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || (_IOC_NR(cmd) == 0)
+		|| (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST))
+		return -EINVAL;
+
+	BUG_ON(_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE);
+
+	memset(buf, 0, sizeof(buf));
+	if (_IOC_DIR(cmd) & _IOC_WRITE) {
+		if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd)))
+			return -EFAULT;
+	}
+
+	mutex_lock(&dev->lock);
+
+	switch (cmd) {
+	case NVGPU_CTXSW_IOCTL_TRACE_ENABLE:
+		err = gk20a_ctxsw_dev_ioctl_trace_enable(dev);
+		break;
+	case NVGPU_CTXSW_IOCTL_TRACE_DISABLE:
+		err = gk20a_ctxsw_dev_ioctl_trace_disable(dev);
+		break;
+	case NVGPU_CTXSW_IOCTL_RING_SETUP:
+		err = gk20a_ctxsw_dev_ioctl_ring_setup(dev,
+			(struct nvgpu_ctxsw_ring_setup_args *) buf);
+		break;
+	case NVGPU_CTXSW_IOCTL_SET_FILTER:
+		err = gk20a_ctxsw_dev_ioctl_set_filter(dev,
+			(struct nvgpu_ctxsw_trace_filter_args *) buf);
+		break;
+	case NVGPU_CTXSW_IOCTL_GET_FILTER:
+		err = gk20a_ctxsw_dev_ioctl_get_filter(dev,
+			(struct nvgpu_ctxsw_trace_filter_args *) buf);
+		break;
+	case NVGPU_CTXSW_IOCTL_POLL:
+		mutex_unlock(&dev->lock);
+		err = gk20a_ctxsw_dev_ioctl_poll(dev);
+		mutex_lock(&dev->lock);
+		break;
+	default:
+		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x",
+			cmd);
+		err = -ENOTTY;
+	}
+
+	mutex_unlock(&dev->lock);
+
+	if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
+		err = copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd));
+
+	return err;
+}
+
+unsigned int gk20a_ctxsw_dev_poll(struct file *filp, poll_table *wait)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
+	unsigned int mask = 0;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
+
+	mutex_lock(&dev->lock);
+	poll_wait(filp, &dev->readout_wq, wait);
+	if (!ring_is_empty(hdr))
+		mask |= POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->lock);
+
+	return mask;
+}
+
+static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma)
+{
+	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;
+
+	atomic_inc(&dev->vma_ref);
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
+		atomic_read(&dev->vma_ref));
+}
+
+static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma)
+{
+	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;
+
+	atomic_dec(&dev->vma_ref);
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
+		atomic_read(&dev->vma_ref));
+}
+
+static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = {
+	.open = gk20a_ctxsw_dev_vma_open,
+	.close = gk20a_ctxsw_dev_vma_close,
+};
+
+int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	int ret;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx",
+		vma->vm_start, vma->vm_end);
+
+	ret = remap_vmalloc_range(vma, dev->hdr, 0);
+	if (likely(!ret)) {
+		vma->vm_private_data = dev;
+		vma->vm_ops = &gk20a_ctxsw_dev_vma_ops;
+		vma->vm_ops->open(vma);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+static int gk20a_ctxsw_init_devs(struct gk20a *g)
+{
+	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
+	struct gk20a_ctxsw_dev *dev = trace->devs;
+	int i;
+
+	for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) {
+		dev->g = g;
+		dev->hdr = NULL;
+		dev->write_enabled = false;
+		init_waitqueue_head(&dev->readout_wq);
+		mutex_init(&dev->lock);
+		atomic_set(&dev->vma_ref, 0);
+		dev++;
+	}
+	return 0;
+}
+#endif
+
+int gk20a_ctxsw_trace_init(struct gk20a *g)
+{
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
+	int err;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace);
+
+	if (likely(trace))
+		return 0;
+
+	trace = kzalloc(sizeof(*trace), GFP_KERNEL);
+	if (unlikely(!trace))
+		return -ENOMEM;
+	g->ctxsw_trace = trace;
+
+	err = gk20a_ctxsw_init_devs(g);
+	if (err)
+		goto fail;
+
+	err = g->ops.fecs_trace.init(g);
+	if (unlikely(err))
+		goto fail;
+
+	return 0;
+
+fail:
+	kfree(trace);
+	g->ctxsw_trace = NULL;
+	return err;
+#else
+	return 0;
+#endif
+}
+
+void gk20a_ctxsw_trace_cleanup(struct gk20a *g)
+{
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+	kfree(g->ctxsw_trace);
+	g->ctxsw_trace = NULL;
+
+	g->ops.fecs_trace.deinit(g);
+#endif
+}
+
+int gk20a_ctxsw_trace_write(struct gk20a *g,
+		struct nvgpu_ctxsw_trace_entry *entry)
+{
+	struct nvgpu_ctxsw_ring_header *hdr;
+	struct gk20a_ctxsw_dev *dev;
+	int ret = 0;
+	const char *reason;
+
+	if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS))
+		return -ENODEV;
+
+	dev = &g->ctxsw_trace->devs[entry->vmid];
+	hdr = dev->hdr;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
+		"dev=%p hdr=%p", dev, hdr);
+
+	mutex_lock(&dev->lock);
+
+	if (unlikely(!hdr)) {
+		/* device has been released */
+		ret = -ENODEV;
+		goto done;
+	}
+
+	entry->seqno = hdr->write_seqno++;
+
+	if (!dev->write_enabled) {
+		ret = -EBUSY;
+		reason = "write disabled";
+		goto drop;
+	}
+
+	if (unlikely(ring_is_full(hdr))) {
+		ret = -ENOSPC;
+		reason = "user fifo full";
+		goto drop;
+	}
+
+	if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) {
+		reason = "filtered out";
+		goto filter;
+	}
+
+	gk20a_dbg(gpu_dbg_ctxsw,
+		"seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx",
+		entry->seqno, entry->context_id, entry->pid,
+		entry->tag, entry->timestamp);
+
+	dev->ents[hdr->write_idx] = *entry;
+
+	/* ensure record is written before updating write index */
+	smp_wmb();
+
+	hdr->write_idx++;
+	if (unlikely(hdr->write_idx >= hdr->num_ents))
+		hdr->write_idx = 0;
+	gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d",
+		hdr->read_idx, hdr->write_idx, ring_len(hdr));
+
+	mutex_unlock(&dev->lock);
+	return ret;
+
+drop:
+	hdr->drop_count++;
+
+filter:
+	gk20a_dbg(gpu_dbg_ctxsw,
+		"dropping seqno=%d context_id=%08x pid=%lld "
+		"tag=%x time=%llx (%s)",
+		entry->seqno, entry->context_id, entry->pid,
+		entry->tag, entry->timestamp, reason);
+
+done:
+	mutex_unlock(&dev->lock);
+	return ret;
+}
+
+void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid)
+{
+	struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[vmid];
+
+	wake_up_interruptible(&dev->readout_wq);
+}
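The ring helpers at the top of this file implement a classic one-slot-open circular buffer: read_idx and write_idx are always kept in [0, num_ents) by the read/write paths, so the modulo arithmetic stays exact. A tiny standalone check of the arithmetic:

        #include <stdio.h>

        struct hdr { unsigned int write_idx, read_idx, num_ents; };

        static int ring_len(const struct hdr *h)
        {
                return (h->write_idx - h->read_idx) % h->num_ents;
        }

        static int ring_is_full(const struct hdr *h)
        {
                return ((h->write_idx + 1) % h->num_ents) == h->read_idx;
        }

        int main(void)
        {
                struct hdr h = { .write_idx = 3, .read_idx = 0, .num_ents = 4 };

                /* One slot is always left empty, so capacity is num_ents - 1. */
                printf("len=%d full=%d\n", ring_len(&h), ring_is_full(&h)); /* len=3 full=1 */
                return 0;
        }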
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h
new file mode 100644
index 00000000..c57d95d1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __CTXSW_TRACE_GK20A_H
+#define __CTXSW_TRACE_GK20A_H
+
+#define GK20A_CTXSW_TRACE_NUM_DEVS	1
+
+struct gk20a;
+struct nvgpu_ctxsw_trace_entry;
+struct channel_gk20a;
+struct channel_ctx_gk20a;
+struct gk20a_ctxsw_dev;
+struct gk20a_fecs_trace;
+
+
+int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp);
+int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp);
+long gk20a_ctxsw_dev_ioctl(struct file *filp,
+	unsigned int cmd, unsigned long arg);
+ssize_t gk20a_ctxsw_dev_read(struct file *, char __user *, size_t, loff_t *);
+unsigned int gk20a_ctxsw_dev_poll(struct file *, struct poll_table_struct *);
+int gk20a_ctxsw_dev_mmap(struct file *, struct vm_area_struct *);
+
+int gk20a_ctxsw_trace_init(struct gk20a *);
+int gk20a_ctxsw_trace_setup(struct gk20a *, void *ctx_ptr);
+void gk20a_ctxsw_trace_cleanup(struct gk20a *);
+int gk20a_ctxsw_trace_write(struct gk20a *, struct nvgpu_ctxsw_trace_entry *);
+void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid);
+
+#endif /* __CTXSW_TRACE_GK20A_H */
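These handlers carry the standard file_operations signatures; a sketch of the expected hookup (the real wiring is in gk20a.c, which this patch also touches but is not shown in this excerpt):

        static const struct file_operations gk20a_ctxsw_ops = {
                .owner = THIS_MODULE,
                .release = gk20a_ctxsw_dev_release,
                .open = gk20a_ctxsw_dev_open,
                .unlocked_ioctl = gk20a_ctxsw_dev_ioctl,
                .read = gk20a_ctxsw_dev_read,
                .poll = gk20a_ctxsw_dev_poll,
                .mmap = gk20a_ctxsw_dev_mmap,
        };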
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
new file mode 100644
index 00000000..bac36403
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -0,0 +1,763 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include <asm/barrier.h>
15#include <linux/slab.h>
16#include <linux/kthread.h>
17#include <linux/circ_buf.h>
18#include <linux/delay.h>
19#include <linux/jiffies.h>
20#include <linux/wait.h>
21#include <linux/ktime.h>
22#include <linux/nvgpu.h>
23#include <linux/hashtable.h>
24#include <linux/debugfs.h>
25#include <linux/log2.h>
26#include <uapi/linux/nvgpu.h>
27#include "ctxsw_trace_gk20a.h"
28#include "fecs_trace_gk20a.h"
29#include "gk20a.h"
30#include "gr_gk20a.h"
31#include "hw_ctxsw_prog_gk20a.h"
32#include "hw_gr_gk20a.h"
33
34/*
35 * If HW circular buffer is getting too many "buffer full" conditions,
36 * increasing this constant should help (it drives Linux' internal buffer size).
37 */
38#define GK20A_FECS_TRACE_NUM_RECORDS (1 << 6)
39#define GK20A_FECS_TRACE_HASH_BITS 8 /* 2^8 */
40#define GK20A_FECS_TRACE_FRAME_PERIOD_NS (1000000000ULL/60ULL)
41#define GK20A_FECS_TRACE_PTIMER_SHIFT 5
42
43struct gk20a_fecs_trace_record {
44 u32 magic_lo;
45 u32 magic_hi;
46 u32 context_id;
47 u32 context_ptr;
48 u32 new_context_id;
49 u32 new_context_ptr;
50 u64 ts[];
51};
52
53struct gk20a_fecs_trace_hash_ent {
54 u32 context_ptr;
55 pid_t pid;
56 struct hlist_node node;
57};
58
59struct gk20a_fecs_trace {
60
61 struct mem_desc trace_buf;
62 DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
63 struct mutex hash_lock;
64 struct mutex poll_lock;
65 u64 sof;
66 u32 sof_mask; /* did we already send a SOF for this VM */
67
68 struct task_struct *poll_task;
69};
70
71#ifdef CONFIG_GK20A_CTXSW_TRACE
72static inline u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
73{
74 return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
75}
76
77static inline u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
78{
79 return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
80}
81
82
83static u32 gk20a_fecs_trace_fecs_context_ptr(struct channel_gk20a *ch)
84{
85 return (u32) (sg_phys(ch->inst_block.sgt->sgl) >> 12LL);
86}
87
88static inline int gk20a_fecs_trace_num_ts(void)
89{
90 return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
91 - sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
92}
93
94struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
95 struct gk20a_fecs_trace *trace, int idx)
96{
97 return (struct gk20a_fecs_trace_record *)
98 ((u8 *) trace->trace_buf.cpu_va
99 + (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
100}
101
102static bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
103{
104 /*
105 * testing magic_hi should suffice. magic_lo is sometimes used
106 * as a sequence number in experimental ucode.
107 */
108 return (r->magic_hi
109 == ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
110}
111
112static int gk20a_fecs_trace_get_read_index(struct gk20a *g)
113{
114 return gr_gk20a_elpg_protected_call(g,
115 gk20a_readl(g, gr_fecs_mailbox1_r()));
116}
117
118static int gk20a_fecs_trace_get_write_index(struct gk20a *g)
119{
120 return gr_gk20a_elpg_protected_call(g,
121 gk20a_readl(g, gr_fecs_mailbox0_r()));
122}
123
124static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
125{
126 gk20a_dbg(gpu_dbg_ctxsw, "set read=%d", index);
127 return gr_gk20a_elpg_protected_call(g,
128 (gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
129}
130
131void gk20a_fecs_trace_hash_dump(struct gk20a *g)
132{
133 u32 bkt;
134 struct gk20a_fecs_trace_hash_ent *ent;
135 struct gk20a_fecs_trace *trace = g->fecs_trace;
136
137 gk20a_dbg(gpu_dbg_ctxsw, "dumping hash table");
138
139 mutex_lock(&trace->hash_lock);
140 hash_for_each(trace->pid_hash_table, bkt, ent, node)
141 {
142 gk20a_dbg(gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d",
143 ent, bkt, ent->context_ptr, ent->pid);
144
145 }
146 mutex_unlock(&trace->hash_lock);
147}
148
149static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr, pid_t pid)
150{
151 struct gk20a_fecs_trace_hash_ent *he;
152 struct gk20a_fecs_trace *trace = g->fecs_trace;
153
154 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
155 "adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);
156
157 he = kzalloc(sizeof(*he), GFP_KERNEL);
158 if (unlikely(!he)) {
159 gk20a_warn(dev_from_gk20a(g),
160 "can't alloc new hash entry for context_ptr=%x pid=%d",
161 context_ptr, pid);
162 return -ENOMEM;
163 }
164
165 he->context_ptr = context_ptr;
166 he->pid = pid;
167 mutex_lock(&trace->hash_lock);
168 hash_add(trace->pid_hash_table, &he->node, context_ptr);
169 mutex_unlock(&trace->hash_lock);
170 return 0;
171}
172
173static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
174{
175 struct hlist_node *tmp;
176 struct gk20a_fecs_trace_hash_ent *ent;
177 struct gk20a_fecs_trace *trace = g->fecs_trace;
178
179 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
180 "freeing hash entry context_ptr=%x", context_ptr);
181
182 mutex_lock(&trace->hash_lock);
183 hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
184 context_ptr) {
185 if (ent->context_ptr == context_ptr) {
186 hash_del(&ent->node);
187 gk20a_dbg(gpu_dbg_ctxsw,
188 "freed hash entry=%p context_ptr=%x", ent,
189 ent->context_ptr);
190 kfree(ent);
191 break;
192 }
193 }
194 mutex_unlock(&trace->hash_lock);
195}
196
197static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
198{
199 u32 bkt;
200 struct hlist_node *tmp;
201 struct gk20a_fecs_trace_hash_ent *ent;
202 struct gk20a_fecs_trace *trace = g->fecs_trace;
203
204 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);
205
206 mutex_lock(&trace->hash_lock);
207 hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
208 hash_del(&ent->node);
209 kfree(ent);
210 }
211 mutex_unlock(&trace->hash_lock);
212
213}
214
215static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
216{
217 struct gk20a_fecs_trace_hash_ent *ent;
218 struct gk20a_fecs_trace *trace = g->fecs_trace;
219 pid_t pid = 0;
220
221 mutex_lock(&trace->hash_lock);
222 hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
223 if (ent->context_ptr == context_ptr) {
224 gk20a_dbg(gpu_dbg_ctxsw,
225 "found context_ptr=%x -> pid=%d",
226 ent->context_ptr, ent->pid);
227 pid = ent->pid;
228 break;
229 }
230 }
231 mutex_unlock(&trace->hash_lock);
232
233 return pid;
234}
235
236/*
237 * Converts HW entry format to userspace-facing format and pushes it to the
238 * queue.
239 */
240static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
241{
242 int i;
243 struct nvgpu_ctxsw_trace_entry entry = { };
244 struct gk20a_fecs_trace *trace = g->fecs_trace;
245 pid_t cur_pid;
246 pid_t new_pid;
247
248 /* for now, only one VM */
249 const int vmid = 0;
250
251 struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(
252 trace, index);
253
254 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
255 "consuming record trace=%p read=%d record=%p", trace, index, r);
256
257 if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
258 gk20a_warn(dev_from_gk20a(g),
259 "trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
260 trace, index, r, r->magic_lo, r->magic_hi);
261 return -EINVAL;
262 }
263
264 cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
265 new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);
266
267 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
268 "context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
269 r->context_ptr, cur_pid, r->new_context_ptr, new_pid);
270
271 entry.context_id = r->context_id;
272 entry.vmid = vmid;
273
274 /* insert SOF event if needed */
275 if (!(trace->sof_mask & BIT(vmid))) {
276 entry.tag = NVGPU_CTXSW_TAG_SOF;
277 entry.timestamp = trace->sof;
278 entry.context_id = 0;
279 entry.pid = 0;
280
281 gk20a_dbg(gpu_dbg_ctxsw, "SOF time=%llx", entry.timestamp);
282 gk20a_ctxsw_trace_write(g, &entry);
283 trace->sof_mask |= BIT(vmid);
284 }
285
286 /* break out FECS record into trace events */
287 for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
288
289 entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
290 entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
291 entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
292
293 gk20a_dbg(gpu_dbg_ctxsw,
294 "tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
295 entry.tag, entry.timestamp, r->context_id,
296 r->new_context_id);
297
298 switch (entry.tag) {
299 case NVGPU_CTXSW_TAG_RESTORE_START:
300 case NVGPU_CTXSW_TAG_CONTEXT_START:
301 entry.context_id = r->new_context_id;
302 entry.pid = new_pid;
303 break;
304
305 case NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
306 case NVGPU_CTXSW_TAG_FE_ACK:
307 case NVGPU_CTXSW_TAG_FE_ACK_WFI:
308 case NVGPU_CTXSW_TAG_FE_ACK_GFXP:
309 case NVGPU_CTXSW_TAG_FE_ACK_CTAP:
310 case NVGPU_CTXSW_TAG_FE_ACK_CILP:
311 case NVGPU_CTXSW_TAG_SAVE_END:
312 entry.context_id = r->context_id;
313 entry.pid = cur_pid;
314 break;
315
316 default:
317 /* tags are not guaranteed to start at the beginning */
318 WARN_ON(entry.tag && (entry.tag != NVGPU_CTXSW_TAG_INVALID_TIMESTAMP));
319 continue;
320 }
321
322 gk20a_dbg(gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
323 entry.tag, entry.context_id, entry.pid);
324
325 if (!entry.context_id)
326 continue;
327
328 gk20a_ctxsw_trace_write(g, &entry);
329 }
330
331 gk20a_ctxsw_trace_wake_up(g, vmid);
332 return 0;
333}
334
335static int gk20a_fecs_trace_poll(struct gk20a *g)
336{
337 struct gk20a_fecs_trace *trace = g->fecs_trace;
338
339 int read = 0;
340 int write = 0;
341 int cnt;
342 int err;
343
344 err = gk20a_busy(g->dev);
345 if (unlikely(err))
346 return err;
347
348 mutex_lock(&trace->poll_lock);
349 write = gk20a_fecs_trace_get_write_index(g);
350 if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
351 gk20a_err(dev_from_gk20a(g),
352 "failed to acquire write index, write=%d", write);
353 err = write;
354 goto done;
355 }
356
357 read = gk20a_fecs_trace_get_read_index(g);
358
359 cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
360 if (!cnt)
361 goto done;
362
363 gk20a_dbg(gpu_dbg_ctxsw,
364 "circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
365 read, gk20a_fecs_trace_get_read_index(g), write, cnt);
366
367 /* we did not send any SOF yet */
368 trace->sof_mask = 0;
369
370 /* consume all records */
371 while (read != write) {
372 gk20a_fecs_trace_ring_read(g, read);
373
374 /* Get to next record. */
375 read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
376 gk20a_fecs_trace_set_read_index(g, read);
377 }
378
379done:
380 /*
381 * OK, we read out all the entries... a new "frame" starts here.
382 * We remember the Start Of Frame time and insert it on the next
383 * iteration.
384 */
385 trace->sof = gk20a_read_ptimer(g);
386
387 mutex_unlock(&trace->poll_lock);
388 gk20a_idle(g->dev);
389 return err;
390}
391
392static int gk20a_fecs_trace_periodic_polling(void *arg)
393{
394 struct gk20a *g = (struct gk20a *)arg;
395 struct timespec ts = ns_to_timespec(GK20A_FECS_TRACE_FRAME_PERIOD_NS);
396
397 pr_info("%s: running\n", __func__);
398
399 while (!kthread_should_stop()) {
400
401 hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
402
403 gk20a_fecs_trace_poll(g);
404 }
405
406 return 0;
407}
408
409static int gk20a_fecs_trace_alloc_ring(struct gk20a *g)
410{
411 struct gk20a_fecs_trace *trace = g->fecs_trace;
412
413 return gk20a_gmmu_alloc(g, GK20A_FECS_TRACE_NUM_RECORDS
414 * ctxsw_prog_record_timestamp_record_size_in_bytes_v(),
415 &trace->trace_buf);
416}
417
418static void gk20a_fecs_trace_free_ring(struct gk20a *g)
419{
420 struct gk20a_fecs_trace *trace = g->fecs_trace;
421
422 gk20a_gmmu_free(g, &trace->trace_buf);
423}
424
425#ifdef CONFIG_DEBUG_FS
426/*
427 * The sequence iterator functions. We simply use the count of the
428 * next line as our internal position.
429 */
430static void *gk20a_fecs_trace_debugfs_ring_seq_start(
431 struct seq_file *s, loff_t *pos)
432{
433 if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
434 return NULL;
435
436 return pos;
437}
438
439static void *gk20a_fecs_trace_debugfs_ring_seq_next(
440 struct seq_file *s, void *v, loff_t *pos)
441{
442 ++(*pos);
443 if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
444 return NULL;
445 return pos;
446}
447
448static void gk20a_fecs_trace_debugfs_ring_seq_stop(
449 struct seq_file *s, void *v)
450{
451}
452
453static int gk20a_fecs_trace_debugfs_ring_seq_show(
454 struct seq_file *s, void *v)
455{
456 loff_t *pos = (loff_t *) v;
457 struct gk20a *g = *(struct gk20a **)s->private;
458 struct gk20a_fecs_trace *trace = g->fecs_trace;
459 struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(trace, *pos);
460 int i;
461 const u32 invalid_tag =
462 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
463 u32 tag;
464 u64 timestamp;
465
466 seq_printf(s, "record #%lld (%p)\n", *pos, r);
467 seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
468 seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
469 if (gk20a_fecs_trace_is_valid_record(r)) {
470 seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
471 seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
472 seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
473 seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
474 for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
475 tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
476 if (tag == invalid_tag)
477 continue;
478 timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
479 timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
480 seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag, timestamp);
481 }
482 }
483 return 0;
484}
485
486/*
487 * Tie them all together into a set of seq_operations.
488 */
489const struct seq_operations gk20a_fecs_trace_debugfs_ring_seq_ops = {
490 .start = gk20a_fecs_trace_debugfs_ring_seq_start,
491 .next = gk20a_fecs_trace_debugfs_ring_seq_next,
492 .stop = gk20a_fecs_trace_debugfs_ring_seq_stop,
493 .show = gk20a_fecs_trace_debugfs_ring_seq_show
494};
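
For readers unfamiliar with seq_file: start() yields the first position, show() formats the record at the current position, next() advances, and stop() ends one pass. Roughly, seq_read() drives the ops like this (a sketch of the contract, not the kernel's actual implementation):

    void *v;
    loff_t pos = 0;

    v = ops->start(s, &pos);
    while (v) {
        ops->show(s, v);
        v = ops->next(s, v, &pos);
    }
    ops->stop(s, v);
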
495
496/*
497 * Time to set up the file operations for our debugfs file. In this case,
498 * all we need is an open function that wires up the sequence ops.
499 */
500
501static int gk20a_ctxsw_debugfs_ring_open(struct inode *inode,
502 struct file *file)
503{
504 struct gk20a **p;
505
506 if (!capable(CAP_SYS_ADMIN))
507 return -EPERM;
508
509 p = __seq_open_private(file, &gk20a_fecs_trace_debugfs_ring_seq_ops,
510 sizeof(struct gk20a *));
511 if (!p)
512 return -ENOMEM;
513
514 *p = (struct gk20a *)inode->i_private;
515 return 0;
516}
517
518/*
519 * The file operations structure contains our open function along with
520 * the set of canned seq_ ops.
521 */
522const struct file_operations gk20a_fecs_trace_debugfs_ring_fops = {
523 .owner = THIS_MODULE,
524 .open = gk20a_ctxsw_debugfs_ring_open,
525 .read = seq_read,
526 .llseek = seq_lseek,
527 .release = seq_release_private
528};
529
530static int gk20a_fecs_trace_debugfs_read(void *arg, u64 *val)
531{
532 *val = gk20a_fecs_trace_get_read_index((struct gk20a *)arg);
533 return 0;
534}
535DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_read_fops,
536 gk20a_fecs_trace_debugfs_read, NULL, "%llu\n");
537
538static int gk20a_fecs_trace_debugfs_write(void *arg, u64 *val)
539{
540 *val = gk20a_fecs_trace_get_write_index((struct gk20a *)arg);
541 return 0;
542}
543DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_write_fops,
544 gk20a_fecs_trace_debugfs_write, NULL, "%llu\n");
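
DEFINE_SIMPLE_ATTRIBUTE() expands a get/set pair into a complete file_operations suitable for debugfs_create_file(). Passing NULL for the setter, as above, makes the attribute effectively read-only. A minimal sketch with a hypothetical attribute:

    static int my_attr_get(void *data, u64 *val)
    {
        *val = *(u64 *)data; /* fetch the value behind the file */
        return 0;
    }
    DEFINE_SIMPLE_ATTRIBUTE(my_attr_fops, my_attr_get, NULL, "%llu\n");
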
545
546static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
547{
548 struct gk20a_platform *plat = platform_get_drvdata(g->dev);
549
550 debugfs_create_file("ctxsw_trace_read", 0600, plat->debugfs, g,
551 &gk20a_fecs_trace_debugfs_read_fops);
552 debugfs_create_file("ctxsw_trace_write", 0600, plat->debugfs, g,
553 &gk20a_fecs_trace_debugfs_write_fops);
554 debugfs_create_file("ctxsw_trace_ring", 0600, plat->debugfs, g,
555 &gk20a_fecs_trace_debugfs_ring_fops);
556}
557
558static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
559{
560 struct gk20a_platform *plat = platform_get_drvdata(g->dev);
561
562 debugfs_remove_recursive(plat->debugfs);
563}
564
565#else
566
567static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
568{
569}
570
571static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
572{
573}
574
575#endif /* CONFIG_DEBUG_FS */
576
577static int gk20a_fecs_trace_init(struct gk20a *g)
578{
579 struct gk20a_fecs_trace *trace;
580 int err;
581
582 trace = kzalloc(sizeof(struct gk20a_fecs_trace), GFP_KERNEL);
583 if (!trace) {
584 gk20a_warn(dev_from_gk20a(g), "failed to allocate fecs_trace");
585 return -ENOMEM;
586 }
587 g->fecs_trace = trace;
588
589 BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
590 err = gk20a_fecs_trace_alloc_ring(g);
591 if (err) {
592 gk20a_warn(dev_from_gk20a(g), "failed to allocate FECS ring");
593 goto clean;
594 }
595
596 mutex_init(&trace->poll_lock);
597 mutex_init(&trace->hash_lock);
598 hash_init(trace->pid_hash_table);
599
600 gk20a_fecs_trace_debugfs_init(g);
601 return 0;
602
603clean:
604 kfree(trace);
605 g->fecs_trace = NULL;
606 return err;
607}
608
609static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
610 struct channel_gk20a *ch)
611{
612 /*
613 * map our circular buffer into the context space and store its
614 * physical address in the context image header.
615 */
616
617 u32 lo;
618 u32 hi;
619 phys_addr_t pa;
620 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
621 struct gk20a_fecs_trace *trace = g->fecs_trace;
622 void *ctx_ptr;
623 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
624
625 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
626 "hw_chid=%d context_ptr=%x inst_block=%llx",
627 ch->hw_chid, context_ptr, gk20a_mem_phys(&ch->inst_block));
628
629 if (!trace)
630 return -ENOMEM;
631
632 pa = gk20a_mem_phys(&trace->trace_buf);
633 if (!pa)
634 return -ENOMEM;
635
636 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
637 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
638 pgprot_writecombine(PAGE_KERNEL));
639 if (!ctx_ptr)
640 return -ENOMEM;
641
642 lo = u64_lo32(pa);
643 hi = u64_hi32(pa);
644
645 gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
646 lo, GK20A_FECS_TRACE_NUM_RECORDS);
647
648 gk20a_mem_wr32(ctx_ptr
649 + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
650 0, lo);
651 gk20a_mem_wr32(ctx_ptr
652 + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
653 0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
654 gk20a_mem_wr32(ctx_ptr
655 + ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
656 0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
657 GK20A_FECS_TRACE_NUM_RECORDS));
658
659 vunmap(ctx_ptr);
660 gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);
661
662 return 0;
663}
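
The buffer's physical address is split into two 32-bit halves before being patched into the context image. A worked example of the split performed above (address value illustrative only):

    phys_addr_t pa = 0x123458000ULL; /* illustrative */
    u32 lo = u64_lo32(pa); /* 0x23458000 -> ..._timestamp_buffer_ptr */
    u32 hi = u64_hi32(pa); /* 0x00000001 -> ..._timestamp_buffer_ptr_hi */
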
664
665static int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
666{
667 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
668
669 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
670 "ch=%p context_ptr=%x", ch, context_ptr);
671
672 if (g->ops.fecs_trace.flush)
673 g->ops.fecs_trace.flush(g);
674 gk20a_fecs_trace_poll(g);
675 gk20a_fecs_trace_hash_del(g, context_ptr);
676 return 0;
677}
678
679static int gk20a_fecs_trace_reset(struct gk20a *g)
680{
681 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
682
683 if (g->ops.fecs_trace.flush)
684 g->ops.fecs_trace.flush(g);
685 gk20a_fecs_trace_poll(g);
686 return gk20a_fecs_trace_set_read_index(g, 0);
687}
688
689static int gk20a_fecs_trace_deinit(struct gk20a *g)
690{
691 struct gk20a_fecs_trace *trace = g->fecs_trace;
692
693 gk20a_fecs_trace_debugfs_cleanup(g);
694 kthread_stop(trace->poll_task);
695 gk20a_fecs_trace_free_ring(g);
696 gk20a_fecs_trace_free_hash_table(g);
697
698 kfree(g->fecs_trace);
699 g->fecs_trace = NULL;
700 return 0;
701}
702
703static int gk20a_gr_max_entries(struct gk20a *g,
704 struct nvgpu_ctxsw_trace_filter *filter)
705{
706 int n;
707 int tag;
708
709 /* Compute number of entries per record, with given filter */
710 for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
711 n += (NVGPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
712
713 /* Return max number of entries generated for the whole ring */
714 return n * GK20A_FECS_TRACE_NUM_RECORDS;
715}
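
For instance, a filter that selects four of the timestamp tags over an illustrative ring of 2048 records bounds one full drain at 4 * 2048 = 8192 entries.
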
716
717static int gk20a_fecs_trace_enable(struct gk20a *g)
718{
719 struct gk20a_fecs_trace *trace = g->fecs_trace;
720 struct task_struct *task;
721
722 if (!trace->poll_task) {
723 task = kthread_run(gk20a_fecs_trace_periodic_polling, g, __func__);
724 if (unlikely(IS_ERR(task))) {
725 gk20a_warn(dev_from_gk20a(g), "failed to create FECS polling task");
726 return PTR_ERR(task);
727 }
728 trace->poll_task = task;
729 }
730
731 return 0;
732}
733
734static int gk20a_fecs_trace_disable(struct gk20a *g)
735{
736 struct gk20a_fecs_trace *trace = g->fecs_trace;
737
738 if (trace->poll_task) {
739 kthread_stop(trace->poll_task);
740 trace->poll_task = NULL;
741 }
742
743 return -EPERM;
744}
745
746void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
747{
748 ops->fecs_trace.init = gk20a_fecs_trace_init;
749 ops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
750 ops->fecs_trace.enable = gk20a_fecs_trace_enable;
751 ops->fecs_trace.disable = gk20a_fecs_trace_disable;
752 ops->fecs_trace.reset = gk20a_fecs_trace_reset;
753 ops->fecs_trace.flush = NULL;
754 ops->fecs_trace.poll = gk20a_fecs_trace_poll;
755 ops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
756 ops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
757 ops->fecs_trace.max_entries = gk20a_gr_max_entries;
758}
759#else
760void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
761{
762}
763#endif /* CONFIG_GK20A_CTXSW_TRACE */
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h
new file mode 100644
index 00000000..4979d6c6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#ifndef __FECS_TRACE_GK20A_H
15#define __FECS_TRACE_GK20A_H
16
17struct gpu_ops;
18void gk20a_init_fecs_trace_ops(struct gpu_ops *ops);
19
20#endif /* __FECS_TRACE_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 769960af..029a713f 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -25,6 +25,7 @@
25 25
26#include "gk20a.h" 26#include "gk20a.h"
27#include "debug_gk20a.h" 27#include "debug_gk20a.h"
28#include "ctxsw_trace_gk20a.h"
28#include "semaphore_gk20a.h" 29#include "semaphore_gk20a.h"
29#include "hw_fifo_gk20a.h" 30#include "hw_fifo_gk20a.h"
30#include "hw_pbdma_gk20a.h" 31#include "hw_pbdma_gk20a.h"
@@ -303,12 +304,6 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
303 if (!runlist->active_tsgs) 304 if (!runlist->active_tsgs)
304 goto clean_up_runlist_info; 305 goto clean_up_runlist_info;
305 306
306 runlist->high_prio_channels =
307 kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
308 GFP_KERNEL);
309 if (!runlist->high_prio_channels)
310 goto clean_up_runlist_info;
311
312 runlist_size = ram_rl_entry_size_v() * f->num_runlist_entries; 307 runlist_size = ram_rl_entry_size_v() * f->num_runlist_entries;
313 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { 308 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
314 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]); 309 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
@@ -337,9 +332,6 @@ clean_up_runlist_info:
337 kfree(runlist->active_tsgs); 332 kfree(runlist->active_tsgs);
338 runlist->active_tsgs = NULL; 333 runlist->active_tsgs = NULL;
339 334
340 kfree(runlist->high_prio_channels);
341 runlist->high_prio_channels = NULL;
342
343 kfree(f->runlist_info); 335 kfree(f->runlist_info);
344 f->runlist_info = NULL; 336 f->runlist_info = NULL;
345 337
@@ -471,8 +463,7 @@ static void gk20a_init_fifo_pbdma_intr_descs(struct fifo_gk20a *f)
471 /* Can be used for sw-methods, or represents 463 /* Can be used for sw-methods, or represents
472 * a recoverable timeout. */ 464 * a recoverable timeout. */
473 f->intr.pbdma.restartable_0 = 465 f->intr.pbdma.restartable_0 =
474 pbdma_intr_0_device_pending_f() | 466 pbdma_intr_0_device_pending_f();
475 pbdma_intr_0_acquire_pending_f();
476} 467}
477 468
478static int gk20a_init_fifo_setup_sw(struct gk20a *g) 469static int gk20a_init_fifo_setup_sw(struct gk20a *g)
@@ -786,13 +777,17 @@ void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
786 if (engine_id == top_device_info_type_enum_graphics_v()) { 777 if (engine_id == top_device_info_type_enum_graphics_v()) {
787 if (support_gk20a_pmu(g->dev) && g->elpg_enabled) 778 if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
788 gk20a_pmu_disable_elpg(g); 779 gk20a_pmu_disable_elpg(g);
789 /*HALT_PIPELINE method, halt GR engine*/ 780 /*HALT_PIPELINE method, halt GR engine*/
790 if (gr_gk20a_halt_pipe(g)) 781 if (gr_gk20a_halt_pipe(g))
791 gk20a_err(dev_from_gk20a(g), 782 gk20a_err(dev_from_gk20a(g), "failed to HALT gr pipe");
792 "failed to HALT gr pipe"); 783 /* resetting engine will alter read/write index.
793 /* resetting engine using mc_enable_r() is not 784 * need to flush circular buffer before re-enabling FECS.
794 enough, we do full init sequence */ 785 */
795 gk20a_gr_reset(g); 786 if (g->ops.fecs_trace.reset)
787 g->ops.fecs_trace.reset(g);
788 /* resetting engine using mc_enable_r() is not
789 enough, we do full init sequence */
790 gk20a_gr_reset(g);
796 if (support_gk20a_pmu(g->dev) && g->elpg_enabled) 791 if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
797 gk20a_pmu_enable_elpg(g); 792 gk20a_pmu_enable_elpg(g);
798 } 793 }
@@ -1662,6 +1657,12 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev,
1662 u32 val = gk20a_readl(g, pbdma_acquire_r(pbdma_id)); 1657 u32 val = gk20a_readl(g, pbdma_acquire_r(pbdma_id));
1663 val &= ~pbdma_acquire_timeout_en_enable_f(); 1658 val &= ~pbdma_acquire_timeout_en_enable_f();
1664 gk20a_writel(g, pbdma_acquire_r(pbdma_id), val); 1659 gk20a_writel(g, pbdma_acquire_r(pbdma_id), val);
1660 if (g->timeouts_enabled) {
1661 reset = true;
1662 gk20a_err(dev_from_gk20a(g),
1663 "semaphore acquire timeout!");
1664 }
1665 handled |= pbdma_intr_0_acquire_pending_f();
1665 } 1666 }
1666 1667
1667 if (pbdma_intr_0 & pbdma_intr_0_pbentry_pending_f()) { 1668 if (pbdma_intr_0 & pbdma_intr_0_pbentry_pending_f()) {
@@ -2162,32 +2163,153 @@ static inline u32 gk20a_get_tsg_runlist_entry_0(struct tsg_gk20a *tsg)
2162 return runlist_entry_0; 2163 return runlist_entry_0;
2163} 2164}
2164 2165
2165/* add all active high priority channels */ 2166/* recursively construct a runlist with interleaved bare channels and TSGs */
2166static inline u32 gk20a_fifo_runlist_add_high_prio_entries( 2167static u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
2167 struct fifo_gk20a *f, 2168 struct fifo_runlist_info_gk20a *runlist,
2168 struct fifo_runlist_info_gk20a *runlist, 2169 u32 cur_level,
2169 u32 *runlist_entry) 2170 u32 *runlist_entry,
2171 bool interleave_enabled,
2172 bool prev_empty,
2173 u32 *entries_left)
2170{ 2174{
2171 struct channel_gk20a *ch = NULL; 2175 bool last_level = cur_level == NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH;
2172 unsigned long high_prio_chid; 2176 struct channel_gk20a *ch;
2173 u32 count = 0; 2177 bool skip_next = false;
2178 u32 chid, tsgid, count = 0;
2179
2180 gk20a_dbg_fn("");
2181
2182 /* for each bare channel, CH, on this level, insert all higher-level
2183 channels and TSGs before inserting CH. */
2184 for_each_set_bit(chid, runlist->active_channels, f->num_channels) {
2185 ch = &f->channel[chid];
2186
2187 if (ch->interleave_level != cur_level)
2188 continue;
2174 2189
2175 for_each_set_bit(high_prio_chid, 2190 if (gk20a_is_channel_marked_as_tsg(ch))
2176 runlist->high_prio_channels, f->num_channels) { 2191 continue;
2177 ch = &f->channel[high_prio_chid]; 2192
2193 if (!last_level && !skip_next) {
2194 runlist_entry = gk20a_runlist_construct_locked(f,
2195 runlist,
2196 cur_level + 1,
2197 runlist_entry,
2198 interleave_enabled,
2199 false,
2200 entries_left);
2201 /* if interleaving is disabled, higher-level channels
2202 and TSGs only need to be inserted once */
2203 if (!interleave_enabled)
2204 skip_next = true;
2205 }
2178 2206
2179 if (!gk20a_is_channel_marked_as_tsg(ch) && 2207 if (!(*entries_left))
2180 test_bit(high_prio_chid, runlist->active_channels) == 1) { 2208 return NULL;
2181 gk20a_dbg_info("add high prio channel %lu to runlist", 2209
2182 high_prio_chid); 2210 gk20a_dbg_info("add channel %d to runlist", chid);
2183 runlist_entry[0] = ram_rl_entry_chid_f(high_prio_chid); 2211 runlist_entry[0] = ram_rl_entry_chid_f(chid);
2212 runlist_entry[1] = 0;
2213 runlist_entry += 2;
2214 count++;
2215 (*entries_left)--;
2216 }
2217
2218 /* for each TSG, T, on this level, insert all higher-level channels
2219 and TSGs before inserting T. */
2220 for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
2221 struct tsg_gk20a *tsg = &f->tsg[tsgid];
2222
2223 if (tsg->interleave_level != cur_level)
2224 continue;
2225
2226 if (!last_level && !skip_next) {
2227 runlist_entry = gk20a_runlist_construct_locked(f,
2228 runlist,
2229 cur_level + 1,
2230 runlist_entry,
2231 interleave_enabled,
2232 false,
2233 entries_left);
2234 if (!interleave_enabled)
2235 skip_next = true;
2236 }
2237
2238 if (!(*entries_left))
2239 return NULL;
2240
2241 /* add TSG entry */
2242 gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
2243 runlist_entry[0] = gk20a_get_tsg_runlist_entry_0(tsg);
2244 runlist_entry[1] = 0;
2245 runlist_entry += 2;
2246 count++;
2247 (*entries_left)--;
2248
2249 mutex_lock(&tsg->ch_list_lock);
2250 /* add runnable channels bound to this TSG */
2251 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
2252 if (!test_bit(ch->hw_chid,
2253 runlist->active_channels))
2254 continue;
2255
2256 if (!(*entries_left)) {
2257 mutex_unlock(&tsg->ch_list_lock);
2258 return NULL;
2259 }
2260
2261 gk20a_dbg_info("add channel %d to runlist",
2262 ch->hw_chid);
2263 runlist_entry[0] = ram_rl_entry_chid_f(ch->hw_chid);
2184 runlist_entry[1] = 0; 2264 runlist_entry[1] = 0;
2185 runlist_entry += 2; 2265 runlist_entry += 2;
2186 count++; 2266 count++;
2267 (*entries_left)--;
2187 } 2268 }
2269 mutex_unlock(&tsg->ch_list_lock);
2188 } 2270 }
2189 2271
2190 return count; 2272 /* append entries from higher level if this level is empty */
2273 if (!count && !last_level)
2274 runlist_entry = gk20a_runlist_construct_locked(f,
2275 runlist,
2276 cur_level + 1,
2277 runlist_entry,
2278 interleave_enabled,
2279 true,
2280 entries_left);
2281
2282 /*
2283 * if previous and this level have entries, append
2284 * entries from higher level.
2285 *
2286 * ex. dropping from MEDIUM to LOW, need to insert HIGH
2287 */
2288 if (interleave_enabled && count && !prev_empty && !last_level)
2289 runlist_entry = gk20a_runlist_construct_locked(f,
2290 runlist,
2291 cur_level + 1,
2292 runlist_entry,
2293 interleave_enabled,
2294 false,
2295 entries_left);
2296 return runlist_entry;
2297}
2298
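
A worked example of the recursion: with interleaving enabled, one HIGH channel H, one MEDIUM channel M, and two LOW channels L1 and L2, the construction emits

    H M H L1 H M H L2

since every lower-level insertion is preceded by a full pass over the higher levels, and each non-empty level is closed with one more such pass. With interleaving disabled, skip_next limits the higher levels to a single pass, giving the plain priority order H M L1 L2. The interleaved form bounds a runnable high-priority channel's wait to one lower-level timeslice plus context-switch overhead.
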
2299int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
2300 u32 id,
2301 bool is_tsg,
2302 u32 runlist_id,
2303 u32 new_level)
2304{
2305 gk20a_dbg_fn("");
2306
2307 if (is_tsg)
2308 g->fifo.tsg[id].interleave_level = new_level;
2309 else
2310 g->fifo.channel[id].interleave_level = new_level;
2311
2312 return 0;
2191} 2313}
2192 2314
2193static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, 2315static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
@@ -2198,14 +2320,11 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2198 struct fifo_gk20a *f = &g->fifo; 2320 struct fifo_gk20a *f = &g->fifo;
2199 struct fifo_runlist_info_gk20a *runlist = NULL; 2321 struct fifo_runlist_info_gk20a *runlist = NULL;
2200 u32 *runlist_entry_base = NULL; 2322 u32 *runlist_entry_base = NULL;
2201 u32 *runlist_entry = NULL;
2202 u64 runlist_iova; 2323 u64 runlist_iova;
2203 u32 old_buf, new_buf; 2324 u32 old_buf, new_buf;
2204 u32 chid, tsgid;
2205 struct channel_gk20a *ch = NULL; 2325 struct channel_gk20a *ch = NULL;
2206 struct tsg_gk20a *tsg = NULL; 2326 struct tsg_gk20a *tsg = NULL;
2207 u32 count = 0; 2327 u32 count = 0;
2208 u32 count_channels_in_tsg;
2209 runlist = &f->runlist_info[runlist_id]; 2328 runlist = &f->runlist_info[runlist_id];
2210 2329
2211 /* valid channel, add/remove it from active list. 2330 /* valid channel, add/remove it from active list.
@@ -2254,91 +2373,23 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2254 2373
2255 if (hw_chid != ~0 || /* add/remove a valid channel */ 2374 if (hw_chid != ~0 || /* add/remove a valid channel */
2256 add /* resume to add all channels back */) { 2375 add /* resume to add all channels back */) {
2257 runlist_entry = runlist_entry_base; 2376 u32 max_entries = f->num_runlist_entries;
2258 2377 u32 *runlist_end;
2259 /* Runlist manipulation:
2260 Insert an entry of all high priority channels inbetween
2261 all lower priority channels. This ensure that the maximum
2262 delay a runnable high priority channel has to wait is one
2263 medium timeslice + any context switching overhead +
2264 wait on other high priority channels.
2265 add non-TSG channels first */
2266 for_each_set_bit(chid,
2267 runlist->active_channels, f->num_channels) {
2268 ch = &f->channel[chid];
2269
2270 if (!gk20a_is_channel_marked_as_tsg(ch) &&
2271 !ch->interleave) {
2272 u32 added;
2273
2274 gk20a_dbg_info("add normal prio channel %d to runlist",
2275 chid);
2276 runlist_entry[0] = ram_rl_entry_chid_f(chid);
2277 runlist_entry[1] = 0;
2278 runlist_entry += 2;
2279 count++;
2280
2281 added = gk20a_fifo_runlist_add_high_prio_entries(
2282 f,
2283 runlist,
2284 runlist_entry);
2285 count += added;
2286 runlist_entry += 2 * added;
2287 }
2288 }
2289 2378
2290 /* if there were no lower priority channels, then just 2379 runlist_end = gk20a_runlist_construct_locked(f,
2291 * add the high priority channels once. */ 2380 runlist,
2292 if (count == 0) { 2381 0,
2293 count = gk20a_fifo_runlist_add_high_prio_entries( 2382 runlist_entry_base,
2294 f, 2383 g->runlist_interleave,
2295 runlist, 2384 true,
2296 runlist_entry); 2385 &max_entries);
2297 runlist_entry += 2 * count; 2386 if (!runlist_end) {
2387 ret = -E2BIG;
2388 goto clean_up;
2298 } 2389 }
2299 2390
2300 /* now add TSG entries and channels bound to TSG */ 2391 count = (runlist_end - runlist_entry_base) / 2;
2301 mutex_lock(&f->tsg_inuse_mutex); 2392 WARN_ON(count > f->num_runlist_entries);
2302 for_each_set_bit(tsgid,
2303 runlist->active_tsgs, f->num_channels) {
2304 u32 added;
2305 tsg = &f->tsg[tsgid];
2306 /* add TSG entry */
2307 gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
2308 runlist_entry[0] = gk20a_get_tsg_runlist_entry_0(tsg);
2309 runlist_entry[1] = 0;
2310 runlist_entry += 2;
2311 count++;
2312
2313 /* add runnable channels bound to this TSG */
2314 count_channels_in_tsg = 0;
2315 mutex_lock(&tsg->ch_list_lock);
2316 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
2317 if (!test_bit(ch->hw_chid,
2318 runlist->active_channels))
2319 continue;
2320 gk20a_dbg_info("add channel %d to runlist",
2321 ch->hw_chid);
2322 runlist_entry[0] =
2323 ram_rl_entry_chid_f(ch->hw_chid);
2324 runlist_entry[1] = 0;
2325 runlist_entry += 2;
2326 count++;
2327 count_channels_in_tsg++;
2328 }
2329 mutex_unlock(&tsg->ch_list_lock);
2330
2331 WARN_ON(tsg->num_active_channels !=
2332 count_channels_in_tsg);
2333
2334 added = gk20a_fifo_runlist_add_high_prio_entries(
2335 f,
2336 runlist,
2337 runlist_entry);
2338 count += added;
2339 runlist_entry += 2 * added;
2340 }
2341 mutex_unlock(&f->tsg_inuse_mutex);
2342 } else /* suspend to remove all channels */ 2393 } else /* suspend to remove all channels */
2343 count = 0; 2394 count = 0;
2344 2395
@@ -2493,42 +2544,6 @@ u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g)
2493 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f(); 2544 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f();
2494} 2545}
2495 2546
2496int gk20a_fifo_set_channel_priority(
2497 struct gk20a *g,
2498 u32 runlist_id,
2499 u32 hw_chid,
2500 bool interleave)
2501{
2502 struct fifo_runlist_info_gk20a *runlist = NULL;
2503 struct fifo_gk20a *f = &g->fifo;
2504 struct channel_gk20a *ch = NULL;
2505
2506 if (hw_chid >= f->num_channels)
2507 return -EINVAL;
2508
2509 if (runlist_id >= f->max_runlists)
2510 return -EINVAL;
2511
2512 ch = &f->channel[hw_chid];
2513
2514 gk20a_dbg_fn("");
2515
2516 runlist = &f->runlist_info[runlist_id];
2517
2518 mutex_lock(&runlist->mutex);
2519
2520 if (ch->interleave)
2521 set_bit(hw_chid, runlist->high_prio_channels);
2522 else
2523 clear_bit(hw_chid, runlist->high_prio_channels);
2524
2525 gk20a_dbg_fn("done");
2526
2527 mutex_unlock(&runlist->mutex);
2528
2529 return 0;
2530}
2531
2532struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g, 2547struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
2533 u32 hw_chid) 2548 u32 hw_chid)
2534{ 2549{
@@ -2545,4 +2560,5 @@ void gk20a_init_fifo(struct gpu_ops *gops)
2545 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle; 2560 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle;
2546 gops->fifo.get_num_fifos = gk20a_fifo_get_num_fifos; 2561 gops->fifo.get_num_fifos = gk20a_fifo_get_num_fifos;
2547 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature; 2562 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature;
2563 gops->fifo.set_runlist_interleave = gk20a_fifo_set_runlist_interleave;
2548} 2564}
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index ee4e7328..0979bf2b 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -31,7 +31,6 @@
31struct fifo_runlist_info_gk20a { 31struct fifo_runlist_info_gk20a {
32 unsigned long *active_channels; 32 unsigned long *active_channels;
33 unsigned long *active_tsgs; 33 unsigned long *active_tsgs;
34 unsigned long *high_prio_channels;
35 /* Each engine has its own SW and HW runlist buffer.*/ 34 /* Each engine has its own SW and HW runlist buffer.*/
36 struct mem_desc mem[MAX_RUNLIST_BUFFERS]; 35 struct mem_desc mem[MAX_RUNLIST_BUFFERS];
37 u32 cur_buffer; 36 u32 cur_buffer;
@@ -184,8 +183,6 @@ void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
184int gk20a_fifo_wait_engine_idle(struct gk20a *g); 183int gk20a_fifo_wait_engine_idle(struct gk20a *g);
185u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g); 184u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
186u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g); 185u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
187int gk20a_fifo_set_channel_priority(struct gk20a *g, u32 runlist_id,
188 u32 hw_chid, bool interleave);
189u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, 186u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
190 int *__id, bool *__is_tsg); 187 int *__id, bool *__is_tsg);
191bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, 188bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
@@ -198,4 +195,9 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
198 u32 hw_chid); 195 u32 hw_chid);
199 196
200void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg); 197void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg);
198int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
199 u32 id,
200 bool is_tsg,
201 u32 runlist_id,
202 u32 new_level);
201#endif /*__GR_GK20A_H__*/ 203#endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 6a5986a7..b8753a21 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -60,6 +60,7 @@
60#include "hw_gr_gk20a.h" 60#include "hw_gr_gk20a.h"
61#include "hw_fb_gk20a.h" 61#include "hw_fb_gk20a.h"
62#include "gk20a_scale.h" 62#include "gk20a_scale.h"
63#include "ctxsw_trace_gk20a.h"
63#include "dbg_gpu_gk20a.h" 64#include "dbg_gpu_gk20a.h"
64#include "gk20a_allocator.h" 65#include "gk20a_allocator.h"
65#include "hal.h" 66#include "hal.h"
@@ -80,7 +81,7 @@
80/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */ 81/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */
81#define INTERFACE_NAME "nvhost%s-gpu" 82#define INTERFACE_NAME "nvhost%s-gpu"
82 83
83#define GK20A_NUM_CDEVS 6 84#define GK20A_NUM_CDEVS 7
84 85
85#define EMC3D_DEFAULT_RATIO 750 86#define EMC3D_DEFAULT_RATIO 750
86 87
@@ -169,6 +170,19 @@ static const struct file_operations gk20a_tsg_ops = {
169 .unlocked_ioctl = gk20a_tsg_dev_ioctl, 170 .unlocked_ioctl = gk20a_tsg_dev_ioctl,
170}; 171};
171 172
173static const struct file_operations gk20a_ctxsw_ops = {
174 .owner = THIS_MODULE,
175 .release = gk20a_ctxsw_dev_release,
176 .open = gk20a_ctxsw_dev_open,
177#ifdef CONFIG_COMPAT
178 .compat_ioctl = gk20a_ctxsw_dev_ioctl,
179#endif
180 .unlocked_ioctl = gk20a_ctxsw_dev_ioctl,
181 .poll = gk20a_ctxsw_dev_poll,
182 .read = gk20a_ctxsw_dev_read,
183 .mmap = gk20a_ctxsw_dev_mmap,
184};
185
172static inline void sim_writel(struct gk20a *g, u32 r, u32 v) 186static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
173{ 187{
174 writel(v, g->sim.regs+r); 188 writel(v, g->sim.regs+r);
@@ -672,9 +686,6 @@ static int gk20a_init_support(struct platform_device *dev)
672 mutex_init(&g->ch_wdt_lock); 686 mutex_init(&g->ch_wdt_lock);
673 mutex_init(&g->poweroff_lock); 687 mutex_init(&g->poweroff_lock);
674 688
675 mutex_init(&g->interleave_lock);
676 g->num_interleaved_channels = 0;
677
678 g->remove_support = gk20a_remove_support; 689 g->remove_support = gk20a_remove_support;
679 return 0; 690 return 0;
680 691
@@ -884,6 +895,10 @@ static int gk20a_pm_finalize_poweron(struct device *dev)
884 goto done; 895 goto done;
885 } 896 }
886 897
898 err = gk20a_ctxsw_trace_init(g);
899 if (err)
900 gk20a_warn(dev, "could not initialize ctxsw tracing");
901
887 /* Restore the debug setting */ 902 /* Restore the debug setting */
888 g->ops.mm.set_debug_mode(g, g->mmu_debug_ctrl); 903 g->ops.mm.set_debug_mode(g, g->mmu_debug_ctrl);
889 904
@@ -1012,6 +1027,11 @@ void gk20a_user_deinit(struct platform_device *dev)
1012 cdev_del(&g->tsg.cdev); 1027 cdev_del(&g->tsg.cdev);
1013 } 1028 }
1014 1029
1030 if (g->ctxsw.node) {
1031 device_destroy(g->class, g->ctxsw.cdev.dev);
1032 cdev_del(&g->ctxsw.cdev);
1033 }
1034
1015 if (g->cdev_region) 1035 if (g->cdev_region)
1016 unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS); 1036 unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS);
1017 1037
@@ -1077,6 +1097,15 @@ int gk20a_user_init(struct platform_device *dev)
1077 if (err) 1097 if (err)
1078 goto fail; 1098 goto fail;
1079 1099
1100#ifdef CONFIG_GK20A_CTXSW_TRACE
1101 err = gk20a_create_device(dev, devno++, "-ctxsw",
1102 &g->ctxsw.cdev, &g->ctxsw.node,
1103 &gk20a_ctxsw_ops);
1104 if (err)
1105 goto fail;
1106#endif
1107
1108
1080 return 0; 1109 return 0;
1081fail: 1110fail:
1082 gk20a_user_deinit(dev); 1111 gk20a_user_deinit(dev);
@@ -1400,9 +1429,11 @@ static int gk20a_probe(struct platform_device *dev)
1400 1429
1401 spin_lock_init(&gk20a->mc_enable_lock); 1430 spin_lock_init(&gk20a->mc_enable_lock);
1402 1431
1432#ifdef CONFIG_RESET_CONTROLLER
1403 platform->reset_control = devm_reset_control_get(&dev->dev, NULL); 1433 platform->reset_control = devm_reset_control_get(&dev->dev, NULL);
1404 if (IS_ERR(platform->reset_control)) 1434 if (IS_ERR(platform->reset_control))
1405 platform->reset_control = NULL; 1435 platform->reset_control = NULL;
1436#endif
1406 1437
1407 gk20a_debug_init(dev); 1438 gk20a_debug_init(dev);
1408 1439
@@ -1439,14 +1470,11 @@ static int gk20a_probe(struct platform_device *dev)
1439 if (tegra_platform_is_silicon()) 1470 if (tegra_platform_is_silicon())
1440 gk20a->timeouts_enabled = true; 1471 gk20a->timeouts_enabled = true;
1441 1472
1442 gk20a->interleave_high_priority = true; 1473 gk20a->runlist_interleave = true;
1443 1474
1444 gk20a->timeslice_low_priority_us = 1300; 1475 gk20a->timeslice_low_priority_us = 1300;
1445 gk20a->timeslice_medium_priority_us = 2600; 1476 gk20a->timeslice_medium_priority_us = 2600;
1446 if (gk20a->interleave_high_priority) 1477 gk20a->timeslice_high_priority_us = 5200;
1447 gk20a->timeslice_high_priority_us = 3000;
1448 else
1449 gk20a->timeslice_high_priority_us = 5200;
1450 1478
1451 /* Set up initial power settings. For non-silicon platforms, disable * 1479 /* Set up initial power settings. For non-silicon platforms, disable *
1452 * power features and for silicon platforms, read from platform data */ 1480 * power features and for silicon platforms, read from platform data */
@@ -1527,16 +1555,17 @@ static int gk20a_probe(struct platform_device *dev)
1527 platform->debugfs, 1555 platform->debugfs,
1528 &gk20a->timeslice_high_priority_us); 1556 &gk20a->timeslice_high_priority_us);
1529 1557
1530 gk20a->debugfs_interleave_high_priority = 1558 gk20a->debugfs_runlist_interleave =
1531 debugfs_create_bool("interleave_high_priority", 1559 debugfs_create_bool("runlist_interleave",
1532 S_IRUGO|S_IWUSR, 1560 S_IRUGO|S_IWUSR,
1533 platform->debugfs, 1561 platform->debugfs,
1534 &gk20a->interleave_high_priority); 1562 &gk20a->runlist_interleave);
1535 1563
1536 gr_gk20a_debugfs_init(gk20a); 1564 gr_gk20a_debugfs_init(gk20a);
1537 gk20a_pmu_debugfs_init(dev); 1565 gk20a_pmu_debugfs_init(dev);
1538 gk20a_cde_debugfs_init(dev); 1566 gk20a_cde_debugfs_init(dev);
1539 gk20a_alloc_debugfs_init(dev); 1567 gk20a_alloc_debugfs_init(dev);
1568 gk20a_mm_debugfs_init(dev);
1540#endif 1569#endif
1541 1570
1542 gk20a_init_gr(gk20a); 1571 gk20a_init_gr(gk20a);
@@ -1558,6 +1587,8 @@ static int __exit gk20a_remove(struct platform_device *dev)
1558 if (platform->has_cde) 1587 if (platform->has_cde)
1559 gk20a_cde_destroy(g); 1588 gk20a_cde_destroy(g);
1560 1589
1590 gk20a_ctxsw_trace_cleanup(g);
1591
1561 if (IS_ENABLED(CONFIG_GK20A_DEVFREQ)) 1592 if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
1562 gk20a_scale_exit(dev); 1593 gk20a_scale_exit(dev);
1563 1594
@@ -1774,7 +1805,10 @@ void gk20a_enable(struct gk20a *g, u32 units)
1774void gk20a_reset(struct gk20a *g, u32 units) 1805void gk20a_reset(struct gk20a *g, u32 units)
1775{ 1806{
1776 gk20a_disable(g, units); 1807 gk20a_disable(g, units);
1777 udelay(20); 1808 if (units & mc_enable_ce2_enabled_f())
1809 udelay(500);
1810 else
1811 udelay(20);
1778 gk20a_enable(g, units); 1812 gk20a_enable(g, units);
1779} 1813}
1780 1814
@@ -2095,6 +2129,19 @@ gk20a_request_firmware(struct gk20a *g, const char *fw_name)
2095 return fw; 2129 return fw;
2096} 2130}
2097 2131
2132
2133u64 gk20a_read_ptimer(struct gk20a *g)
2134{
2135 u32 time_hi0 = gk20a_readl(g, timer_time_1_r());
2136 u32 time_lo = gk20a_readl(g, timer_time_0_r());
2137 u32 time_hi1 = gk20a_readl(g, timer_time_1_r());
2138 u32 time_hi = (time_lo & (1L << 31)) ? time_hi0 : time_hi1;
2139 u64 time = ((u64)time_hi << 32) | time_lo;
2140
2141 return time;
2142}
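
The helper above samples the high word before and after the low word and uses lo's MSB to pick the consistent sample: if the MSB is set, lo was read in the upper half of its range, so it cannot have wrapped since the first hi read; if it is clear, lo may have just wrapped, so the second hi read matches it. A more common retry-loop variant of the same lock-free idea, with assumed MMIO accessors:

    static u64 read_counter64(void)
    {
        u32 hi, lo;

        do {
            hi = read_hi(); /* read_hi()/read_lo(): hypothetical accessors */
            lo = read_lo();
        } while (hi != read_hi()); /* retry until hi is stable across lo */

        return ((u64)hi << 32) | lo;
    }
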
2143
2144
2098MODULE_LICENSE("GPL v2"); 2145MODULE_LICENSE("GPL v2");
2099module_init(gk20a_init); 2146module_init(gk20a_init);
2100module_exit(gk20a_exit); 2147module_exit(gk20a_exit);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 340f358a..8a1f82bc 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -25,6 +25,8 @@ struct channel_gk20a;
25struct gr_gk20a; 25struct gr_gk20a;
26struct sim_gk20a; 26struct sim_gk20a;
27struct gk20a_ctxsw_ucode_segments; 27struct gk20a_ctxsw_ucode_segments;
28struct gk20a_fecs_trace;
29struct gk20a_ctxsw_trace;
28struct acr_gm20b; 30struct acr_gm20b;
29 31
30#include <linux/sched.h> 32#include <linux/sched.h>
@@ -54,8 +56,6 @@ struct acr_gm20b;
54 32 ns is the resolution of ptimer. */ 56 32 ns is the resolution of ptimer. */
55#define PTIMER_REF_FREQ_HZ 31250000 57#define PTIMER_REF_FREQ_HZ 31250000
56 58
57#define MAX_INTERLEAVED_CHANNELS 32
58
59struct cooling_device_gk20a { 59struct cooling_device_gk20a {
60 struct thermal_cooling_device *gk20a_cooling_dev; 60 struct thermal_cooling_device *gk20a_cooling_dev;
61 unsigned int gk20a_freq_state; 61 unsigned int gk20a_freq_state;
@@ -236,6 +236,7 @@ struct gpu_ops {
236 void (*slcg_therm_load_gating_prod)(struct gk20a *g, bool prod); 236 void (*slcg_therm_load_gating_prod)(struct gk20a *g, bool prod);
237 void (*slcg_xbar_load_gating_prod)(struct gk20a *g, bool prod); 237 void (*slcg_xbar_load_gating_prod)(struct gk20a *g, bool prod);
238 void (*blcg_bus_load_gating_prod)(struct gk20a *g, bool prod); 238 void (*blcg_bus_load_gating_prod)(struct gk20a *g, bool prod);
239 void (*blcg_ce_load_gating_prod)(struct gk20a *g, bool prod);
239 void (*blcg_ctxsw_firmware_load_gating_prod)(struct gk20a *g, bool prod); 240 void (*blcg_ctxsw_firmware_load_gating_prod)(struct gk20a *g, bool prod);
240 void (*blcg_fb_load_gating_prod)(struct gk20a *g, bool prod); 241 void (*blcg_fb_load_gating_prod)(struct gk20a *g, bool prod);
241 void (*blcg_fifo_load_gating_prod)(struct gk20a *g, bool prod); 242 void (*blcg_fifo_load_gating_prod)(struct gk20a *g, bool prod);
@@ -267,6 +268,11 @@ struct gpu_ops {
267 u32 (*get_num_fifos)(struct gk20a *g); 268 u32 (*get_num_fifos)(struct gk20a *g);
268 u32 (*get_pbdma_signature)(struct gk20a *g); 269 u32 (*get_pbdma_signature)(struct gk20a *g);
269 int (*channel_set_priority)(struct channel_gk20a *ch, u32 priority); 270 int (*channel_set_priority)(struct channel_gk20a *ch, u32 priority);
271 int (*set_runlist_interleave)(struct gk20a *g, u32 id,
272 bool is_tsg, u32 runlist_id,
273 u32 new_level);
274 int (*channel_set_timeslice)(struct channel_gk20a *ch,
275 u32 timeslice);
270 } fifo; 276 } fifo;
271 struct pmu_v { 277 struct pmu_v {
272 /*used for change of enum zbc update cmd id from ver 0 to ver1*/ 278 /*used for change of enum zbc update cmd id from ver 0 to ver1*/
@@ -369,6 +375,19 @@ struct gpu_ops {
369 bool use_dma_for_fw_bootstrap; 375 bool use_dma_for_fw_bootstrap;
370 } gr_ctx; 376 } gr_ctx;
371 struct { 377 struct {
378 int (*init)(struct gk20a *g);
379 int (*max_entries)(struct gk20a *,
380 struct nvgpu_ctxsw_trace_filter *);
381 int (*flush)(struct gk20a *g);
382 int (*poll)(struct gk20a *g);
383 int (*enable)(struct gk20a *g);
384 int (*disable)(struct gk20a *g);
385 int (*reset)(struct gk20a *g);
386 int (*bind_channel)(struct gk20a *, struct channel_gk20a *);
387 int (*unbind_channel)(struct gk20a *, struct channel_gk20a *);
388 int (*deinit)(struct gk20a *g);
389 } fecs_trace;
390 struct {
372 bool (*support_sparse)(struct gk20a *g); 391 bool (*support_sparse)(struct gk20a *g);
373 bool (*is_debug_mode_enabled)(struct gk20a *g); 392 bool (*is_debug_mode_enabled)(struct gk20a *g);
374 void (*set_debug_mode)(struct gk20a *g, bool enable); 393 void (*set_debug_mode)(struct gk20a *g, bool enable);
@@ -535,10 +554,7 @@ struct gk20a {
535 u32 timeslice_low_priority_us; 554 u32 timeslice_low_priority_us;
536 u32 timeslice_medium_priority_us; 555 u32 timeslice_medium_priority_us;
537 u32 timeslice_high_priority_us; 556 u32 timeslice_high_priority_us;
538 u32 interleave_high_priority; 557 u32 runlist_interleave;
539
540 struct mutex interleave_lock;
541 u32 num_interleaved_channels;
542 558
543 bool slcg_enabled; 559 bool slcg_enabled;
544 bool blcg_enabled; 560 bool blcg_enabled;
@@ -563,7 +579,7 @@ struct gk20a {
563 struct dentry *debugfs_timeslice_low_priority_us; 579 struct dentry *debugfs_timeslice_low_priority_us;
564 struct dentry *debugfs_timeslice_medium_priority_us; 580 struct dentry *debugfs_timeslice_medium_priority_us;
565 struct dentry *debugfs_timeslice_high_priority_us; 581 struct dentry *debugfs_timeslice_high_priority_us;
566 struct dentry *debugfs_interleave_high_priority; 582 struct dentry *debugfs_runlist_interleave;
567 583
568#endif 584#endif
569 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; 585 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
@@ -575,6 +591,14 @@ struct gk20a {
575 int dbg_powergating_disabled_refcount; /*refcount for pg disable */ 591 int dbg_powergating_disabled_refcount; /*refcount for pg disable */
576 int dbg_timeout_disabled_refcount; /*refcount for timeout disable */ 592 int dbg_timeout_disabled_refcount; /*refcount for timeout disable */
577 593
594 /*
595 * When set subsequent VMAs will separate fixed and non-fixed
596 * allocations. This avoids conflicts with fixed and non-fixed allocs
597 * for some tests. The value in separate_fixed_allocs is used to
598 * determine the split boundary.
599 */
600 u64 separate_fixed_allocs;
601
578 void (*remove_support)(struct platform_device *); 602 void (*remove_support)(struct platform_device *);
579 603
580 u64 pg_ingating_time_us; 604 u64 pg_ingating_time_us;
@@ -612,6 +636,11 @@ struct gk20a {
612 struct device *node; 636 struct device *node;
613 } tsg; 637 } tsg;
614 638
639 struct {
640 struct cdev cdev;
641 struct device *node;
642 } ctxsw;
643
615 struct mutex client_lock; 644 struct mutex client_lock;
616 int client_refcount; /* open channels and ctrl nodes */ 645 int client_refcount; /* open channels and ctrl nodes */
617 646
@@ -638,6 +667,9 @@ struct gk20a {
638 667
639 struct gk20a_scale_profile *scale_profile; 668 struct gk20a_scale_profile *scale_profile;
640 669
670 struct gk20a_ctxsw_trace *ctxsw_trace;
671 struct gk20a_fecs_trace *fecs_trace;
672
641 struct device_dma_parameters dma_parms; 673 struct device_dma_parameters dma_parms;
642 674
643 struct gk20a_cde_app cde_app; 675 struct gk20a_cde_app cde_app;
@@ -715,6 +747,7 @@ enum gk20a_dbg_categories {
715 gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */ 747 gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
716 gpu_dbg_cde = BIT(10), /* cde info messages */ 748 gpu_dbg_cde = BIT(10), /* cde info messages */
717 gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */ 749 gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
750 gpu_dbg_ctxsw = BIT(12), /* ctxsw tracing */
718 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ 751 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
719}; 752};
720 753
@@ -961,4 +994,6 @@ static inline u32 scale_ptimer(u32 timeout , u32 scale10x)
961 else 994 else
962 return (timeout * 10) / scale10x; 995 return (timeout * 10) / scale10x;
963} 996}
997
998u64 gk20a_read_ptimer(struct gk20a *g);
964#endif /* GK20A_H */ 999#endif /* GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
index 0e6b576b..d433c9bb 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A Graphics 4 * GK20A Graphics
5 * 5 *
6 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -30,7 +30,6 @@
30#include "fifo_gk20a.h" 30#include "fifo_gk20a.h"
31#include "pmu_gk20a.h" 31#include "pmu_gk20a.h"
32 32
33
34#define PTIMER_FP_FACTOR 1000000 33#define PTIMER_FP_FACTOR 1000000
35 34
36#define ROOTRW (S_IRWXU|S_IRGRP|S_IROTH) 35#define ROOTRW (S_IRWXU|S_IRGRP|S_IROTH)
@@ -100,6 +99,9 @@ static ssize_t blcg_enable_store(struct device *device,
100 99
101 if (g->ops.clock_gating.blcg_bus_load_gating_prod) 100 if (g->ops.clock_gating.blcg_bus_load_gating_prod)
102 g->ops.clock_gating.blcg_bus_load_gating_prod(g, g->blcg_enabled); 101 g->ops.clock_gating.blcg_bus_load_gating_prod(g, g->blcg_enabled);
102 if (g->ops.clock_gating.blcg_ce_load_gating_prod)
103 g->ops.clock_gating.blcg_ce_load_gating_prod(g,
104 g->blcg_enabled);
103 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod) 105 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod)
104 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, g->blcg_enabled); 106 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, g->blcg_enabled);
105 if (g->ops.clock_gating.blcg_fb_load_gating_prod) 107 if (g->ops.clock_gating.blcg_fb_load_gating_prod)
@@ -784,8 +786,15 @@ void gk20a_remove_sysfs(struct device *dev)
784 device_remove_file(dev, &dev_attr_allow_all); 786 device_remove_file(dev, &dev_attr_allow_all);
785 device_remove_file(dev, &dev_attr_tpc_fs_mask); 787 device_remove_file(dev, &dev_attr_tpc_fs_mask);
786 788
787 if (g->host1x_dev && (dev->parent != &g->host1x_dev->dev)) 789 if (g->host1x_dev && (dev->parent != &g->host1x_dev->dev)) {
788 sysfs_remove_link(&g->host1x_dev->dev.kobj, dev_name(dev)); 790 sysfs_remove_link(&g->host1x_dev->dev.kobj, dev_name(dev));
791 if (strcmp(dev_name(dev), "gpu.0")) {
792 struct kobject *kobj = &dev->kobj;
793 struct device *parent = container_of((kobj->parent),
794 struct device, kobj);
795 sysfs_remove_link(&parent->kobj, "gpu.0");
796 }
797 }
789} 798}
790 799
791void gk20a_create_sysfs(struct platform_device *dev) 800void gk20a_create_sysfs(struct platform_device *dev)
@@ -817,10 +826,19 @@ void gk20a_create_sysfs(struct platform_device *dev)
817 error |= device_create_file(&dev->dev, &dev_attr_allow_all); 826 error |= device_create_file(&dev->dev, &dev_attr_allow_all);
818 error |= device_create_file(&dev->dev, &dev_attr_tpc_fs_mask); 827 error |= device_create_file(&dev->dev, &dev_attr_tpc_fs_mask);
819 828
820 if (g->host1x_dev && (dev->dev.parent != &g->host1x_dev->dev)) 829 if (g->host1x_dev && (dev->dev.parent != &g->host1x_dev->dev)) {
821 error |= sysfs_create_link(&g->host1x_dev->dev.kobj, 830 error |= sysfs_create_link(&g->host1x_dev->dev.kobj,
822 &dev->dev.kobj, 831 &dev->dev.kobj,
823 dev_name(&dev->dev)); 832 dev_name(&dev->dev));
833 if (strcmp(dev_name(&dev->dev), "gpu.0")) {
834 struct kobject *kobj = &dev->dev.kobj;
835 struct device *parent = container_of((kobj->parent),
836 struct device, kobj);
837 error |= sysfs_create_link(&parent->kobj,
838 &dev->dev.kobj, "gpu.0");
839 }
840
841 }
824 842
825 if (error) 843 if (error)
826 dev_err(&dev->dev, "Failed to create sysfs attributes!\n"); 844 dev_err(&dev->dev, "Failed to create sysfs attributes!\n");
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 7e37a965..a10650be 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -56,6 +56,7 @@
56#include "debug_gk20a.h" 56#include "debug_gk20a.h"
57#include "semaphore_gk20a.h" 57#include "semaphore_gk20a.h"
58#include "platform_gk20a.h" 58#include "platform_gk20a.h"
59#include "ctxsw_trace_gk20a.h"
59 60
60#define BLK_SIZE (256) 61#define BLK_SIZE (256)
61 62
@@ -2855,6 +2856,13 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
2855 "fail to load golden ctx image"); 2856 "fail to load golden ctx image");
2856 goto out; 2857 goto out;
2857 } 2858 }
2859 if (g->ops.fecs_trace.bind_channel) {
2860 err = g->ops.fecs_trace.bind_channel(g, c);
2861 if (err) {
2862 gk20a_warn(dev_from_gk20a(g),
2863 "fail to bind channel for ctxsw trace");
2864 }
2865 }
2858 c->first_init = true; 2866 c->first_init = true;
2859 } 2867 }
2860 2868
@@ -4217,7 +4225,15 @@ out:
4217static void gr_gk20a_load_gating_prod(struct gk20a *g) 4225static void gr_gk20a_load_gating_prod(struct gk20a *g)
4218{ 4226{
4219 /* slcg prod values */ 4227 /* slcg prod values */
4220 g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled); 4228 if (g->ops.clock_gating.slcg_bus_load_gating_prod)
4229 g->ops.clock_gating.slcg_bus_load_gating_prod(g,
4230 g->slcg_enabled);
4231 if (g->ops.clock_gating.slcg_chiplet_load_gating_prod)
4232 g->ops.clock_gating.slcg_chiplet_load_gating_prod(g,
4233 g->slcg_enabled);
4234 if (g->ops.clock_gating.slcg_gr_load_gating_prod)
4235 g->ops.clock_gating.slcg_gr_load_gating_prod(g,
4236 g->slcg_enabled);
4221 if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod) 4237 if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod)
4222 g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g, 4238 g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g,
4223 g->slcg_enabled); 4239 g->slcg_enabled);
@@ -4227,6 +4243,12 @@ static void gr_gk20a_load_gating_prod(struct gk20a *g)
4227 g->slcg_enabled); 4243 g->slcg_enabled);
4228 4244
4229 /* blcg prod values */ 4245 /* blcg prod values */
4246 if (g->ops.clock_gating.blcg_bus_load_gating_prod)
4247 g->ops.clock_gating.blcg_bus_load_gating_prod(g,
4248 g->blcg_enabled);
4249 if (g->ops.clock_gating.blcg_ce_load_gating_prod)
4250 g->ops.clock_gating.blcg_ce_load_gating_prod(g,
4251 g->blcg_enabled);
4230 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled); 4252 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4231 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod) 4253 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod)
4232 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, 4254 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g,
@@ -7463,6 +7485,7 @@ static int gr_gk20a_dump_gr_status_regs(struct gk20a *g,
7463 return 0; 7485 return 0;
7464} 7486}
7465 7487
7488#ifdef CONFIG_DEBUG_FS
7466int gr_gk20a_debugfs_init(struct gk20a *g) 7489int gr_gk20a_debugfs_init(struct gk20a *g)
7467{ 7490{
7468 struct gk20a_platform *platform = platform_get_drvdata(g->dev); 7491 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
@@ -7474,6 +7497,7 @@ int gr_gk20a_debugfs_init(struct gk20a *g)
7474 7497
7475 return 0; 7498 return 0;
7476} 7499}
7500#endif
7477 7501
7478static void gr_gk20a_init_cyclestats(struct gk20a *g) 7502static void gr_gk20a_init_cyclestats(struct gk20a *g)
7479{ 7503{
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
index a9ad970a..9718aad2 100644
--- a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -22,6 +22,7 @@
22#include "gk20a_gating_reglist.h" 22#include "gk20a_gating_reglist.h"
23#include "channel_gk20a.h" 23#include "channel_gk20a.h"
24#include "gr_ctx_gk20a.h" 24#include "gr_ctx_gk20a.h"
25#include "fecs_trace_gk20a.h"
25#include "mm_gk20a.h" 26#include "mm_gk20a.h"
26#include "mc_gk20a.h" 27#include "mc_gk20a.h"
27#include "pmu_gk20a.h" 28#include "pmu_gk20a.h"
@@ -57,6 +58,7 @@ int gk20a_init_hal(struct gk20a *g)
57 gk20a_init_mc(gops); 58 gk20a_init_mc(gops);
58 gk20a_init_ltc(gops); 59 gk20a_init_ltc(gops);
59 gk20a_init_gr_ops(gops); 60 gk20a_init_gr_ops(gops);
61 gk20a_init_fecs_trace_ops(gops);
60 gk20a_init_fb(gops); 62 gk20a_init_fb(gops);
61 gk20a_init_fifo(gops); 63 gk20a_init_fifo(gops);
62 gk20a_init_ce2(gops); 64 gk20a_init_ce2(gops);
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
index 39cbbb58..da555f7c 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2012-2015, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -246,4 +246,192 @@ static inline u32 ctxsw_prog_main_image_context_id_o(void)
246{ 246{
247 return 0x000000f0; 247 return 0x000000f0;
248} 248}
249static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_o(void)
250{
251 return 0x000000ac;
252}
253static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(u32 v)
254{
255 return (v & 0xffff) << 0;
256}
257static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(void)
258{
259 return 0x000000b0;
260}
261static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_v_m(void)
262{
263 return 0xfffffff << 0;
264}
265static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_m(void)
266{
267 return 0x3 << 28;
268}
269static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f(void)
270{
271 return 0x0;
272}
273static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(void)
274{
275 return 0x20000000;
276}
277static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(void)
278{
279 return 0x30000000;
280}
281static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(void)
282{
283 return 0x000000b4;
284}
285static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(u32 v)
286{
287 return (v & 0xffffffff) << 0;
288}
289static inline u32 ctxsw_prog_record_timestamp_record_size_in_bytes_v(void)
290{
291 return 0x00000080;
292}
293static inline u32 ctxsw_prog_record_timestamp_record_size_in_words_v(void)
294{
295 return 0x00000020;
296}
297static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_o(void)
298{
299 return 0x00000000;
300}
301static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_v_value_v(void)
302{
303 return 0x00000000;
304}
305static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_o(void)
306{
307 return 0x00000004;
308}
309static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_v_value_v(void)
310{
311 return 0x600dbeef;
312}
313static inline u32 ctxsw_prog_record_timestamp_context_id_o(void)
314{
315 return 0x00000008;
316}
317static inline u32 ctxsw_prog_record_timestamp_context_ptr_o(void)
318{
319 return 0x0000000c;
320}
321static inline u32 ctxsw_prog_record_timestamp_new_context_id_o(void)
322{
323 return 0x00000010;
324}
325static inline u32 ctxsw_prog_record_timestamp_new_context_ptr_o(void)
326{
327 return 0x00000014;
328}
329static inline u32 ctxsw_prog_record_timestamp_timestamp_lo_o(void)
330{
331 return 0x00000018;
332}
333static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_o(void)
334{
335 return 0x0000001c;
336}
337static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_f(u32 v)
338{
339 return (v & 0xffffff) << 0;
340}
341static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_v(u32 r)
342{
343 return (r >> 0) & 0xffffff;
344}
345static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_f(u32 v)
346{
347 return (v & 0xff) << 24;
348}
349static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_m(void)
350{
351 return 0xff << 24;
352}
353static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_v(u32 r)
354{
355 return (r >> 24) & 0xff;
356}
357static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_v(void)
358{
359 return 0x00000001;
360}
361static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_f(void)
362{
363 return 0x1000000;
364}
365static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_v(void)
366{
367 return 0x00000002;
368}
369static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_f(void)
370{
371 return 0x2000000;
372}
373static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_v(void)
374{
375 return 0x0000000a;
376}
377static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_f(void)
378{
379 return 0xa000000;
380}
381static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_v(void)
382{
383 return 0x0000000b;
384}
385static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_f(void)
386{
387 return 0xb000000;
388}
389static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_v(void)
390{
391 return 0x0000000c;
392}
393static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_f(void)
394{
395 return 0xc000000;
396}
397static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_v(void)
398{
399 return 0x0000000d;
400}
401static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_f(void)
402{
403 return 0xd000000;
404}
405static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_v(void)
406{
407 return 0x00000003;
408}
409static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_f(void)
410{
411 return 0x3000000;
412}
413static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_v(void)
414{
415 return 0x00000004;
416}
417static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_f(void)
418{
419 return 0x4000000;
420}
421static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_v(void)
422{
423 return 0x00000005;
424}
425static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_f(void)
426{
427 return 0x5000000;
428}
429static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v(void)
430{
431 return 0x000000ff;
432}
433static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_f(void)
434{
435 return 0xff000000;
436}
249#endif 437#endif
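
The accessors above spell out the FECS timestamp record layout consumed by CTXSW trace: the outgoing/incoming context ids and pointers sit at offsets 0x08-0x14, and a 64-bit timestamp is split across 0x18/0x1c, with the top byte of the high word carrying the event tag. A minimal kernel-side sketch of decoding one record with these helpers, assuming "record" points at the raw little-endian 32-bit words (the function name and the emit step are hypothetical):

static void decode_fecs_record_sketch(const u32 *record)
{
	u32 ctx_id = record[ctxsw_prog_record_timestamp_context_id_o() / sizeof(u32)];
	u32 new_ctx_id = record[ctxsw_prog_record_timestamp_new_context_id_o() / sizeof(u32)];
	u32 lo = record[ctxsw_prog_record_timestamp_timestamp_lo_o() / sizeof(u32)];
	u32 hi = record[ctxsw_prog_record_timestamp_timestamp_hi_o() / sizeof(u32)];
	u32 tag = ctxsw_prog_record_timestamp_timestamp_hi_tag_v(hi);
	/* 56-bit timestamp: the 24 low bits of the hi word sit above the 32-bit lo word */
	u64 ts = ((u64)ctxsw_prog_record_timestamp_timestamp_hi_v_v(hi) << 32) | lo;

	if (tag == ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v())
		return; /* record carries no usable timestamp */
	/* ... emit (ctx_id, new_ctx_id, tag, ts) into the trace ring ... */
}
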
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
index 6db5654b..94770431 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -258,6 +258,10 @@ static inline u32 ltc_ltcs_ltss_intr_en_evicted_cb_m(void)
258{ 258{
259 return 0x1 << 20; 259 return 0x1 << 20;
260} 260}
261static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_m(void)
262{
263 return 0x1 << 21;
264}
261static inline u32 ltc_ltc0_lts0_intr_r(void) 265static inline u32 ltc_ltc0_lts0_intr_r(void)
262{ 266{
263 return 0x00141020; 267 return 0x00141020;
diff --git a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
index 22bc50ac..4cb36cbe 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2013-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -98,4 +98,12 @@ static inline u32 timer_pri_timeout_fecs_errcode_r(void)
98{ 98{
99 return 0x0000908c; 99 return 0x0000908c;
100} 100}
101static inline u32 timer_time_0_r(void)
102{
103 return 0x00009400;
104}
105static inline u32 timer_time_1_r(void)
106{
107 return 0x00009410;
108}
101#endif 109#endif
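
timer_time_0_r()/timer_time_1_r() expose the free-running PTIMER counter as low/high 32-bit words (the same pair is added to the gm20b header further down). Since the two words cannot be read atomically, a consistent 64-bit sample needs a hi/lo/hi read sequence; a minimal sketch, assuming TIME_1 holds the upper 32 bits:

static u64 read_ptimer_sketch(struct gk20a *g)
{
	u32 hi = gk20a_readl(g, timer_time_1_r());
	u32 lo, hi2;

	/* re-read the high word until it is stable across the low-word read */
	do {
		hi2 = hi;
		lo = gk20a_readl(g, timer_time_0_r());
		hi = gk20a_readl(g, timer_time_1_r());
	} while (hi != hi2);

	return ((u64)hi << 32) | lo;
}

This is the kind of GPU-side sample the new GET_CPU_TIME_CORRELATION_INFO ioctl (see the uapi hunk below) pairs with a CPU clock read.
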
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
index c6ff07da..0d9a98b4 100644
--- a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
@@ -1,9 +1,7 @@
1/* 1/*
2 * drivers/video/tegra/host/gk20a/ltc_gk20a.c 2 * GK20A L2
3 * 3 *
4 * GK20A Graphics 4 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
7 * 5 *
8 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -173,9 +171,17 @@ out:
173 171
174static void gk20a_ltc_init_fs_state(struct gk20a *g) 172static void gk20a_ltc_init_fs_state(struct gk20a *g)
175{ 173{
174 u32 reg;
175
176 gk20a_dbg_info("initialize gk20a L2"); 176 gk20a_dbg_info("initialize gk20a L2");
177 177
178 g->max_ltc_count = g->ltc_count = 1; 178 g->max_ltc_count = g->ltc_count = 1;
179
180 /* Disable LTC interrupts */
181 reg = gk20a_readl(g, ltc_ltcs_ltss_intr_r());
182 reg &= ~ltc_ltcs_ltss_intr_en_evicted_cb_m();
183 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_m();
184 gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg);
179} 185}
180 186
181static void gk20a_ltc_isr(struct gk20a *g) 187static void gk20a_ltc_isr(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 738df2af..7a02d68e 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A memory management 2 * GK20A memory management
3 * 3 *
4 * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -323,7 +323,7 @@ static int gk20a_alloc_comptags(struct gk20a *g,
323 if (err) 323 if (err)
324 return err; 324 return err;
325 325
326 /* 326 /*
327 * offset needs to be at the start of a page/cacheline boundary; 327 * offset needs to be at the start of a page/cacheline boundary;
328 * prune the preceding ctaglines that were allocated for alignment. 328 * prune the preceding ctaglines that were allocated for alignment.
329 */ 329 */
@@ -1290,12 +1290,6 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
1290 int ctag_granularity = g->ops.fb.compression_page_size(g); 1290 int ctag_granularity = g->ops.fb.compression_page_size(g);
1291 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); 1291 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
1292 1292
1293 if (clear_ctags && ctag_offset) {
1294 /* init/clear the ctag buffer */
1295 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
1296 ctag_offset, ctag_offset + ctag_lines - 1);
1297 }
1298
1299 /* Allocate (or validate when map_offset != 0) the virtual address. */ 1293 /* Allocate (or validate when map_offset != 0) the virtual address. */
1300 if (!map_offset) { 1294 if (!map_offset) {
1301 map_offset = gk20a_vm_alloc_va(vm, size, 1295 map_offset = gk20a_vm_alloc_va(vm, size,
@@ -1651,17 +1645,14 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
1651 bfr.kind_v = bfr.uc_kind_v; 1645 bfr.kind_v = bfr.uc_kind_v;
1652 } else { 1646 } else {
1653 gk20a_get_comptags(d, dmabuf, &comptags); 1647 gk20a_get_comptags(d, dmabuf, &comptags);
1654 clear_ctags = true;
1655
1656 if (comptags.lines < comptags.allocated_lines) {
1657 /* clear tail-padding comptags */
1658 u32 ctagmin = comptags.offset + comptags.lines;
1659 u32 ctagmax = comptags.offset +
1660 comptags.allocated_lines - 1;
1661 1648
1649 if (g->ops.ltc.cbc_ctrl)
1662 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear, 1650 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
1663 ctagmin, ctagmax); 1651 comptags.offset,
1664 } 1652 comptags.offset +
1653 comptags.allocated_lines - 1);
1654 else
1655 clear_ctags = true;
1665 } 1656 }
1666 } 1657 }
1667 1658
@@ -2815,6 +2806,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2815 u64 small_vma_start, small_vma_limit, large_vma_start, large_vma_limit, 2806 u64 small_vma_start, small_vma_limit, large_vma_start, large_vma_limit,
2816 kernel_vma_start, kernel_vma_limit; 2807 kernel_vma_start, kernel_vma_limit;
2817 u32 pde_lo, pde_hi; 2808 u32 pde_lo, pde_hi;
2809 struct gk20a *g = mm->g;
2818 2810
2819 /* note: this must match gmmu_pgsz_gk20a enum */ 2811 /* note: this must match gmmu_pgsz_gk20a enum */
2820 u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size, SZ_4K }; 2812 u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size, SZ_4K };
@@ -2904,6 +2896,31 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2904 goto clean_up_pdes; 2896 goto clean_up_pdes;
2905 } 2897 }
2906 2898
2899 /*
2900 * Attempt to make a separate VM for fixed allocations.
2901 */
2902 if (g->separate_fixed_allocs &&
2903 small_vma_start < small_vma_limit) {
2904 if (g->separate_fixed_allocs >= small_vma_limit)
2905 goto clean_up_pdes;
2906
2907 snprintf(alloc_name, sizeof(alloc_name),
2908 "gk20a_%s-fixed", name);
2909
2910 err = __gk20a_allocator_init(&vm->fixed,
2911 vm, alloc_name,
2912 small_vma_start,
2913 g->separate_fixed_allocs,
2914 SZ_4K,
2915 GPU_BALLOC_MAX_ORDER,
2916 GPU_BALLOC_GVA_SPACE);
2917 if (err)
2918 goto clean_up_ptes;
2919
2920 /* Make sure to update the user vma size. */
2921 small_vma_start = g->separate_fixed_allocs;
2922 }
2923
2907 if (small_vma_start < small_vma_limit) { 2924 if (small_vma_start < small_vma_limit) {
2908 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, 2925 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name,
2909 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10); 2926 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10);
@@ -3066,14 +3083,17 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
3066 } 3083 }
3067 3084
3068 vma = &vm->vma[pgsz_idx]; 3085 vma = &vm->vma[pgsz_idx];
3069 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) 3086 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) {
3087 if (vm->fixed.init)
3088 vma = &vm->fixed;
3070 vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset, 3089 vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset,
3071 (u64)args->pages * 3090 (u64)args->pages *
3072 (u64)args->page_size); 3091 (u64)args->page_size);
3073 else 3092 } else {
3074 vaddr_start = gk20a_balloc(vma, 3093 vaddr_start = gk20a_balloc(vma,
3075 (u64)args->pages * 3094 (u64)args->pages *
3076 (u64)args->page_size); 3095 (u64)args->page_size);
3096 }
3077 3097
3078 if (!vaddr_start) { 3098 if (!vaddr_start) {
3079 kfree(va_node); 3099 kfree(va_node);
@@ -3140,7 +3160,10 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
3140 pgsz_idx = __nv_gmmu_va_is_big_page_region(vm, args->offset) ? 3160 pgsz_idx = __nv_gmmu_va_is_big_page_region(vm, args->offset) ?
3141 gmmu_page_size_big : gmmu_page_size_small; 3161 gmmu_page_size_big : gmmu_page_size_small;
3142 3162
3143 vma = &vm->vma[pgsz_idx]; 3163 if (vm->fixed.init)
3164 vma = &vm->fixed;
3165 else
3166 vma = &vm->vma[pgsz_idx];
3144 gk20a_bfree(vma, args->offset); 3167 gk20a_bfree(vma, args->offset);
3145 3168
3146 mutex_lock(&vm->update_gmmu_lock); 3169 mutex_lock(&vm->update_gmmu_lock);
@@ -3330,6 +3353,8 @@ void gk20a_deinit_vm(struct vm_gk20a *vm)
3330 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]); 3353 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
3331 if (vm->vma[gmmu_page_size_small].init) 3354 if (vm->vma[gmmu_page_size_small].init)
3332 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]); 3355 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
3356 if (vm->fixed.init)
3357 gk20a_allocator_destroy(&vm->fixed);
3333 3358
3334 gk20a_vm_free_entries(vm, &vm->pdb, 0); 3359 gk20a_vm_free_entries(vm, &vm->pdb, 0);
3335} 3360}
@@ -3843,6 +3868,16 @@ clean_up:
3843 return err; 3868 return err;
3844} 3869}
3845 3870
3871void gk20a_mm_debugfs_init(struct platform_device *pdev)
3872{
3873 struct gk20a_platform *platform = platform_get_drvdata(pdev);
3874 struct dentry *gpu_root = platform->debugfs;
3875 struct gk20a *g = gk20a_get_platform(pdev)->g;
3876
3877 debugfs_create_x64("separate_fixed_allocs", 0664, gpu_root,
3878 &g->separate_fixed_allocs);
3879}
3880
3846void gk20a_init_mm(struct gpu_ops *gops) 3881void gk20a_init_mm(struct gpu_ops *gops)
3847{ 3882{
3848 gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled; 3883 gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled;
@@ -3863,4 +3898,3 @@ void gk20a_init_mm(struct gpu_ops *gops)
3863 gops->mm.init_pdb = gk20a_mm_init_pdb; 3898 gops->mm.init_pdb = gk20a_mm_init_pdb;
3864 gops->mm.init_mm_setup_hw = gk20a_init_mm_setup_hw; 3899 gops->mm.init_mm_setup_hw = gk20a_init_mm_setup_hw;
3865} 3900}
3866
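
Taken together, the mm_gk20a.c hunks make the small-page VMA optionally splittable: when the new separate_fixed_allocs debugfs value is nonzero at VM init, it acts as a GPU VA boundary; fixed-offset allocations get a dedicated vm->fixed buddy allocator over [small_vma_start, boundary), and the regular small-page allocator is moved up to start at the boundary. Condensed, the routing that results (a sketch, not part of the patch):

/* alloc: only FIXED_OFFSET requests are steered to the split allocator */
struct gk20a_allocator *vma = &vm->vma[pgsz_idx];

if ((args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) && vm->fixed.init)
	vma = &vm->fixed;

/* free: routed through vm->fixed whenever it was initialized */
vma = vm->fixed.init ? &vm->fixed : &vm->vma[pgsz_idx];
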
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index b8b0ca49..368b32d3 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -259,6 +259,10 @@ struct vm_gk20a {
259 struct gk20a_mm_entry pdb; 259 struct gk20a_mm_entry pdb;
260 260
261 struct gk20a_allocator vma[gmmu_nr_page_sizes]; 261 struct gk20a_allocator vma[gmmu_nr_page_sizes];
262
263 /* If necessary, split fixed from non-fixed. */
264 struct gk20a_allocator fixed;
265
262 struct rb_root mapped_buffers; 266 struct rb_root mapped_buffers;
263 267
264 struct list_head reserved_va_list; 268 struct list_head reserved_va_list;
@@ -279,6 +283,7 @@ struct channel_gk20a;
279int gk20a_init_mm_support(struct gk20a *g); 283int gk20a_init_mm_support(struct gk20a *g);
280int gk20a_init_mm_setup_sw(struct gk20a *g); 284int gk20a_init_mm_setup_sw(struct gk20a *g);
281int gk20a_init_mm_setup_hw(struct gk20a *g); 285int gk20a_init_mm_setup_hw(struct gk20a *g);
286void gk20a_mm_debugfs_init(struct platform_device *pdev);
282 287
283int gk20a_mm_fb_flush(struct gk20a *g); 288int gk20a_mm_fb_flush(struct gk20a *g);
284void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate); 289void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
@@ -332,9 +337,9 @@ struct mm_gk20a {
332#ifdef CONFIG_DEBUG_FS 337#ifdef CONFIG_DEBUG_FS
333 u32 ltc_enabled; 338 u32 ltc_enabled;
334 u32 ltc_enabled_debug; 339 u32 ltc_enabled_debug;
340#endif
335 u32 bypass_smmu; 341 u32 bypass_smmu;
336 u32 disable_bigpage; 342 u32 disable_bigpage;
337#endif
338}; 343};
339 344
340int gk20a_mm_init(struct mm_gk20a *mm); 345int gk20a_mm_init(struct mm_gk20a *mm);
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
index 84b3fcaf..6bffed9e 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -59,8 +59,10 @@ struct gk20a_platform {
59 struct clk *clk[3]; 59 struct clk *clk[3];
60 int num_clks; 60 int num_clks;
61 61
62#ifdef CONFIG_RESET_CONTROLLER
62 /* Reset control for device */ 63 /* Reset control for device */
63 struct reset_control *reset_control; 64 struct reset_control *reset_control;
65#endif
64 66
65 /* Delay before rail gated */ 67 /* Delay before rail gated */
66 int railgate_delay; 68 int railgate_delay;
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
index 60ffa381..15d6609d 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A Tegra Platform Interface 4 * GK20A Tegra Platform Interface
5 * 5 *
6 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -822,7 +822,7 @@ static long gk20a_round_clk_rate(struct platform_device *dev,
822 return gk20a_clk_round_rate(g, rate); 822 return gk20a_clk_round_rate(g, rate);
823} 823}
824 824
825int gk20a_set_clk_rate(struct platform_device *dev, unsigned long rate) 825static int gk20a_set_clk_rate(struct platform_device *dev, unsigned long rate)
826{ 826{
827 struct gk20a_platform *platform = gk20a_get_platform(dev); 827 struct gk20a_platform *platform = gk20a_get_platform(dev);
828 struct gk20a *g = platform->g; 828 struct gk20a *g = platform->g;
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 30592ee2..60c87979 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -4426,7 +4426,7 @@ int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id)
4426 return status; 4426 return status;
4427} 4427}
4428 4428
4429#if CONFIG_DEBUG_FS 4429#ifdef CONFIG_DEBUG_FS
4430static int elpg_residency_show(struct seq_file *s, void *data) 4430static int elpg_residency_show(struct seq_file *s, void *data)
4431{ 4431{
4432 struct gk20a *g = s->private; 4432 struct gk20a *g = s->private;
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 4421744c..b41cca08 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -228,6 +228,7 @@ int gk20a_tsg_open(struct gk20a *g, struct file *filp)
228 228
229 tsg->tsg_gr_ctx = NULL; 229 tsg->tsg_gr_ctx = NULL;
230 tsg->vm = NULL; 230 tsg->vm = NULL;
231 tsg->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
231 232
232 filp->private_data = tsg; 233 filp->private_data = tsg;
233 234
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index bcc4d0c4..7e0a75d1 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -49,6 +49,8 @@ struct tsg_gk20a {
49 struct gr_ctx_desc *tsg_gr_ctx; 49 struct gr_ctx_desc *tsg_gr_ctx;
50 50
51 struct vm_gk20a *vm; 51 struct vm_gk20a *vm;
52
53 u32 interleave_level;
52}; 54};
53 55
54int gk20a_enable_tsg(struct tsg_gk20a *tsg); 56int gk20a_enable_tsg(struct tsg_gk20a *tsg);
diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
index 9f137246..8a0be106 100644
--- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
@@ -362,7 +362,7 @@ int prepare_ucode_blob(struct gk20a *g)
362 gm20b_dbg_pmu("prepare ucode blob return 0\n"); 362 gm20b_dbg_pmu("prepare ucode blob return 0\n");
363 free_acr_resources(g, plsfm); 363 free_acr_resources(g, plsfm);
364 free_sgt: 364 free_sgt:
365 kfree(sgt); 365 gk20a_free_sgtable(&sgt);
366 return err; 366 return err;
367} 367}
368 368
diff --git a/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c b/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
index d1deffb9..b9763224 100644
--- a/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GM20B Fifo 2 * GM20B Fifo
3 * 3 *
4 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -114,6 +114,7 @@ void gm20b_init_fifo(struct gpu_ops *gops)
114 gops->fifo.free_inst = channel_gk20a_free_inst; 114 gops->fifo.free_inst = channel_gk20a_free_inst;
115 gops->fifo.setup_ramfc = channel_gk20a_setup_ramfc; 115 gops->fifo.setup_ramfc = channel_gk20a_setup_ramfc;
116 gops->fifo.channel_set_priority = gk20a_channel_set_priority; 116 gops->fifo.channel_set_priority = gk20a_channel_set_priority;
117 gops->fifo.channel_set_timeslice = gk20a_channel_set_timeslice;
117 118
118 gops->fifo.preempt_channel = gk20a_fifo_preempt_channel; 119 gops->fifo.preempt_channel = gk20a_fifo_preempt_channel;
119 gops->fifo.update_runlist = gk20a_fifo_update_runlist; 120 gops->fifo.update_runlist = gk20a_fifo_update_runlist;
@@ -121,4 +122,5 @@ void gm20b_init_fifo(struct gpu_ops *gops)
121 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle; 122 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle;
122 gops->fifo.get_num_fifos = gm20b_fifo_get_num_fifos; 123 gops->fifo.get_num_fifos = gm20b_fifo_get_num_fifos;
123 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature; 124 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature;
125 gops->fifo.set_runlist_interleave = gk20a_fifo_set_runlist_interleave;
124} 126}
diff --git a/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h
index 95e0c43d..aa01e945 100644
--- a/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -286,6 +286,10 @@ static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_access_m(void)
286{ 286{
287 return 0x1 << 30; 287 return 0x1 << 30;
288} 288}
289static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_m(void)
290{
291 return 0x1 << 21;
292}
289static inline u32 ltc_ltc0_lts0_intr_r(void) 293static inline u32 ltc_ltc0_lts0_intr_r(void)
290{ 294{
291 return 0x0014040c; 295 return 0x0014040c;
diff --git a/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h
index 126f7c8c..06d02522 100644
--- a/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -98,4 +98,12 @@ static inline u32 timer_pri_timeout_fecs_errcode_r(void)
98{ 98{
99 return 0x0000908c; 99 return 0x0000908c;
100} 100}
101static inline u32 timer_time_0_r(void)
102{
103 return 0x00009400;
104}
105static inline u32 timer_time_1_r(void)
106{
107 return 0x00009410;
108}
101#endif 109#endif
diff --git a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
index 5b6bff7f..ffc36903 100644
--- a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GM20B L2 2 * GM20B L2
3 * 3 *
4 * Copyright (c) 2014-2015 NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2014-2016 NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -190,6 +190,7 @@ void gm20b_ltc_init_fs_state(struct gk20a *g)
190 reg = gk20a_readl(g, ltc_ltcs_ltss_intr_r()); 190 reg = gk20a_readl(g, ltc_ltcs_ltss_intr_r());
191 reg &= ~ltc_ltcs_ltss_intr_en_evicted_cb_m(); 191 reg &= ~ltc_ltcs_ltss_intr_en_evicted_cb_m();
192 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_access_m(); 192 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_access_m();
193 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_m();
193 gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg); 194 gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg);
194} 195}
195 196
diff --git a/drivers/gpu/nvgpu/gm20b/therm_gm20b.c b/drivers/gpu/nvgpu/gm20b/therm_gm20b.c
index 5bd22841..6ebc4c91 100644
--- a/drivers/gpu/nvgpu/gm20b/therm_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/therm_gm20b.c
@@ -15,6 +15,7 @@
15 15
16#include "gk20a/gk20a.h" 16#include "gk20a/gk20a.h"
17#include "hw_therm_gm20b.h" 17#include "hw_therm_gm20b.h"
18#include "therm_gm20b.h"
18 19
19static int gm20b_init_therm_setup_hw(struct gk20a *g) 20static int gm20b_init_therm_setup_hw(struct gk20a *g)
20{ 21{
diff --git a/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c
new file mode 100644
index 00000000..cb955811
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c
@@ -0,0 +1,21 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include <linux/string.h>
15#include "gk20a/gk20a.h"
16#include "fecs_trace_vgpu.h"
17
18void vgpu_init_fecs_trace_ops(struct gpu_ops *ops)
19{
20 memset(&ops->fecs_trace, 0, sizeof(ops->fecs_trace));
21}
diff --git a/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h
new file mode 100644
index 00000000..1aace1fe
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#ifndef __FECS_TRACE_VGPU_H
15#define __FECS_TRACE_VGPU_H
16
17struct gpu_ops;
18void vgpu_init_fecs_trace_ops(struct gpu_ops *ops);
19
20#endif /* __FECS_TRACE_VGPU_H */
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index e776e97c..9e40218d 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Virtualized GPU Fifo 2 * Virtualized GPU Fifo
3 * 3 *
4 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -81,6 +81,7 @@ static int vgpu_channel_alloc_inst(struct gk20a *g, struct channel_gk20a *ch)
81 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_HWCTX; 81 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_HWCTX;
82 msg.handle = platform->virt_handle; 82 msg.handle = platform->virt_handle;
83 p->id = ch->hw_chid; 83 p->id = ch->hw_chid;
84 p->pid = (u64)current->pid;
84 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); 85 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
85 if (err || msg.ret) { 86 if (err || msg.ret) {
86 gk20a_err(dev_from_gk20a(g), "fail"); 87 gk20a_err(dev_from_gk20a(g), "fail");
@@ -194,12 +195,6 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
194 if (!runlist->active_channels) 195 if (!runlist->active_channels)
195 goto clean_up_runlist_info; 196 goto clean_up_runlist_info;
196 197
197 runlist->high_prio_channels =
198 kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
199 GFP_KERNEL);
200 if (!runlist->high_prio_channels)
201 goto clean_up_runlist_info;
202
203 runlist_size = sizeof(u16) * f->num_channels; 198 runlist_size = sizeof(u16) * f->num_channels;
204 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { 199 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
205 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]); 200 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
@@ -222,9 +217,6 @@ clean_up_runlist:
222 gk20a_gmmu_free(g, &runlist->mem[i]); 217 gk20a_gmmu_free(g, &runlist->mem[i]);
223 218
224clean_up_runlist_info: 219clean_up_runlist_info:
225 kfree(runlist->high_prio_channels);
226 runlist->high_prio_channels = NULL;
227
228 kfree(runlist->active_channels); 220 kfree(runlist->active_channels);
229 runlist->active_channels = NULL; 221 runlist->active_channels = NULL;
230 222
@@ -550,6 +542,54 @@ static int vgpu_channel_set_priority(struct channel_gk20a *ch, u32 priority)
550 return err ? err : msg.ret; 542 return err ? err : msg.ret;
551} 543}
552 544
545static int vgpu_fifo_set_runlist_interleave(struct gk20a *g,
546 u32 id,
547 bool is_tsg,
548 u32 runlist_id,
549 u32 new_level)
550{
551 struct gk20a_platform *platform = gk20a_get_platform(g->dev);
552 struct tegra_vgpu_cmd_msg msg;
553 struct tegra_vgpu_channel_runlist_interleave_params *p =
554 &msg.params.channel_interleave;
555 struct channel_gk20a *ch;
556 int err;
557
558 gk20a_dbg_fn("");
559
560 /* FIXME: add support for TSGs */
561 if (is_tsg)
562 return -ENOSYS;
563
564 ch = &g->fifo.channel[id];
565 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SET_RUNLIST_INTERLEAVE;
566 msg.handle = platform->virt_handle;
567 p->handle = ch->virt_ctx;
568 p->level = new_level;
569 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
570 WARN_ON(err || msg.ret);
571 return err ? err : msg.ret;
572}
573
574int vgpu_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice)
575{
576 struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
577 struct tegra_vgpu_cmd_msg msg;
578 struct tegra_vgpu_channel_timeslice_params *p =
579 &msg.params.channel_timeslice;
580 int err;
581
582 gk20a_dbg_fn("");
583
584 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SET_TIMESLICE;
585 msg.handle = platform->virt_handle;
586 p->handle = ch->virt_ctx;
587 p->timeslice_us = timeslice;
588 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
589 WARN_ON(err || msg.ret);
590 return err ? err : msg.ret;
591}
592
553static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g, 593static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g,
554 struct channel_gk20a *ch) 594 struct channel_gk20a *ch)
555{ 595{
@@ -635,5 +675,6 @@ void vgpu_init_fifo_ops(struct gpu_ops *gops)
635 gops->fifo.update_runlist = vgpu_fifo_update_runlist; 675 gops->fifo.update_runlist = vgpu_fifo_update_runlist;
636 gops->fifo.wait_engine_idle = vgpu_fifo_wait_engine_idle; 676 gops->fifo.wait_engine_idle = vgpu_fifo_wait_engine_idle;
637 gops->fifo.channel_set_priority = vgpu_channel_set_priority; 677 gops->fifo.channel_set_priority = vgpu_channel_set_priority;
678 gops->fifo.set_runlist_interleave = vgpu_fifo_set_runlist_interleave;
679 gops->fifo.channel_set_timeslice = vgpu_channel_set_timeslice;
638} 680}
639
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index e8328326..5a953e20 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -18,6 +18,7 @@
18#include <linux/dma-mapping.h> 18#include <linux/dma-mapping.h>
19#include <linux/pm_runtime.h> 19#include <linux/pm_runtime.h>
20#include "vgpu/vgpu.h" 20#include "vgpu/vgpu.h"
21#include "vgpu/fecs_trace_vgpu.h"
21#include "gk20a/debug_gk20a.h" 22#include "gk20a/debug_gk20a.h"
22#include "gk20a/hal_gk20a.h" 23#include "gk20a/hal_gk20a.h"
23#include "gk20a/hw_mc_gk20a.h" 24#include "gk20a/hw_mc_gk20a.h"
@@ -259,6 +260,7 @@ void vgpu_init_hal_common(struct gk20a *g)
259 vgpu_init_ltc_ops(gops); 260 vgpu_init_ltc_ops(gops);
260 vgpu_init_mm_ops(gops); 261 vgpu_init_mm_ops(gops);
261 vgpu_init_debug_ops(gops); 262 vgpu_init_debug_ops(gops);
263 vgpu_init_fecs_trace_ops(gops);
262} 264}
263 265
264static int vgpu_init_hal(struct gk20a *g) 266static int vgpu_init_hal(struct gk20a *g)
diff --git a/include/linux/tegra_vgpu.h b/include/linux/tegra_vgpu.h
index 280ca9c0..c4dd81dd 100644
--- a/include/linux/tegra_vgpu.h
+++ b/include/linux/tegra_vgpu.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Tegra GPU Virtualization Interfaces to Server 2 * Tegra GPU Virtualization Interfaces to Server
3 * 3 *
4 * Copyright (c) 2014-2015, NVIDIA Corporation. All rights reserved. 4 * Copyright (c) 2014-2016, NVIDIA Corporation. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -74,7 +74,9 @@ enum {
74 TEGRA_VGPU_CMD_SET_MMU_DEBUG_MODE, 74 TEGRA_VGPU_CMD_SET_MMU_DEBUG_MODE,
75 TEGRA_VGPU_CMD_SET_SM_DEBUG_MODE, 75 TEGRA_VGPU_CMD_SET_SM_DEBUG_MODE,
76 TEGRA_VGPU_CMD_REG_OPS, 76 TEGRA_VGPU_CMD_REG_OPS,
77 TEGRA_VGPU_CMD_CHANNEL_SET_PRIORITY 77 TEGRA_VGPU_CMD_CHANNEL_SET_PRIORITY,
78 TEGRA_VGPU_CMD_CHANNEL_SET_RUNLIST_INTERLEAVE,
79 TEGRA_VGPU_CMD_CHANNEL_SET_TIMESLICE
78}; 80};
79 81
80struct tegra_vgpu_connect_params { 82struct tegra_vgpu_connect_params {
@@ -84,6 +86,7 @@ struct tegra_vgpu_connect_params {
84 86
85struct tegra_vgpu_channel_hwctx_params { 87struct tegra_vgpu_channel_hwctx_params {
86 u32 id; 88 u32 id;
89 u64 pid;
87 u64 handle; 90 u64 handle;
88}; 91};
89 92
@@ -298,6 +301,17 @@ struct tegra_vgpu_channel_priority_params {
298 u32 priority; 301 u32 priority;
299}; 302};
300 303
304/* level follows nvgpu.h definitions */
305struct tegra_vgpu_channel_runlist_interleave_params {
306 u64 handle;
307 u32 level;
308};
309
310struct tegra_vgpu_channel_timeslice_params {
311 u64 handle;
312 u32 timeslice_us;
313};
314
301struct tegra_vgpu_cmd_msg { 315struct tegra_vgpu_cmd_msg {
302 u32 cmd; 316 u32 cmd;
303 int ret; 317 int ret;
@@ -326,6 +340,8 @@ struct tegra_vgpu_cmd_msg {
326 struct tegra_vgpu_sm_debug_mode sm_debug_mode; 340 struct tegra_vgpu_sm_debug_mode sm_debug_mode;
327 struct tegra_vgpu_reg_ops_params reg_ops; 341 struct tegra_vgpu_reg_ops_params reg_ops;
328 struct tegra_vgpu_channel_priority_params channel_priority; 342 struct tegra_vgpu_channel_priority_params channel_priority;
343 struct tegra_vgpu_channel_runlist_interleave_params channel_interleave;
344 struct tegra_vgpu_channel_timeslice_params channel_timeslice;
329 char padding[192]; 345 char padding[192];
330 } params; 346 } params;
331}; 347};
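
The two new params-union members ride inside the existing 192-byte padding, so sizeof(struct tegra_vgpu_cmd_msg) - and hence the guest/server wire format - is unchanged. A hypothetical compile-time guard (not in the tree) that would catch a member outgrowing the padding:

#include <linux/bug.h>

static inline void tegra_vgpu_msg_size_check(void)
{
	BUILD_BUG_ON(sizeof(struct tegra_vgpu_channel_runlist_interleave_params) >
		     sizeof(((struct tegra_vgpu_cmd_msg *)0)->params.padding));
	BUILD_BUG_ON(sizeof(struct tegra_vgpu_channel_timeslice_params) >
		     sizeof(((struct tegra_vgpu_cmd_msg *)0)->params.padding));
}
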
diff --git a/include/trace/events/gk20a.h b/include/trace/events/gk20a.h
index 461ff6e8..23b5b642 100644
--- a/include/trace/events/gk20a.h
+++ b/include/trace/events/gk20a.h
@@ -387,7 +387,7 @@ TRACE_EVENT(gk20a_as_ioctl_get_va_regions,
387TRACE_EVENT(gk20a_mmu_fault, 387TRACE_EVENT(gk20a_mmu_fault,
388 TP_PROTO(u32 fault_hi, u32 fault_lo, 388 TP_PROTO(u32 fault_hi, u32 fault_lo,
389 u32 fault_info, 389 u32 fault_info,
390 u32 instance, 390 u64 instance,
391 u32 engine_id, 391 u32 engine_id,
392 const char *engine, 392 const char *engine,
393 const char *client, 393 const char *client,
@@ -398,7 +398,7 @@ TRACE_EVENT(gk20a_mmu_fault,
398 __field(u32, fault_hi) 398 __field(u32, fault_hi)
399 __field(u32, fault_lo) 399 __field(u32, fault_lo)
400 __field(u32, fault_info) 400 __field(u32, fault_info)
401 __field(u32, instance) 401 __field(u64, instance)
402 __field(u32, engine_id) 402 __field(u32, engine_id)
403 __field(const char *, engine) 403 __field(const char *, engine)
404 __field(const char *, client) 404 __field(const char *, client)
@@ -414,7 +414,7 @@ TRACE_EVENT(gk20a_mmu_fault,
414 __entry->client = client; 414 __entry->client = client;
415 __entry->fault_type = fault_type; 415 __entry->fault_type = fault_type;
416 ), 416 ),
417 TP_printk("fault=0x%x,%08x info=0x%x instance=0x%x engine_id=%d engine=%s client=%s type=%s", 417 TP_printk("fault=0x%x,%08x info=0x%x instance=0x%llx engine_id=%d engine=%s client=%s type=%s",
418 __entry->fault_hi, __entry->fault_lo, 418 __entry->fault_hi, __entry->fault_lo,
419 __entry->fault_info, __entry->instance, __entry->engine_id, 419 __entry->fault_info, __entry->instance, __entry->engine_id,
420 __entry->engine, __entry->client, __entry->fault_type) 420 __entry->engine, __entry->client, __entry->fault_type)
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 442a84ac..64ac45b5 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -351,6 +351,28 @@ struct nvgpu_gpu_get_buffer_info_args {
351 }; 351 };
352}; 352};
353 353
354#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_MAX_COUNT 16
355#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TSC 1
356#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_JIFFIES 2
357#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TIMEOFDAY 3
358
359struct nvgpu_gpu_get_cpu_time_correlation_sample {
360 /* cpu timestamp value */
361 __u64 cpu_timestamp;
362 /* raw GPU counter (PTIMER) value */
363 __u64 gpu_timestamp;
364};
365
366struct nvgpu_gpu_get_cpu_time_correlation_info_args {
367 /* timestamp pairs */
368 struct nvgpu_gpu_get_cpu_time_correlation_sample samples[
369 NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_MAX_COUNT];
370 /* number of pairs to read */
371 __u32 count;
372 /* cpu clock source id */
373 __u32 source_id;
374};
375
354#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \ 376#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
355 _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args) 377 _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
356#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \ 378#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -397,11 +419,13 @@ struct nvgpu_gpu_get_buffer_info_args {
397 _IO(NVGPU_GPU_IOCTL_MAGIC, 22) 419 _IO(NVGPU_GPU_IOCTL_MAGIC, 22)
398#define NVGPU_GPU_IOCTL_CLEAR_SM_ERRORS \ 420#define NVGPU_GPU_IOCTL_CLEAR_SM_ERRORS \
399 _IO(NVGPU_GPU_IOCTL_MAGIC, 23) 421 _IO(NVGPU_GPU_IOCTL_MAGIC, 23)
400 422#define NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO \
423 _IOWR(NVGPU_GPU_IOCTL_MAGIC, 24, \
424 struct nvgpu_gpu_get_cpu_time_correlation_info_args)
401#define NVGPU_GPU_IOCTL_LAST \ 425#define NVGPU_GPU_IOCTL_LAST \
402 _IOC_NR(NVGPU_GPU_IOCTL_CLEAR_SM_ERRORS) 426 _IOC_NR(NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO)
403#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \ 427#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
404 sizeof(struct nvgpu_gpu_prepare_compressible_read_args) 428 sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
405 429
406/* 430/*
407 * /dev/nvhost-tsg-gpu device 431 * /dev/nvhost-tsg-gpu device
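
Userspace reaches the new correlation ioctl through the gpu ctrl node; a minimal sketch (error handling elided; '/dev/nvhost-ctrl-gpu' is the usual ctrl device node, adjust if the platform differs):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

int sample_correlation(void)
{
	struct nvgpu_gpu_get_cpu_time_correlation_info_args args = {
		.count = 4,
		.source_id = NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TSC,
	};
	int fd = open("/dev/nvhost-ctrl-gpu", O_RDWR);

	if (fd < 0 || ioctl(fd, NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO, &args))
		return -1;

	/*
	 * args.samples[0..count-1] now hold paired CPU/GPU timestamps,
	 * enough to fit a linear cpu_time = a * gpu_time + b mapping for
	 * converting GPU-domain timestamps into the CPU time domain.
	 */
	return 0;
}
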
@@ -834,6 +858,34 @@ struct nvgpu_channel_wdt_args {
834#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1 858#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1
835#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2 859#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2
836 860
861/*
862 * Interleaving channels in a runlist is an approach to improve
863 * GPU scheduling by allowing certain channels to appear multiple
864 * times on the runlist, governed by its level. Below, L and M
865 * denote how many low- and medium-level channels are currently
866 * on the runlist:
867 *
868 * low (L) : appears once
869 * medium (M): appears L times if any low channels, else once
870 * high (H) : appears (M + 1) x L times if any low channels,
871 * else M times if any medium channels,
872 * else once
873 */
874struct nvgpu_runlist_interleave_args {
875 __u32 level;
876 __u32 reserved;
877};
878#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW 0
879#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_MEDIUM 1
880#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH 2
881#define NVGPU_RUNLIST_INTERLEAVE_NUM_LEVELS 3
882
883/* controls how long a channel occupies an engine uninterrupted */
884struct nvgpu_timeslice_args {
885 __u32 timeslice_us;
886 __u32 reserved;
887};
888
837#define NVGPU_IOCTL_CHANNEL_SET_NVMAP_FD \ 889#define NVGPU_IOCTL_CHANNEL_SET_NVMAP_FD \
838 _IOW(NVGPU_IOCTL_MAGIC, 5, struct nvgpu_set_nvmap_fd_args) 890 _IOW(NVGPU_IOCTL_MAGIC, 5, struct nvgpu_set_nvmap_fd_args)
839#define NVGPU_IOCTL_CHANNEL_SET_TIMEOUT \ 891#define NVGPU_IOCTL_CHANNEL_SET_TIMEOUT \
@@ -876,9 +928,13 @@ struct nvgpu_channel_wdt_args {
876 _IOWR(NVGPU_IOCTL_MAGIC, 118, struct nvgpu_cycle_stats_snapshot_args) 928 _IOWR(NVGPU_IOCTL_MAGIC, 118, struct nvgpu_cycle_stats_snapshot_args)
877#define NVGPU_IOCTL_CHANNEL_WDT \ 929#define NVGPU_IOCTL_CHANNEL_WDT \
878 _IOW(NVGPU_IOCTL_MAGIC, 119, struct nvgpu_channel_wdt_args) 930 _IOW(NVGPU_IOCTL_MAGIC, 119, struct nvgpu_channel_wdt_args)
931#define NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE \
932 _IOW(NVGPU_IOCTL_MAGIC, 120, struct nvgpu_runlist_interleave_args)
933#define NVGPU_IOCTL_CHANNEL_SET_TIMESLICE \
934 _IOW(NVGPU_IOCTL_MAGIC, 121, struct nvgpu_timeslice_args)
879 935
880#define NVGPU_IOCTL_CHANNEL_LAST \ 936#define NVGPU_IOCTL_CHANNEL_LAST \
881 _IOC_NR(NVGPU_IOCTL_CHANNEL_WDT) 937 _IOC_NR(NVGPU_IOCTL_CHANNEL_SET_TIMESLICE)
882#define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_submit_gpfifo_args) 938#define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_submit_gpfifo_args)
883 939
884/* 940/*
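
A short userspace sketch of the two new channel ioctls (ch_fd is an already-open channel fd; error handling elided). Per the interleave comment above, on a runlist holding 2 low and 3 medium channels, a channel raised to HIGH would then appear (3 + 1) x 2 = 8 times:

#include <sys/ioctl.h>
#include <linux/nvgpu.h>

void boost_channel_sketch(int ch_fd)
{
	struct nvgpu_runlist_interleave_args il = {
		.level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH,
	};
	struct nvgpu_timeslice_args ts = { .timeslice_us = 1000 };

	ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE, &il);
	ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_SET_TIMESLICE, &ts);
}
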
@@ -1159,4 +1215,94 @@ struct nvgpu_as_map_buffer_batch_args {
1159#define NVGPU_AS_IOCTL_MAX_ARG_SIZE \ 1215#define NVGPU_AS_IOCTL_MAX_ARG_SIZE \
1160 sizeof(struct nvgpu_as_map_buffer_ex_args) 1216 sizeof(struct nvgpu_as_map_buffer_ex_args)
1161 1217
1218
1219/*
1220 * /dev/nvhost-ctxsw-gpu device
1221 *
1222 * Opening a '/dev/nvhost-ctxsw-gpu' device node provides an
1223 * interface for tracing context switches on the GR engine.
1224 */
1225
1226#define NVGPU_CTXSW_IOCTL_MAGIC 'C'
1227
1228#define NVGPU_CTXSW_TAG_SOF 0x00
1229#define NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST 0x01
1230#define NVGPU_CTXSW_TAG_FE_ACK 0x02
1231#define NVGPU_CTXSW_TAG_FE_ACK_WFI 0x0a
1232#define NVGPU_CTXSW_TAG_FE_ACK_GFXP 0x0b
1233#define NVGPU_CTXSW_TAG_FE_ACK_CTAP 0x0c
1234#define NVGPU_CTXSW_TAG_FE_ACK_CILP 0x0d
1235#define NVGPU_CTXSW_TAG_SAVE_END 0x03
1236#define NVGPU_CTXSW_TAG_RESTORE_START 0x04
1237#define NVGPU_CTXSW_TAG_CONTEXT_START 0x05
1238#define NVGPU_CTXSW_TAG_INVALID_TIMESTAMP 0xff
1239#define NVGPU_CTXSW_TAG_LAST \
1240 NVGPU_CTXSW_TAG_INVALID_TIMESTAMP
1241
1242struct nvgpu_ctxsw_trace_entry {
1243 __u8 tag;
1244 __u8 vmid;
1245 __u16 seqno; /* sequence number to detect drops */
1246 __u32 context_id; /* context_id as allocated by FECS */
1247 __u64 pid; /* 64 bits: wide enough for any OS's pid type */
1248 __u64 timestamp; /* 64-bit time */
1249};
1250
1251#define NVGPU_CTXSW_RING_HEADER_MAGIC 0x7000fade
1252#define NVGPU_CTXSW_RING_HEADER_VERSION 0
1253
1254struct nvgpu_ctxsw_ring_header {
1255 __u32 magic;
1256 __u32 version;
1257 __u32 num_ents;
1258 __u32 ent_size;
1259 volatile __u32 drop_count; /* excluding filtered out events */
1260 volatile __u32 write_seqno;
1261 volatile __u32 write_idx;
1262 volatile __u32 read_idx;
1263};
1264
1265struct nvgpu_ctxsw_ring_setup_args {
1266 __u32 size; /* [in/out] size of ring buffer in bytes (including
1267 header); will be rounded up to the page size, and
1268 updated with the actual allocated size. */
1269};
1270
1271#define NVGPU_CTXSW_FILTER_SIZE (NVGPU_CTXSW_TAG_LAST + 1)
1272#define NVGPU_CTXSW_FILTER_SET(n, p) \
1273	((p)->tag_bits[(n) / 64] |= (1ULL << ((n) & 63)))
1274#define NVGPU_CTXSW_FILTER_CLR(n, p) \
1275	((p)->tag_bits[(n) / 64] &= ~(1ULL << ((n) & 63)))
1276#define NVGPU_CTXSW_FILTER_ISSET(n, p) \
1277	((p)->tag_bits[(n) / 64] & (1ULL << ((n) & 63)))
1278#define NVGPU_CTXSW_FILTER_CLR_ALL(p) memset((void *)(p), 0, sizeof(*(p)))
1279#define NVGPU_CTXSW_FILTER_SET_ALL(p) memset((void *)(p), ~0, sizeof(*(p)))
1280
1281struct nvgpu_ctxsw_trace_filter {
1282 __u64 tag_bits[(NVGPU_CTXSW_FILTER_SIZE + 63) / 64];
1283};
1284
1285struct nvgpu_ctxsw_trace_filter_args {
1286 struct nvgpu_ctxsw_trace_filter filter;
1287};
1288
1289#define NVGPU_CTXSW_IOCTL_TRACE_ENABLE \
1290 _IO(NVGPU_CTXSW_IOCTL_MAGIC, 1)
1291#define NVGPU_CTXSW_IOCTL_TRACE_DISABLE \
1292 _IO(NVGPU_CTXSW_IOCTL_MAGIC, 2)
1293#define NVGPU_CTXSW_IOCTL_RING_SETUP \
1294 _IOWR(NVGPU_CTXSW_IOCTL_MAGIC, 3, struct nvgpu_ctxsw_ring_setup_args)
1295#define NVGPU_CTXSW_IOCTL_SET_FILTER \
1296 _IOW(NVGPU_CTXSW_IOCTL_MAGIC, 4, struct nvgpu_ctxsw_trace_filter_args)
1297#define NVGPU_CTXSW_IOCTL_GET_FILTER \
1298 _IOR(NVGPU_CTXSW_IOCTL_MAGIC, 5, struct nvgpu_ctxsw_trace_filter_args)
1299#define NVGPU_CTXSW_IOCTL_POLL \
1300 _IO(NVGPU_CTXSW_IOCTL_MAGIC, 6)
1301
1302#define NVGPU_CTXSW_IOCTL_LAST \
1303 _IOC_NR(NVGPU_CTXSW_IOCTL_POLL)
1304
1305#define NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE \
1306 sizeof(struct nvgpu_ctxsw_trace_filter_args)
1307
1162#endif 1308#endif
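
Putting the ctxsw uapi together, a consumer sketch (not part of the patch): it assumes the ring - header followed by num_ents entries - is mapped with mmap(2) on the same fd after RING_SETUP, and that ent_size equals sizeof(struct nvgpu_ctxsw_trace_entry); neither is shown in this diff, so treat both as assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/nvgpu.h>

int main(void)
{
	struct nvgpu_ctxsw_ring_setup_args setup = { .size = 64 * 1024 };
	struct nvgpu_ctxsw_ring_header *hdr;
	struct nvgpu_ctxsw_trace_entry *ents;
	int fd = open("/dev/nvhost-ctxsw-gpu", O_RDWR);

	if (fd < 0 || ioctl(fd, NVGPU_CTXSW_IOCTL_RING_SETUP, &setup))
		return 1;

	hdr = mmap(NULL, setup.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (hdr == MAP_FAILED)
		return 1;
	ents = (struct nvgpu_ctxsw_trace_entry *)(hdr + 1);

	ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_ENABLE, 0);
	for (;;) {
		ioctl(fd, NVGPU_CTXSW_IOCTL_POLL, 0); /* flush pending FECS records */
		while (hdr->read_idx != hdr->write_idx) {
			struct nvgpu_ctxsw_trace_entry *e = &ents[hdr->read_idx];

			printf("tag=0x%02x pid=%llu ts=%llu\n", e->tag,
			       (unsigned long long)e->pid,
			       (unsigned long long)e->timestamp);
			hdr->read_idx = (hdr->read_idx + 1) % hdr->num_ents;
		}
	}
}
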