summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/nvgpu/Makefile.nvgpu14
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug.c (renamed from drivers/gpu/nvgpu/gk20a/debug_gk20a.c)167
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_allocator.c80
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_allocator.h21
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_cde.c51
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_cde.h21
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_ce.c30
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_ce.h (renamed from drivers/gpu/nvgpu/gm20b/debug_gm20b.h)15
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_fifo.c369
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_fifo.h22
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_gr.c31
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_gr.h (renamed from drivers/gpu/nvgpu/gm20b/debug_gm20b.c)14
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_kmem.c315
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_kmem.h23
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_mm.c26
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_mm.h21
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_pmu.c479
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_pmu.h21
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_sched.c79
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_sched.h21
-rw-r--r--drivers/gpu/nvgpu/common/linux/driver_common.c3
-rw-r--r--drivers/gpu/nvgpu/common/linux/ioctl_channel.c2
-rw-r--r--drivers/gpu/nvgpu/common/linux/kmem.c323
-rw-r--r--drivers/gpu/nvgpu/common/linux/kmem_priv.h8
-rw-r--r--drivers/gpu/nvgpu/common/linux/module.c6
-rw-r--r--drivers/gpu/nvgpu/common/mm/bitmap_allocator.c2
-rw-r--r--drivers/gpu/nvgpu/common/mm/buddy_allocator.c4
-rw-r--r--drivers/gpu/nvgpu/common/mm/lockless_allocator.c4
-rw-r--r--drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c70
-rw-r--r--drivers/gpu/nvgpu/common/mm/page_allocator.c2
-rw-r--r--drivers/gpu/nvgpu/gk20a/cde_gk20a.c45
-rw-r--r--drivers/gpu/nvgpu/gk20a/cde_gk20a.h1
-rw-r--r--drivers/gpu/nvgpu/gk20a/ce2_gk20a.c24
-rw-r--r--drivers/gpu/nvgpu/gk20a/ce2_gk20a.h6
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c10
-rw-r--r--drivers/gpu/nvgpu/gk20a/fifo_gk20a.c414
-rw-r--r--drivers/gpu/nvgpu/gk20a/fifo_gk20a.h7
-rw-r--r--drivers/gpu/nvgpu/gk20a/gk20a.h6
-rw-r--r--drivers/gpu/nvgpu/gk20a/gr_gk20a.c22
-rw-r--r--drivers/gpu/nvgpu/gk20a/gr_gk20a.h1
-rw-r--r--drivers/gpu/nvgpu/gk20a/hal_gk20a.c1
-rw-r--r--drivers/gpu/nvgpu/gk20a/mm_gk20a.c16
-rw-r--r--drivers/gpu/nvgpu/gk20a/mm_gk20a.h3
-rw-r--r--drivers/gpu/nvgpu/gk20a/pmu_gk20a.c487
-rw-r--r--drivers/gpu/nvgpu/gk20a/pmu_gk20a.h7
-rw-r--r--drivers/gpu/nvgpu/gk20a/sched_gk20a.c67
-rw-r--r--drivers/gpu/nvgpu/gk20a/sched_gk20a.h1
-rw-r--r--drivers/gpu/nvgpu/gm20b/gr_gm20b.c1
-rw-r--r--drivers/gpu/nvgpu/gm20b/hal_gm20b.c4
-rw-r--r--drivers/gpu/nvgpu/gp106/hal_gp106.c1
-rw-r--r--drivers/gpu/nvgpu/gp10b/gr_gp10b.c1
-rw-r--r--drivers/gpu/nvgpu/gp10b/hal_gp10b.c1
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/allocator.h7
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/debug.h (renamed from drivers/gpu/nvgpu/gk20a/debug_gk20a.h)32
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h6
-rw-r--r--drivers/gpu/nvgpu/vgpu/vgpu.c4
56 files changed, 1851 insertions, 1568 deletions
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index e7ea3c5d..4b6a8e87 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -66,7 +66,6 @@ nvgpu-y := \
66 gk20a/fifo_gk20a.o \ 66 gk20a/fifo_gk20a.o \
67 gk20a/channel_gk20a.o \ 67 gk20a/channel_gk20a.o \
68 gk20a/channel_sync_gk20a.o \ 68 gk20a/channel_sync_gk20a.o \
69 gk20a/debug_gk20a.o \
70 gk20a/dbg_gpu_gk20a.o \ 69 gk20a/dbg_gpu_gk20a.o \
71 gk20a/regops_gk20a.o \ 70 gk20a/regops_gk20a.o \
72 gk20a/gr_gk20a.o \ 71 gk20a/gr_gk20a.o \
@@ -107,7 +106,6 @@ nvgpu-y := \
107 gm20b/mm_gm20b.o \ 106 gm20b/mm_gm20b.o \
108 gm20b/regops_gm20b.o \ 107 gm20b/regops_gm20b.o \
109 gm20b/mc_gm20b.o \ 108 gm20b/mc_gm20b.o \
110 gm20b/debug_gm20b.o \
111 gm20b/cde_gm20b.o \ 109 gm20b/cde_gm20b.o \
112 gm20b/therm_gm20b.o \ 110 gm20b/therm_gm20b.o \
113 gm206/bios_gm206.o \ 111 gm206/bios_gm206.o \
@@ -117,6 +115,18 @@ nvgpu-y := \
117 boardobj/boardobjgrp_e255.o \ 115 boardobj/boardobjgrp_e255.o \
118 boardobj/boardobjgrp_e32.o 116 boardobj/boardobjgrp_e32.o
119 117
118nvgpu-$(CONFIG_DEBUG_FS) += \
119 common/linux/debug.o \
120 common/linux/debug_gr.o \
121 common/linux/debug_fifo.o \
122 common/linux/debug_cde.o \
123 common/linux/debug_ce.o \
124 common/linux/debug_pmu.o \
125 common/linux/debug_sched.o \
126 common/linux/debug_mm.o \
127 common/linux/debug_allocator.o \
128 common/linux/debug_kmem.o
129
120nvgpu-$(CONFIG_TEGRA_GK20A) += tegra/linux/platform_gk20a_tegra.o 130nvgpu-$(CONFIG_TEGRA_GK20A) += tegra/linux/platform_gk20a_tegra.o
121nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o 131nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o
122nvgpu-$(CONFIG_GK20A_PCI) += common/linux/pci.o 132nvgpu-$(CONFIG_GK20A_PCI) += common/linux/pci.o
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/common/linux/debug.c
index ac435046..2962a467 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
+++ b/drivers/gpu/nvgpu/common/linux/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011-2017 NVIDIA Corporation. All rights reserved. 2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 * 3 *
4 * This software is licensed under the terms of the GNU General Public 4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and 5 * License version 2, as published by the Free Software Foundation, and
@@ -12,26 +12,23 @@
12 * 12 *
13 */ 13 */
14 14
15#ifdef CONFIG_DEBUG_FS 15#include "debug_cde.h"
16#include "debug_ce.h"
17#include "debug_fifo.h"
18#include "debug_gr.h"
19#include "debug_mm.h"
20#include "debug_allocator.h"
21#include "debug_kmem.h"
22#include "debug_pmu.h"
23#include "debug_sched.h"
24
25#include "gk20a/gk20a.h"
26#include "gk20a/platform_gk20a.h"
27
16#include <linux/debugfs.h> 28#include <linux/debugfs.h>
17#endif
18#include <linux/seq_file.h> 29#include <linux/seq_file.h>
19#include <linux/io.h>
20#include <linux/fs.h>
21
22#include <nvgpu/log.h>
23#include <nvgpu/kmem.h>
24#include <nvgpu/semaphore.h>
25#include <nvgpu/log.h>
26
27#include "gk20a.h"
28#include "gk20a/platform_gk20a.h"
29#include "debug_gk20a.h"
30 30
31#include <nvgpu/hw/gk20a/hw_ram_gk20a.h> 31#include <nvgpu/debug.h>
32#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
33#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
34#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
35 32
36unsigned int gk20a_debug_trace_cmdbuf; 33unsigned int gk20a_debug_trace_cmdbuf;
37 34
@@ -59,81 +56,22 @@ void gk20a_debug_output(struct gk20a_debug_output *o,
59 o->fn(o->ctx, o->buf, len); 56 o->fn(o->ctx, o->buf, len);
60} 57}
61 58
62static void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g, 59static int gk20a_gr_dump_regs(struct gk20a *g,
63 struct gk20a_debug_output *o)
64{
65 struct fifo_gk20a *f = &g->fifo;
66 u32 chid;
67 struct ch_state **ch_state;
68
69 ch_state = nvgpu_kzalloc(g, sizeof(*ch_state) * f->num_channels);
70 if (!ch_state) {
71 gk20a_debug_output(o, "cannot alloc memory for channels\n");
72 return;
73 }
74
75 for (chid = 0; chid < f->num_channels; chid++) {
76 struct channel_gk20a *ch = &f->channel[chid];
77 if (gk20a_channel_get(ch)) {
78 ch_state[chid] =
79 nvgpu_kmalloc(g, sizeof(struct ch_state) +
80 ram_in_alloc_size_v());
81 /* ref taken stays to below loop with
82 * successful allocs */
83 if (!ch_state[chid])
84 gk20a_channel_put(ch);
85 }
86 }
87
88 for (chid = 0; chid < f->num_channels; chid++) {
89 struct channel_gk20a *ch = &f->channel[chid];
90 if (!ch_state[chid])
91 continue;
92
93 ch_state[chid]->pid = ch->pid;
94 ch_state[chid]->refs = atomic_read(&ch->ref_count);
95 nvgpu_mem_rd_n(g, &ch->inst_block, 0,
96 &ch_state[chid]->inst_block[0],
97 ram_in_alloc_size_v());
98 gk20a_channel_put(ch);
99 }
100 for (chid = 0; chid < f->num_channels; chid++) {
101 if (ch_state[chid]) {
102 g->ops.fifo.dump_channel_status_ramfc(g, o, chid,
103 ch_state[chid]);
104 nvgpu_kfree(g, ch_state[chid]);
105 }
106 }
107 nvgpu_kfree(g, ch_state);
108}
109
110void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o)
111{
112 g->ops.fifo.dump_pbdma_status(g, o);
113 g->ops.fifo.dump_eng_status(g, o);
114
115 gk20a_debug_dump_all_channel_status_ramfc(g, o);
116}
117
118static int gk20a_gr_dump_regs(struct device *dev,
119 struct gk20a_debug_output *o) 60 struct gk20a_debug_output *o)
120{ 61{
121 struct gk20a_platform *platform = gk20a_get_platform(dev);
122 struct gk20a *g = platform->g;
123
124 if (g->ops.gr.dump_gr_regs) 62 if (g->ops.gr.dump_gr_regs)
125 gr_gk20a_elpg_protected_call(g, g->ops.gr.dump_gr_regs(g, o)); 63 gr_gk20a_elpg_protected_call(g, g->ops.gr.dump_gr_regs(g, o));
126 64
127 return 0; 65 return 0;
128} 66}
129 67
130int gk20a_gr_debug_dump(struct device *dev) 68int gk20a_gr_debug_dump(struct gk20a *g)
131{ 69{
132 struct gk20a_debug_output o = { 70 struct gk20a_debug_output o = {
133 .fn = gk20a_debug_write_printk 71 .fn = gk20a_debug_write_printk
134 }; 72 };
135 73
136 gk20a_gr_dump_regs(dev, &o); 74 gk20a_gr_dump_regs(g, &o);
137 75
138 return 0; 76 return 0;
139} 77}
@@ -154,23 +92,22 @@ static int gk20a_gr_debug_show(struct seq_file *s, void *unused)
154 return -EINVAL; 92 return -EINVAL;
155 } 93 }
156 94
157 gk20a_gr_dump_regs(dev, &o); 95 gk20a_gr_dump_regs(g, &o);
158 96
159 gk20a_idle(g); 97 gk20a_idle(g);
160 98
161 return 0; 99 return 0;
162} 100}
163 101
164void gk20a_debug_dump(struct device *dev) 102void gk20a_debug_dump(struct gk20a *g)
165{ 103{
166 struct gk20a_platform *platform = gk20a_get_platform(dev); 104 struct gk20a_platform *platform = gk20a_get_platform(g->dev);
167 struct gk20a *g = platform->g;
168 struct gk20a_debug_output o = { 105 struct gk20a_debug_output o = {
169 .fn = gk20a_debug_write_printk 106 .fn = gk20a_debug_write_printk
170 }; 107 };
171 108
172 if (platform->dump_platform_dependencies) 109 if (platform->dump_platform_dependencies)
173 platform->dump_platform_dependencies(dev); 110 platform->dump_platform_dependencies(g->dev);
174 111
175 /* HAL only initialized after 1st power-on */ 112 /* HAL only initialized after 1st power-on */
176 if (g->ops.debug.show_dump) 113 if (g->ops.debug.show_dump)
@@ -227,22 +164,28 @@ static const struct file_operations gk20a_debug_fops = {
227 .release = single_release, 164 .release = single_release,
228}; 165};
229 166
167void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o)
168{
169 g->ops.fifo.dump_pbdma_status(g, o);
170 g->ops.fifo.dump_eng_status(g, o);
171
172 gk20a_debug_dump_all_channel_status_ramfc(g, o);
173}
174
230void gk20a_init_debug_ops(struct gpu_ops *gops) 175void gk20a_init_debug_ops(struct gpu_ops *gops)
231{ 176{
232 gops->debug.show_dump = gk20a_debug_show_dump; 177 gops->debug.show_dump = gk20a_debug_show_dump;
233} 178}
234 179
235#ifdef CONFIG_DEBUG_FS
236static int railgate_residency_show(struct seq_file *s, void *data) 180static int railgate_residency_show(struct seq_file *s, void *data)
237{ 181{
238 struct device *dev = s->private; 182 struct gk20a *g = s->private;
239 struct gk20a_platform *platform = dev_get_drvdata(dev); 183 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
240 struct gk20a *g = get_gk20a(dev);
241 unsigned long time_since_last_state_transition_ms; 184 unsigned long time_since_last_state_transition_ms;
242 unsigned long total_rail_gate_time_ms; 185 unsigned long total_rail_gate_time_ms;
243 unsigned long total_rail_ungate_time_ms; 186 unsigned long total_rail_ungate_time_ms;
244 187
245 if (platform->is_railgated(dev)) { 188 if (platform->is_railgated(g->dev)) {
246 time_since_last_state_transition_ms = 189 time_since_last_state_transition_ms =
247 jiffies_to_msecs(jiffies - 190 jiffies_to_msecs(jiffies -
248 g->pstats.last_rail_gate_complete); 191 g->pstats.last_rail_gate_complete);
@@ -282,30 +225,27 @@ static const struct file_operations railgate_residency_fops = {
282 .release = single_release, 225 .release = single_release,
283}; 226};
284 227
285int gk20a_railgating_debugfs_init(struct device *dev) 228static int gk20a_railgating_debugfs_init(struct gk20a *g)
286{ 229{
230 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
287 struct dentry *d; 231 struct dentry *d;
288 struct gk20a_platform *platform = dev_get_drvdata(dev);
289 struct gk20a *g = get_gk20a(dev);
290 232
291 if (!g->can_railgate) 233 if (!g->can_railgate)
292 return 0; 234 return 0;
293 235
294 d = debugfs_create_file( 236 d = debugfs_create_file(
295 "railgate_residency", S_IRUGO|S_IWUSR, platform->debugfs, dev, 237 "railgate_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
296 &railgate_residency_fops); 238 &railgate_residency_fops);
297 if (!d) 239 if (!d)
298 return -ENOMEM; 240 return -ENOMEM;
299 241
300 return 0; 242 return 0;
301} 243}
302#endif
303 244
304void gk20a_debug_init(struct device *dev, const char *debugfs_symlink) 245void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink)
305{ 246{
306#ifdef CONFIG_DEBUG_FS 247 struct device *dev = g->dev;
307 struct gk20a_platform *platform = dev_get_drvdata(dev); 248 struct gk20a_platform *platform = dev_get_drvdata(dev);
308 struct gk20a *g = platform->g;
309 249
310 platform->debugfs = debugfs_create_dir(dev_name(dev), NULL); 250 platform->debugfs = debugfs_create_dir(dev_name(dev), NULL);
311 if (!platform->debugfs) 251 if (!platform->debugfs)
@@ -409,17 +349,28 @@ void gk20a_debug_init(struct device *dev, const char *debugfs_symlink)
409#endif 349#endif
410 350
411 gr_gk20a_debugfs_init(g); 351 gr_gk20a_debugfs_init(g);
412 gk20a_pmu_debugfs_init(g->dev); 352 gk20a_pmu_debugfs_init(g);
413 gk20a_railgating_debugfs_init(g->dev); 353 gk20a_railgating_debugfs_init(g);
414 gk20a_cde_debugfs_init(g->dev); 354 gk20a_cde_debugfs_init(g);
415 gk20a_ce_debugfs_init(g->dev); 355 gk20a_ce_debugfs_init(g);
416 nvgpu_alloc_debugfs_init(g->dev); 356 nvgpu_alloc_debugfs_init(g);
417 gk20a_mm_debugfs_init(g->dev); 357 gk20a_mm_debugfs_init(g);
418 gk20a_fifo_debugfs_init(g->dev); 358 gk20a_fifo_debugfs_init(g);
419 gk20a_sched_debugfs_init(g->dev); 359 gk20a_sched_debugfs_init(g);
420#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE 360#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
421 nvgpu_kmem_debugfs_init(g->dev); 361 nvgpu_kmem_debugfs_init(g);
422#endif
423#endif 362#endif
363}
364
365void gk20a_debug_deinit(struct gk20a *g)
366{
367 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
368
369 if (!platform->debugfs)
370 return;
371
372 gk20a_fifo_debugfs_deinit(g);
424 373
374 debugfs_remove_recursive(platform->debugfs);
375 debugfs_remove_recursive(platform->debugfs_alias);
425} 376}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_allocator.c b/drivers/gpu/nvgpu/common/linux/debug_allocator.c
new file mode 100644
index 00000000..3d4a2bb2
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_allocator.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_allocator.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19#include <linux/seq_file.h>
20
21#include <nvgpu/allocator.h>
22
23u32 nvgpu_alloc_tracing_on;
24
25void nvgpu_alloc_print_stats(struct nvgpu_allocator *__a,
26 struct seq_file *s, int lock)
27{
28 __a->ops->print_stats(__a, s, lock);
29}
30
31static int __alloc_show(struct seq_file *s, void *unused)
32{
33 struct nvgpu_allocator *a = s->private;
34
35 nvgpu_alloc_print_stats(a, s, 1);
36
37 return 0;
38}
39
40static int __alloc_open(struct inode *inode, struct file *file)
41{
42 return single_open(file, __alloc_show, inode->i_private);
43}
44
45static const struct file_operations __alloc_fops = {
46 .open = __alloc_open,
47 .read = seq_read,
48 .llseek = seq_lseek,
49 .release = single_release,
50};
51
52void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a)
53{
54 if (!g->debugfs_allocators)
55 return;
56
57 a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO,
58 g->debugfs_allocators,
59 a, &__alloc_fops);
60}
61
62void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a)
63{
64 if (!IS_ERR_OR_NULL(a->debugfs_entry))
65 debugfs_remove(a->debugfs_entry);
66}
67
68void nvgpu_alloc_debugfs_init(struct gk20a *g)
69{
70 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
71
72 g->debugfs_allocators = debugfs_create_dir("allocators", platform->debugfs);
73 if (IS_ERR_OR_NULL(g->debugfs_allocators)) {
74 g->debugfs_allocators = NULL;
75 return;
76 }
77
78 debugfs_create_u32("tracing", 0664, g->debugfs_allocators,
79 &nvgpu_alloc_tracing_on);
80}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_allocator.h b/drivers/gpu/nvgpu/common/linux/debug_allocator.h
new file mode 100644
index 00000000..1b21cfc5
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_allocator.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_ALLOCATOR_H__
16#define __NVGPU_DEBUG_ALLOCATOR_H__
17
18struct gk20a;
19void nvgpu_alloc_debugfs_init(struct gk20a *g);
20
21#endif /* __NVGPU_DEBUG_ALLOCATOR_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_cde.c b/drivers/gpu/nvgpu/common/linux/debug_cde.c
new file mode 100644
index 00000000..eb7c33e2
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_cde.c
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_cde.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19
20
21static ssize_t gk20a_cde_reload_write(struct file *file,
22 const char __user *userbuf, size_t count, loff_t *ppos)
23{
24 struct gk20a *g = file->private_data;
25 gk20a_cde_reload(g);
26 return count;
27}
28
29static const struct file_operations gk20a_cde_reload_fops = {
30 .open = simple_open,
31 .write = gk20a_cde_reload_write,
32};
33
34void gk20a_cde_debugfs_init(struct gk20a *g)
35{
36 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
37
38 if (!platform->has_cde)
39 return;
40
41 debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO,
42 platform->debugfs, &g->cde_app.shader_parameter);
43 debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO,
44 platform->debugfs, &g->cde_app.ctx_count);
45 debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO,
46 platform->debugfs, &g->cde_app.ctx_usecount);
47 debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO,
48 platform->debugfs, &g->cde_app.ctx_count_top);
49 debugfs_create_file("reload_cde_firmware", S_IWUSR, platform->debugfs,
50 g, &gk20a_cde_reload_fops);
51}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_cde.h b/drivers/gpu/nvgpu/common/linux/debug_cde.h
new file mode 100644
index 00000000..4895edd6
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_cde.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_CDE_H__
16#define __NVGPU_DEBUG_CDE_H__
17
18struct gk20a;
19void gk20a_cde_debugfs_init(struct gk20a *g);
20
21#endif /* __NVGPU_DEBUG_CDE_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_ce.c b/drivers/gpu/nvgpu/common/linux/debug_ce.c
new file mode 100644
index 00000000..9c50870e
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_ce.c
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_ce.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19
20void gk20a_ce_debugfs_init(struct gk20a *g)
21{
22 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
23
24 debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
25 platform->debugfs, &g->ce_app.ctx_count);
26 debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
27 platform->debugfs, &g->ce_app.app_state);
28 debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
29 platform->debugfs, &g->ce_app.next_ctx_id);
30}
diff --git a/drivers/gpu/nvgpu/gm20b/debug_gm20b.h b/drivers/gpu/nvgpu/common/linux/debug_ce.h
index c3c5fed6..2a8750c4 100644
--- a/drivers/gpu/nvgpu/gm20b/debug_gm20b.h
+++ b/drivers/gpu/nvgpu/common/linux/debug_ce.h
@@ -1,7 +1,5 @@
1/* 1/*
2 * GM20B Debug functionality 2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * Copyright (C) 2015 NVIDIA CORPORATION. All rights reserved.
5 * 3 *
6 * This software is licensed under the terms of the GNU General Public 4 * This software is licensed under the terms of the GNU General Public
7 * License version 2, as published by the Free Software Foundation, and 5 * License version 2, as published by the Free Software Foundation, and
@@ -14,11 +12,10 @@
14 * 12 *
15 */ 13 */
16 14
17#ifndef _DEBUG_GM20B_H_ 15#ifndef __NVGPU_DEBUG_CE_H__
18#define _DEBUG_GM20B_H_ 16#define __NVGPU_DEBUG_CE_H__
19
20struct gpu_ops;
21 17
22void gm20b_init_debug_ops(struct gpu_ops *gops); 18struct gk20a;
19void gk20a_ce_debugfs_init(struct gk20a *g);
23 20
24#endif 21#endif /* __NVGPU_DEBUG_CE_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_fifo.c b/drivers/gpu/nvgpu/common/linux/debug_fifo.c
new file mode 100644
index 00000000..6a28b1a5
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_fifo.c
@@ -0,0 +1,369 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_fifo.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19#include <linux/seq_file.h>
20
21#include <nvgpu/sort.h>
22
23void __gk20a_fifo_profile_free(struct kref *ref);
24
25static void *gk20a_fifo_sched_debugfs_seq_start(
26 struct seq_file *s, loff_t *pos)
27{
28 struct gk20a *g = s->private;
29 struct fifo_gk20a *f = &g->fifo;
30
31 if (*pos >= f->num_channels)
32 return NULL;
33
34 return &f->channel[*pos];
35}
36
37static void *gk20a_fifo_sched_debugfs_seq_next(
38 struct seq_file *s, void *v, loff_t *pos)
39{
40 struct gk20a *g = s->private;
41 struct fifo_gk20a *f = &g->fifo;
42
43 ++(*pos);
44 if (*pos >= f->num_channels)
45 return NULL;
46
47 return &f->channel[*pos];
48}
49
50static void gk20a_fifo_sched_debugfs_seq_stop(
51 struct seq_file *s, void *v)
52{
53}
54
55static int gk20a_fifo_sched_debugfs_seq_show(
56 struct seq_file *s, void *v)
57{
58 struct gk20a *g = s->private;
59 struct fifo_gk20a *f = &g->fifo;
60 struct channel_gk20a *ch = v;
61 struct tsg_gk20a *tsg = NULL;
62
63 struct fifo_engine_info_gk20a *engine_info;
64 struct fifo_runlist_info_gk20a *runlist;
65 u32 runlist_id;
66 int ret = SEQ_SKIP;
67 u32 engine_id;
68
69 engine_id = gk20a_fifo_get_gr_engine_id(g);
70 engine_info = (f->engine_info + engine_id);
71 runlist_id = engine_info->runlist_id;
72 runlist = &f->runlist_info[runlist_id];
73
74 if (ch == f->channel) {
75 seq_puts(s, "chid tsgid pid timeslice timeout interleave graphics_preempt compute_preempt\n");
76 seq_puts(s, " (usecs) (msecs)\n");
77 ret = 0;
78 }
79
80 if (!test_bit(ch->hw_chid, runlist->active_channels))
81 return ret;
82
83 if (gk20a_channel_get(ch)) {
84 if (gk20a_is_channel_marked_as_tsg(ch))
85 tsg = &f->tsg[ch->tsgid];
86
87 seq_printf(s, "%-8d %-8d %-8d %-9d %-8d %-10d %-8d %-8d\n",
88 ch->hw_chid,
89 ch->tsgid,
90 ch->tgid,
91 tsg ? tsg->timeslice_us : ch->timeslice_us,
92 ch->timeout_ms_max,
93 tsg ? tsg->interleave_level : ch->interleave_level,
94 ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->graphics_preempt_mode : U32_MAX,
95 ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->compute_preempt_mode : U32_MAX);
96 gk20a_channel_put(ch);
97 }
98 return 0;
99}
100
101static const struct seq_operations gk20a_fifo_sched_debugfs_seq_ops = {
102 .start = gk20a_fifo_sched_debugfs_seq_start,
103 .next = gk20a_fifo_sched_debugfs_seq_next,
104 .stop = gk20a_fifo_sched_debugfs_seq_stop,
105 .show = gk20a_fifo_sched_debugfs_seq_show
106};
107
108static int gk20a_fifo_sched_debugfs_open(struct inode *inode,
109 struct file *file)
110{
111 int err;
112
113 if (!capable(CAP_SYS_ADMIN))
114 return -EPERM;
115
116 err = seq_open(file, &gk20a_fifo_sched_debugfs_seq_ops);
117 if (err)
118 return err;
119
120 gk20a_dbg(gpu_dbg_info, "i_private=%p", inode->i_private);
121
122 ((struct seq_file *)file->private_data)->private = inode->i_private;
123 return 0;
124};
125
126/*
127 * The file operations structure contains our open function along with
128 * set of the canned seq_ ops.
129 */
130static const struct file_operations gk20a_fifo_sched_debugfs_fops = {
131 .owner = THIS_MODULE,
132 .open = gk20a_fifo_sched_debugfs_open,
133 .read = seq_read,
134 .llseek = seq_lseek,
135 .release = seq_release
136};
137
138static int gk20a_fifo_profile_enable(void *data, u64 val)
139{
140 struct gk20a *g = (struct gk20a *) data;
141 struct fifo_gk20a *f = &g->fifo;
142
143
144 nvgpu_mutex_acquire(&f->profile.lock);
145 if (val == 0) {
146 if (f->profile.enabled) {
147 f->profile.enabled = false;
148 kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
149 }
150 } else {
151 if (!f->profile.enabled) {
152 /* not kref init as it can have a running condition if
153 * we enable/disable/enable while kickoff is happening
154 */
155 if (!kref_get_unless_zero(&f->profile.ref)) {
156 f->profile.data = vzalloc(
157 FIFO_PROFILING_ENTRIES *
158 sizeof(struct fifo_profile_gk20a));
159 f->profile.sorted = vzalloc(
160 FIFO_PROFILING_ENTRIES *
161 sizeof(u64));
162 if (!(f->profile.data && f->profile.sorted)) {
163 nvgpu_vfree(g, f->profile.data);
164 nvgpu_vfree(g, f->profile.sorted);
165 nvgpu_mutex_release(&f->profile.lock);
166 return -ENOMEM;
167 }
168 kref_init(&f->profile.ref);
169 }
170 atomic_set(&f->profile.get, 0);
171 f->profile.enabled = true;
172 }
173 }
174 nvgpu_mutex_release(&f->profile.lock);
175
176 return 0;
177}
178
179DEFINE_SIMPLE_ATTRIBUTE(
180 gk20a_fifo_profile_enable_debugfs_fops,
181 NULL,
182 gk20a_fifo_profile_enable,
183 "%llu\n"
184);
185
186static int __profile_cmp(const void *a, const void *b)
187{
188 return *((unsigned long long *) a) - *((unsigned long long *) b);
189}
190
191/*
192 * This uses about 800b in the stack, but the function using it is not part
193 * of a callstack where much memory is being used, so it is fine
194 */
195#define PERCENTILE_WIDTH 5
196#define PERCENTILE_RANGES (100/PERCENTILE_WIDTH)
197
198static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
199 u64 *percentiles, u32 index_end, u32 index_start)
200{
201 unsigned int nelem = 0;
202 unsigned int index;
203 struct fifo_profile_gk20a *profile;
204
205 for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
206 profile = &g->fifo.profile.data[index];
207
208 if (profile->timestamp[index_end] >
209 profile->timestamp[index_start]) {
210 /* This is a valid element */
211 g->fifo.profile.sorted[nelem] =
212 profile->timestamp[index_end] -
213 profile->timestamp[index_start];
214 nelem++;
215 }
216 }
217
218 /* sort it */
219 sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
220 __profile_cmp, NULL);
221
222 /* build ranges */
223 for (index = 0; index < PERCENTILE_RANGES; index++)
224 percentiles[index] =
225 g->fifo.profile.sorted[(PERCENTILE_WIDTH * (index + 1) *
226 nelem)/100 - 1];
227 return nelem;
228}
229
230static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
231{
232 struct gk20a *g = s->private;
233 unsigned int get, nelem, index;
234 /*
235 * 800B in the stack, but function is declared statically and only
236 * called from debugfs handler
237 */
238 u64 percentiles_ioctl[PERCENTILE_RANGES];
239 u64 percentiles_kickoff[PERCENTILE_RANGES];
240 u64 percentiles_jobtracking[PERCENTILE_RANGES];
241 u64 percentiles_append[PERCENTILE_RANGES];
242 u64 percentiles_userd[PERCENTILE_RANGES];
243
244 if (!kref_get_unless_zero(&g->fifo.profile.ref)) {
245 seq_printf(s, "Profiling disabled\n");
246 return 0;
247 }
248
249 get = atomic_read(&g->fifo.profile.get);
250
251 __gk20a_fifo_create_stats(g, percentiles_ioctl,
252 PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
253 __gk20a_fifo_create_stats(g, percentiles_kickoff,
254 PROFILE_END, PROFILE_ENTRY);
255 __gk20a_fifo_create_stats(g, percentiles_jobtracking,
256 PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
257 __gk20a_fifo_create_stats(g, percentiles_append,
258 PROFILE_APPEND, PROFILE_JOB_TRACKING);
259 nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
260 PROFILE_END, PROFILE_APPEND);
261
262 seq_printf(s, "Number of kickoffs: %d\n", nelem);
263 seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
264
265 for (index = 0; index < PERCENTILE_RANGES; index++)
266 seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
267 PERCENTILE_WIDTH * (index+1),
268 percentiles_ioctl[index],
269 percentiles_kickoff[index],
270 percentiles_append[index],
271 percentiles_jobtracking[index],
272 percentiles_userd[index]);
273
274 kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
275
276 return 0;
277}
278
279static int gk20a_fifo_profile_stats_open(struct inode *inode, struct file *file)
280{
281 return single_open(file, gk20a_fifo_profile_stats, inode->i_private);
282}
283
284static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
285 .open = gk20a_fifo_profile_stats_open,
286 .read = seq_read,
287 .llseek = seq_lseek,
288 .release = single_release,
289};
290
291
292void gk20a_fifo_debugfs_init(struct gk20a *g)
293{
294 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
295
296 struct dentry *gpu_root = platform->debugfs;
297 struct dentry *fifo_root;
298 struct dentry *profile_root;
299
300 fifo_root = debugfs_create_dir("fifo", gpu_root);
301 if (IS_ERR_OR_NULL(fifo_root))
302 return;
303
304 gk20a_dbg(gpu_dbg_info, "g=%p", g);
305
306 debugfs_create_file("sched", 0600, fifo_root, g,
307 &gk20a_fifo_sched_debugfs_fops);
308
309 profile_root = debugfs_create_dir("profile", fifo_root);
310 if (IS_ERR_OR_NULL(profile_root))
311 return;
312
313 nvgpu_mutex_init(&g->fifo.profile.lock);
314 g->fifo.profile.enabled = false;
315 atomic_set(&g->fifo.profile.get, 0);
316 atomic_set(&g->fifo.profile.ref.refcount, 0);
317
318 debugfs_create_file("enable", 0600, profile_root, g,
319 &gk20a_fifo_profile_enable_debugfs_fops);
320
321 debugfs_create_file("stats", 0600, profile_root, g,
322 &gk20a_fifo_profile_stats_debugfs_fops);
323
324}
325
326void __gk20a_fifo_profile_free(struct kref *ref)
327{
328 struct fifo_gk20a *f = container_of(ref, struct fifo_gk20a,
329 profile.ref);
330 nvgpu_vfree(f->g, f->profile.data);
331 nvgpu_vfree(f->g, f->profile.sorted);
332}
333
334/* Get the next element in the ring buffer of profile entries
335 * and grab a reference to the structure
336 */
337struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g)
338{
339 struct fifo_gk20a *f = &g->fifo;
340 struct fifo_profile_gk20a *profile;
341 unsigned int index;
342
343 /* If kref is zero, profiling is not enabled */
344 if (!kref_get_unless_zero(&f->profile.ref))
345 return NULL;
346 index = atomic_inc_return(&f->profile.get);
347 profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
348
349 return profile;
350}
351
352/* Free the reference to the structure. This allows deferred cleanups */
353void gk20a_fifo_profile_release(struct gk20a *g,
354 struct fifo_profile_gk20a *profile)
355{
356 kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
357}
358
359void gk20a_fifo_debugfs_deinit(struct gk20a *g)
360{
361 struct fifo_gk20a *f = &g->fifo;
362
363 nvgpu_mutex_acquire(&f->profile.lock);
364 if (f->profile.enabled) {
365 f->profile.enabled = false;
366 kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
367 }
368 nvgpu_mutex_release(&f->profile.lock);
369}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_fifo.h b/drivers/gpu/nvgpu/common/linux/debug_fifo.h
new file mode 100644
index 00000000..46ac853e
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_fifo.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_FIFO_H__
16#define __NVGPU_DEBUG_FIFO_H__
17
18struct gk20a;
19void gk20a_fifo_debugfs_init(struct gk20a *g);
20void gk20a_fifo_debugfs_deinit(struct gk20a *g);
21
22#endif /* __NVGPU_DEBUG_FIFO_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_gr.c b/drivers/gpu/nvgpu/common/linux/debug_gr.c
new file mode 100644
index 00000000..56b8612e
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_gr.c
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_gr.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19
/*
 * Create the GR debugfs node exposing the default attribute circular
 * buffer size as a writable u32. Always returns 0.
 */
int gr_gk20a_debugfs_init(struct gk20a *g)
{
	struct gk20a_platform *platform = dev_get_drvdata(g->dev);

	/* The dentry is kept so the node could be removed/inspected later. */
	g->debugfs_gr_default_attrib_cb_size =
		debugfs_create_u32("gr_default_attrib_cb_size",
				   S_IRUGO|S_IWUSR, platform->debugfs,
				   &g->gr.attrib_cb_default_size);

	return 0;
}
31
diff --git a/drivers/gpu/nvgpu/gm20b/debug_gm20b.c b/drivers/gpu/nvgpu/common/linux/debug_gr.h
index b266200c..4b46acbb 100644
--- a/drivers/gpu/nvgpu/gm20b/debug_gm20b.c
+++ b/drivers/gpu/nvgpu/common/linux/debug_gr.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2015 NVIDIA Corporation. All rights reserved. 2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 * 3 *
4 * This software is licensed under the terms of the GNU General Public 4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and 5 * License version 2, as published by the Free Software Foundation, and
@@ -12,10 +12,10 @@
12 * 12 *
13 */ 13 */
14 14
15#include "gk20a/gk20a.h" 15#ifndef __NVGPU_DEBUG_GR_H__
16#include "debug_gm20b.h" 16#define __NVGPU_DEBUG_GR_H__
17 17
18void gm20b_init_debug_ops(struct gpu_ops *gops) 18struct gk20a;
19{ 19int gr_gk20a_debugfs_init(struct gk20a *g);
20 gops->debug.show_dump = gk20a_debug_show_dump; 20
21} 21#endif /* __NVGPU_DEBUG_GR_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_kmem.c b/drivers/gpu/nvgpu/common/linux/debug_kmem.c
new file mode 100644
index 00000000..2ee542a8
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_kmem.c
@@ -0,0 +1,315 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_kmem.h"
16#include "kmem_priv.h"
17#include "gk20a/platform_gk20a.h"
18
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
21
22#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
23/**
24 * to_human_readable_bytes - Determine suffix for passed size.
25 *
26 * @bytes - Number of bytes to generate a suffix for.
27 * @hr_bytes [out] - The human readable number of bytes.
28 * @hr_suffix [out] - The suffix for the HR number of bytes.
29 *
30 * Computes a human readable decomposition of the passed number of bytes. The
31 * suffix for the bytes is passed back through the @hr_suffix pointer. The right
32 * number of bytes is then passed back in @hr_bytes. This returns the following
33 * ranges:
34 *
35 * 0 - 1023 B
36 * 1 - 1023 KB
37 * 1 - 1023 MB
38 * 1 - 1023 GB
39 * 1 - 1023 TB
40 * 1 - ... PB
41 */
42static void __to_human_readable_bytes(u64 bytes, u64 *hr_bytes,
43 const char **hr_suffix)
44{
45 static const char *suffixes[] =
46 { "B", "KB", "MB", "GB", "TB", "PB" };
47
48 u64 suffix_ind = 0;
49
50 while (suffix_ind < ARRAY_SIZE(suffixes) && bytes >= 1024) {
51 bytes >>= 10;
52 suffix_ind++;
53 }
54
55 /*
56 * Handle case where bytes > 1023PB.
57 */
58 suffix_ind = suffix_ind < ARRAY_SIZE(suffixes) ?
59 suffix_ind : ARRAY_SIZE(suffixes) - 1;
60
61 *hr_bytes = bytes;
62 *hr_suffix = suffixes[suffix_ind];
63}
64
65/**
66 * print_hr_bytes - Print human readable bytes
67 *
68 * @s - A seq_file to print to. May be NULL.
69 * @msg - A message to print before the bytes.
70 * @bytes - Number of bytes.
71 *
72 * Print @msg followed by the human readable decomposition of the passed number
73 * of bytes.
74 *
75 * If @s is NULL then this prints will be made to the kernel log.
76 */
77static void print_hr_bytes(struct seq_file *s, const char *msg, u64 bytes)
78{
79 u64 hr_bytes;
80 const char *hr_suffix;
81
82 __to_human_readable_bytes(bytes, &hr_bytes, &hr_suffix);
83 __pstat(s, "%s%lld %s\n", msg, hr_bytes, hr_suffix);
84}
85
86/**
87 * print_histogram - Build a histogram of the memory usage.
88 *
89 * @tracker The tracking to pull data from.
90 * @s A seq_file to dump info into.
91 */
92static void print_histogram(struct nvgpu_mem_alloc_tracker *tracker,
93 struct seq_file *s)
94{
95 int i;
96 u64 pot_min, pot_max;
97 u64 nr_buckets;
98 unsigned int *buckets;
99 unsigned int total_allocs;
100 struct nvgpu_rbtree_node *node;
101 static const char histogram_line[] =
102 "++++++++++++++++++++++++++++++++++++++++";
103
104 /*
105 * pot_min is essentially a round down to the nearest power of 2. This
106 * is the start of the histogram. pot_max is just a round up to the
107 * nearest power of two. Each histogram bucket is one power of two so
108 * the histogram buckets are exponential.
109 */
110 pot_min = (u64)rounddown_pow_of_two(tracker->min_alloc);
111 pot_max = (u64)roundup_pow_of_two(tracker->max_alloc);
112
113 nr_buckets = __ffs(pot_max) - __ffs(pot_min);
114
115 buckets = kzalloc(sizeof(*buckets) * nr_buckets, GFP_KERNEL);
116 if (!buckets) {
117 __pstat(s, "OOM: could not allocate bucket storage!?\n");
118 return;
119 }
120
121 /*
122 * Iterate across all of the allocs and determine what bucket they
123 * should go in. Round the size down to the nearest power of two to
124 * find the right bucket.
125 */
126 nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
127 while (node) {
128 int b;
129 u64 bucket_min;
130 struct nvgpu_mem_alloc *alloc =
131 nvgpu_mem_alloc_from_rbtree_node(node);
132
133 bucket_min = (u64)rounddown_pow_of_two(alloc->size);
134 if (bucket_min < tracker->min_alloc)
135 bucket_min = tracker->min_alloc;
136
137 b = __ffs(bucket_min) - __ffs(pot_min);
138
139 /*
140 * Handle the one case were there's an alloc exactly as big as
141 * the maximum bucket size of the largest bucket. Most of the
142 * buckets have an inclusive minimum and exclusive maximum. But
143 * the largest bucket needs to have an _inclusive_ maximum as
144 * well.
145 */
146 if (b == (int)nr_buckets)
147 b--;
148
149 buckets[b]++;
150
151 nvgpu_rbtree_enum_next(&node, node);
152 }
153
154 total_allocs = 0;
155 for (i = 0; i < (int)nr_buckets; i++)
156 total_allocs += buckets[i];
157
158 __pstat(s, "Alloc histogram:\n");
159
160 /*
161 * Actually compute the histogram lines.
162 */
163 for (i = 0; i < (int)nr_buckets; i++) {
164 char this_line[sizeof(histogram_line) + 1];
165 u64 line_length;
166 u64 hr_bytes;
167 const char *hr_suffix;
168
169 memset(this_line, 0, sizeof(this_line));
170
171 /*
172 * Compute the normalized line length. Cant use floating point
173 * so we will just multiply everything by 1000 and use fixed
174 * point.
175 */
176 line_length = (1000 * buckets[i]) / total_allocs;
177 line_length *= sizeof(histogram_line);
178 line_length /= 1000;
179
180 memset(this_line, '+', line_length);
181
182 __to_human_readable_bytes(1 << (__ffs(pot_min) + i),
183 &hr_bytes, &hr_suffix);
184 __pstat(s, " [%-4lld %-4lld] %-2s %5u | %s\n",
185 hr_bytes, hr_bytes << 1,
186 hr_suffix, buckets[i], this_line);
187 }
188}
189
/**
 * nvgpu_kmem_print_stats - Print kmem tracking stats.
 *
 * @tracker The tracking to pull data from.
 * @s       A seq_file to dump info into.
 *
 * Print stats from a tracker. If @s is non-null then seq_printf() will be
 * used with @s. Otherwise the stats are pr_info()ed.
 *
 * Holds the tracker lock for the duration of the dump so the counters and
 * the histogram are mutually consistent.
 */
void nvgpu_kmem_print_stats(struct nvgpu_mem_alloc_tracker *tracker,
			    struct seq_file *s)
{
	nvgpu_lock_tracker(tracker);

	__pstat(s, "Mem tracker: %s\n\n", tracker->name);

	__pstat(s, "Basic Stats:\n");
	__pstat(s, "  Number of allocs        %lld\n",
		tracker->nr_allocs);
	__pstat(s, "  Number of frees         %lld\n",
		tracker->nr_frees);
	print_hr_bytes(s, "  Smallest alloc          ", tracker->min_alloc);
	print_hr_bytes(s, "  Largest alloc           ", tracker->max_alloc);
	print_hr_bytes(s, "  Bytes allocated         ", tracker->bytes_alloced);
	print_hr_bytes(s, "  Bytes freed             ", tracker->bytes_freed);
	/* "real" figures include allocator overhead, not just requested size */
	print_hr_bytes(s, "  Bytes allocated (real)  ",
		       tracker->bytes_alloced_real);
	print_hr_bytes(s, "  Bytes freed (real)      ",
		       tracker->bytes_freed_real);
	__pstat(s, "\n");

	print_histogram(tracker, s);

	nvgpu_unlock_tracker(tracker);
}
225
226static int __kmem_tracking_show(struct seq_file *s, void *unused)
227{
228 struct nvgpu_mem_alloc_tracker *tracker = s->private;
229
230 nvgpu_kmem_print_stats(tracker, s);
231
232 return 0;
233}
234
235static int __kmem_tracking_open(struct inode *inode, struct file *file)
236{
237 return single_open(file, __kmem_tracking_show, inode->i_private);
238}
239
240static const struct file_operations __kmem_tracking_fops = {
241 .open = __kmem_tracking_open,
242 .read = seq_read,
243 .llseek = seq_lseek,
244 .release = single_release,
245};
246
247static int __kmem_traces_dump_tracker(struct gk20a *g,
248 struct nvgpu_mem_alloc_tracker *tracker,
249 struct seq_file *s)
250{
251 struct nvgpu_rbtree_node *node;
252
253 nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
254 while (node) {
255 struct nvgpu_mem_alloc *alloc =
256 nvgpu_mem_alloc_from_rbtree_node(node);
257
258 kmem_print_mem_alloc(g, alloc, s);
259
260 nvgpu_rbtree_enum_next(&node, node);
261 }
262
263 return 0;
264}
265
266static int __kmem_traces_show(struct seq_file *s, void *unused)
267{
268 struct gk20a *g = s->private;
269
270 nvgpu_lock_tracker(g->vmallocs);
271 seq_puts(s, "Oustanding vmallocs:\n");
272 __kmem_traces_dump_tracker(g, g->vmallocs, s);
273 seq_puts(s, "\n");
274 nvgpu_unlock_tracker(g->vmallocs);
275
276 nvgpu_lock_tracker(g->kmallocs);
277 seq_puts(s, "Oustanding kmallocs:\n");
278 __kmem_traces_dump_tracker(g, g->kmallocs, s);
279 nvgpu_unlock_tracker(g->kmallocs);
280
281 return 0;
282}
283
284static int __kmem_traces_open(struct inode *inode, struct file *file)
285{
286 return single_open(file, __kmem_traces_show, inode->i_private);
287}
288
289static const struct file_operations __kmem_traces_fops = {
290 .open = __kmem_traces_open,
291 .read = seq_read,
292 .llseek = seq_lseek,
293 .release = single_release,
294};
295
296void nvgpu_kmem_debugfs_init(struct gk20a *g)
297{
298 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
299 struct dentry *node;
300
301 g->debugfs_kmem = debugfs_create_dir("kmem_tracking", platform->debugfs);
302 if (IS_ERR_OR_NULL(g->debugfs_kmem))
303 return;
304
305 node = debugfs_create_file(g->vmallocs->name, S_IRUGO,
306 g->debugfs_kmem,
307 g->vmallocs, &__kmem_tracking_fops);
308 node = debugfs_create_file(g->kmallocs->name, S_IRUGO,
309 g->debugfs_kmem,
310 g->kmallocs, &__kmem_tracking_fops);
311 node = debugfs_create_file("traces", S_IRUGO,
312 g->debugfs_kmem,
313 g, &__kmem_traces_fops);
314}
315#endif
diff --git a/drivers/gpu/nvgpu/common/linux/debug_kmem.h b/drivers/gpu/nvgpu/common/linux/debug_kmem.h
new file mode 100644
index 00000000..44322b53
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_kmem.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_KMEM_H__
16#define __NVGPU_DEBUG_KMEM_H__
17
18struct gk20a;
19#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
20void nvgpu_kmem_debugfs_init(struct gk20a *g);
21#endif
22
23#endif /* __NVGPU_DEBUG_KMEM_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_mm.c b/drivers/gpu/nvgpu/common/linux/debug_mm.c
new file mode 100644
index 00000000..1e260f89
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_mm.c
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_mm.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19
/*
 * Expose the MM "force_pramin" flag as a writable debugfs bool so PRAMIN
 * access can be forced from user space for debugging.
 */
void gk20a_mm_debugfs_init(struct gk20a *g)
{
	struct gk20a_platform *platform = dev_get_drvdata(g->dev);

	debugfs_create_bool("force_pramin", 0664, platform->debugfs,
			    &g->mm.force_pramin);
}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_mm.h b/drivers/gpu/nvgpu/common/linux/debug_mm.h
new file mode 100644
index 00000000..bf7bc985
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_mm.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_MM_H__
16#define __NVGPU_DEBUG_MM_H__
17
18struct gk20a;
19void gk20a_mm_debugfs_init(struct gk20a *g);
20
21#endif /* __NVGPU_DEBUG_MM_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_pmu.c b/drivers/gpu/nvgpu/common/linux/debug_pmu.c
new file mode 100644
index 00000000..f19f5139
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_pmu.c
@@ -0,0 +1,479 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_pmu.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19#include <linux/seq_file.h>
20#include <linux/uaccess.h>
21
/*
 * seq_file show callback for "lpwr_debug": dump the low-power state of the
 * GPU. When the PG feature list reports anything beyond plain GR power
 * gating, the full pstate/RPPG/MSCG state is printed; otherwise only the
 * basic ELPG counters are shown.
 */
static int lpwr_debug_show(struct seq_file *s, void *data)
{
	struct gk20a *g = s->private;

	if (g->ops.pmu.pmu_pg_engines_feature_list &&
		g->ops.pmu.pmu_pg_engines_feature_list(g,
			PMU_PG_ELPG_ENGINE_ID_GRAPHICS) !=
		PMU_PG_FEATURE_GR_POWER_GATING_ENABLED) {
		seq_printf(s, "PSTATE: %u\n"
			"RPPG Enabled: %u\n"
			"RPPG ref count: %u\n"
			"RPPG state: %u\n"
			"MSCG Enabled: %u\n"
			"MSCG pstate state: %u\n"
			"MSCG transition state: %u\n",
			g->ops.clk_arb.get_current_pstate(g),
			g->elpg_enabled, g->pmu.elpg_refcnt,
			g->pmu.elpg_stat, g->mscg_enabled,
			g->pmu.mscg_stat, g->pmu.mscg_transition_state);

	} else
		seq_printf(s, "ELPG Enabled: %u\n"
			"ELPG ref count: %u\n"
			"ELPG state: %u\n",
			g->elpg_enabled, g->pmu.elpg_refcnt,
			g->pmu.elpg_stat);

	return 0;

}
52
53static int lpwr_debug_open(struct inode *inode, struct file *file)
54{
55 return single_open(file, lpwr_debug_show, inode->i_private);
56}
57
58static const struct file_operations lpwr_debug_fops = {
59 .open = lpwr_debug_open,
60 .read = seq_read,
61 .llseek = seq_lseek,
62 .release = single_release,
63};
64
/*
 * seq_file show callback for "mscg_residency": report time spent in/out of
 * MSCG plus a residency ratio and entry/exit latency statistics.
 */
static int mscg_stat_show(struct seq_file *s, void *data)
{
	struct gk20a *g = s->private;
	u64 total_ingating, total_ungating, residency, divisor, dividend;
	struct pmu_pg_stats_data pg_stat_data = { 0 };
	int err;

	/* Don't unnecessarily power on the device */
	if (g->power_on) {
		err = gk20a_busy(g);
		if (err)
			return err;

		gk20a_pmu_get_pg_stats(g,
			PMU_PG_ELPG_ENGINE_ID_MS, &pg_stat_data);
		gk20a_idle(g);
	}
	/*
	 * Fold the live PMU counters into the totals accumulated across
	 * earlier power cycles. If the GPU was off, pg_stat_data stays zero
	 * and only the accumulated totals are reported.
	 */
	total_ingating = g->pg_ingating_time_us +
			(u64)pg_stat_data.ingating_time;
	total_ungating = g->pg_ungating_time_us +
			(u64)pg_stat_data.ungating_time;

	divisor = total_ingating + total_ungating;

	/* We compute the residency on a scale of 1000 */
	dividend = total_ingating * 1000;

	if (divisor)
		residency = div64_u64(dividend, divisor);
	else
		residency = 0;

	seq_printf(s,
		"Time in MSCG: %llu us\n"
		"Time out of MSCG: %llu us\n"
		"MSCG residency ratio: %llu\n"
		"MSCG Entry Count: %u\n"
		"MSCG Avg Entry latency %u\n"
		"MSCG Avg Exit latency %u\n",
		total_ingating, total_ungating,
		residency, pg_stat_data.gating_cnt,
		pg_stat_data.avg_entry_latency_us,
		pg_stat_data.avg_exit_latency_us);
	return 0;

}
111
112static int mscg_stat_open(struct inode *inode, struct file *file)
113{
114 return single_open(file, mscg_stat_show, inode->i_private);
115}
116
117static const struct file_operations mscg_stat_fops = {
118 .open = mscg_stat_open,
119 .read = seq_read,
120 .llseek = seq_lseek,
121 .release = single_release,
122};
123
124static int mscg_transitions_show(struct seq_file *s, void *data)
125{
126 struct gk20a *g = s->private;
127 struct pmu_pg_stats_data pg_stat_data = { 0 };
128 u32 total_gating_cnt;
129 int err;
130
131 if (g->power_on) {
132 err = gk20a_busy(g);
133 if (err)
134 return err;
135
136 gk20a_pmu_get_pg_stats(g,
137 PMU_PG_ELPG_ENGINE_ID_MS, &pg_stat_data);
138 gk20a_idle(g);
139 }
140 total_gating_cnt = g->pg_gating_cnt + pg_stat_data.gating_cnt;
141
142 seq_printf(s, "%u\n", total_gating_cnt);
143 return 0;
144
145}
146
147static int mscg_transitions_open(struct inode *inode, struct file *file)
148{
149 return single_open(file, mscg_transitions_show, inode->i_private);
150}
151
152static const struct file_operations mscg_transitions_fops = {
153 .open = mscg_transitions_open,
154 .read = seq_read,
155 .llseek = seq_lseek,
156 .release = single_release,
157};
158
159static int elpg_stat_show(struct seq_file *s, void *data)
160{
161 struct gk20a *g = s->private;
162 struct pmu_pg_stats_data pg_stat_data = { 0 };
163 u64 total_ingating, total_ungating, residency, divisor, dividend;
164 int err;
165
166 /* Don't unnecessarily power on the device */
167 if (g->power_on) {
168 err = gk20a_busy(g);
169 if (err)
170 return err;
171
172 gk20a_pmu_get_pg_stats(g,
173 PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg_stat_data);
174 gk20a_idle(g);
175 }
176 total_ingating = g->pg_ingating_time_us +
177 (u64)pg_stat_data.ingating_time;
178 total_ungating = g->pg_ungating_time_us +
179 (u64)pg_stat_data.ungating_time;
180 divisor = total_ingating + total_ungating;
181
182 /* We compute the residency on a scale of 1000 */
183 dividend = total_ingating * 1000;
184
185 if (divisor)
186 residency = div64_u64(dividend, divisor);
187 else
188 residency = 0;
189
190 seq_printf(s,
191 "Time in ELPG: %llu us\n"
192 "Time out of ELPG: %llu us\n"
193 "ELPG residency ratio: %llu\n"
194 "ELPG Entry Count: %u\n"
195 "ELPG Avg Entry latency %u us\n"
196 "ELPG Avg Exit latency %u us\n",
197 total_ingating, total_ungating,
198 residency, pg_stat_data.gating_cnt,
199 pg_stat_data.avg_entry_latency_us,
200 pg_stat_data.avg_exit_latency_us);
201 return 0;
202
203}
204
205static int elpg_stat_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, elpg_stat_show, inode->i_private);
208}
209
210static const struct file_operations elpg_stat_fops = {
211 .open = elpg_stat_open,
212 .read = seq_read,
213 .llseek = seq_lseek,
214 .release = single_release,
215};
216
217static int elpg_transitions_show(struct seq_file *s, void *data)
218{
219 struct gk20a *g = s->private;
220 struct pmu_pg_stats_data pg_stat_data = { 0 };
221 u32 total_gating_cnt;
222 int err;
223
224 if (g->power_on) {
225 err = gk20a_busy(g);
226 if (err)
227 return err;
228
229 gk20a_pmu_get_pg_stats(g,
230 PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg_stat_data);
231 gk20a_idle(g);
232 }
233 total_gating_cnt = g->pg_gating_cnt + pg_stat_data.gating_cnt;
234
235 seq_printf(s, "%u\n", total_gating_cnt);
236 return 0;
237
238}
239
240static int elpg_transitions_open(struct inode *inode, struct file *file)
241{
242 return single_open(file, elpg_transitions_show, inode->i_private);
243}
244
245static const struct file_operations elpg_transitions_fops = {
246 .open = elpg_transitions_open,
247 .read = seq_read,
248 .llseek = seq_lseek,
249 .release = single_release,
250};
251
/*
 * seq_file show callback for "falc_trace": copy the PMU falcon trace buffer
 * into system memory and pretty-print it. Each 0x40-byte record starts with
 * an index word followed by a format string at offset 20; embedded "0x"
 * placeholders in the string are substituted with the argument words that
 * follow the index.
 */
static int falc_trace_show(struct seq_file *s, void *data)
{
	struct gk20a *g = s->private;
	struct pmu_gk20a *pmu = &g->pmu;
	u32 i = 0, j = 0, k, l, m;
	char part_str[40];
	void *tracebuffer;
	char *trace;
	u32 *trace1;

	/* allocate system memory to copy pmu trace buffer */
	tracebuffer = nvgpu_kzalloc(g, GK20A_PMU_TRACE_BUFSIZE);
	if (tracebuffer == NULL)
		return -ENOMEM;

	/* read pmu traces into system memory buffer */
	nvgpu_mem_rd_n(g, &pmu->trace_buf,
		       0, tracebuffer, GK20A_PMU_TRACE_BUFSIZE);

	trace = (char *)tracebuffer;
	trace1 = (u32 *)tracebuffer;

	for (i = 0; i < GK20A_PMU_TRACE_BUFSIZE; i += 0x40) {
		/*
		 * NOTE(review): this scans 0x40 *words* (256 bytes) although
		 * the stride is 0x40 *bytes*; near the end of the buffer
		 * trace1[(i / 4) + j] can read past the allocation. Presumably
		 * the intent is "stop at the first all-zero region" — confirm
		 * against the PMU trace format before changing.
		 */
		for (j = 0; j < 0x40; j++)
			if (trace1[(i / 4) + j])
				break;
		if (j == 0x40)
			break;
		seq_printf(s, "Index %x: ", trace1[(i / 4)]);
		l = 0;
		m = 0;
		/* substitute each hex placeholder with the next arg word */
		while (nvgpu_find_hex_in_string((trace+i+20+m), g, &k)) {
			if (k >= 40)
				break;
			strncpy(part_str, (trace+i+20+m), k);
			part_str[k] = 0;
			seq_printf(s, "%s0x%x", part_str,
					trace1[(i / 4) + 1 + l]);
			l++;
			m += k + 2;
		}
		seq_printf(s, "%s", (trace+i+20+m));
	}

	nvgpu_kfree(g, tracebuffer);
	return 0;
}
299
300static int falc_trace_open(struct inode *inode, struct file *file)
301{
302 return single_open(file, falc_trace_show, inode->i_private);
303}
304
305static const struct file_operations falc_trace_fops = {
306 .open = falc_trace_open,
307 .read = seq_read,
308 .llseek = seq_lseek,
309 .release = single_release,
310};
311
312static int perfmon_events_enable_show(struct seq_file *s, void *data)
313{
314 struct gk20a *g = s->private;
315
316 seq_printf(s, "%u\n", g->pmu.perfmon_sampling_enabled ? 1 : 0);
317 return 0;
318
319}
320
321static int perfmon_events_enable_open(struct inode *inode, struct file *file)
322{
323 return single_open(file, perfmon_events_enable_show, inode->i_private);
324}
325
/*
 * debugfs write handler for "perfmon_events_enable": parse a decimal 0/1
 * from user space and start/stop PMU perfmon sampling accordingly. When the
 * GPU is powered off only the flag is recorded; sampling state is applied
 * on the next power-up.
 */
static ssize_t perfmon_events_enable_write(struct file *file,
	const char __user *userbuf, size_t count, loff_t *ppos)
{
	struct seq_file *s = file->private_data;
	struct gk20a *g = s->private;
	unsigned long val = 0;
	char buf[40];
	int buf_size;
	int err;

	memset(buf, 0, sizeof(buf));
	/* clamp to buf size minus one so the string stays NUL-terminated */
	buf_size = min(count, (sizeof(buf)-1));

	if (copy_from_user(buf, userbuf, buf_size))
		return -EFAULT;

	if (kstrtoul(buf, 10, &val) < 0)
		return -EINVAL;

	/* Don't turn on gk20a unnecessarily */
	if (g->power_on) {
		err = gk20a_busy(g);
		if (err)
			return err;

		/* only toggle sampling when the requested state differs */
		if (val && !g->pmu.perfmon_sampling_enabled) {
			g->pmu.perfmon_sampling_enabled = true;
			nvgpu_pmu_perfmon_start_sampling(&(g->pmu));
		} else if (!val && g->pmu.perfmon_sampling_enabled) {
			g->pmu.perfmon_sampling_enabled = false;
			nvgpu_pmu_perfmon_stop_sampling(&(g->pmu));
		}
		gk20a_idle(g);
	} else {
		g->pmu.perfmon_sampling_enabled = val ? true : false;
	}

	return count;
}
365
366static const struct file_operations perfmon_events_enable_fops = {
367 .open = perfmon_events_enable_open,
368 .read = seq_read,
369 .write = perfmon_events_enable_write,
370 .llseek = seq_lseek,
371 .release = single_release,
372};
373
374static int perfmon_events_count_show(struct seq_file *s, void *data)
375{
376 struct gk20a *g = s->private;
377
378 seq_printf(s, "%lu\n", g->pmu.perfmon_events_cnt);
379 return 0;
380
381}
382
383static int perfmon_events_count_open(struct inode *inode, struct file *file)
384{
385 return single_open(file, perfmon_events_count_show, inode->i_private);
386}
387
388static const struct file_operations perfmon_events_count_fops = {
389 .open = perfmon_events_count_open,
390 .read = seq_read,
391 .llseek = seq_lseek,
392 .release = single_release,
393};
394
395static int security_show(struct seq_file *s, void *data)
396{
397 struct gk20a *g = s->private;
398
399 seq_printf(s, "%d\n", g->pmu.pmu_mode);
400 return 0;
401
402}
403
404static int security_open(struct inode *inode, struct file *file)
405{
406 return single_open(file, security_show, inode->i_private);
407}
408
409static const struct file_operations security_fops = {
410 .open = security_open,
411 .read = seq_read,
412 .llseek = seq_lseek,
413 .release = single_release,
414};
415
416int gk20a_pmu_debugfs_init(struct gk20a *g)
417{
418 struct dentry *d;
419 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
420
421 d = debugfs_create_file(
422 "lpwr_debug", S_IRUGO|S_IWUSR, platform->debugfs, g,
423 &lpwr_debug_fops);
424 if (!d)
425 goto err_out;
426
427 d = debugfs_create_file(
428 "mscg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
429 &mscg_stat_fops);
430 if (!d)
431 goto err_out;
432
433 d = debugfs_create_file(
434 "mscg_transitions", S_IRUGO, platform->debugfs, g,
435 &mscg_transitions_fops);
436 if (!d)
437 goto err_out;
438
439 d = debugfs_create_file(
440 "elpg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
441 &elpg_stat_fops);
442 if (!d)
443 goto err_out;
444
445 d = debugfs_create_file(
446 "elpg_transitions", S_IRUGO, platform->debugfs, g,
447 &elpg_transitions_fops);
448 if (!d)
449 goto err_out;
450
451 d = debugfs_create_file(
452 "falc_trace", S_IRUGO, platform->debugfs, g,
453 &falc_trace_fops);
454 if (!d)
455 goto err_out;
456
457 d = debugfs_create_file(
458 "perfmon_events_enable", S_IRUGO, platform->debugfs, g,
459 &perfmon_events_enable_fops);
460 if (!d)
461 goto err_out;
462
463 d = debugfs_create_file(
464 "perfmon_events_count", S_IRUGO, platform->debugfs, g,
465 &perfmon_events_count_fops);
466 if (!d)
467 goto err_out;
468
469 d = debugfs_create_file(
470 "pmu_security", S_IRUGO, platform->debugfs, g,
471 &security_fops);
472 if (!d)
473 goto err_out;
474 return 0;
475err_out:
476 pr_err("%s: Failed to make debugfs node\n", __func__);
477 debugfs_remove_recursive(platform->debugfs);
478 return -ENOMEM;
479}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_pmu.h b/drivers/gpu/nvgpu/common/linux/debug_pmu.h
new file mode 100644
index 00000000..c4e3243d
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_pmu.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_PMU_H__
16#define __NVGPU_DEBUG_PMU_H__
17
18struct gk20a;
19int gk20a_pmu_debugfs_init(struct gk20a *g);
20
21#endif /* __NVGPU_DEBUG_PMU_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_sched.c b/drivers/gpu/nvgpu/common/linux/debug_sched.c
new file mode 100644
index 00000000..40b93149
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_sched.c
@@ -0,0 +1,79 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#include "debug_sched.h"
16#include "gk20a/platform_gk20a.h"
17
18#include <linux/debugfs.h>
19#include <linux/seq_file.h>
20
/*
 * seq_file show callback for "sched_ctrl": dump scheduler control state and
 * the active/recent TSG bitmaps. Powers the GPU up for the duration of the
 * dump.
 */
static int gk20a_sched_debugfs_show(struct seq_file *s, void *unused)
{
	struct gk20a *g = s->private;
	struct gk20a_sched_ctrl *sched = &g->sched_ctrl;
	bool sched_busy = true;

	int n = sched->bitmap_size / sizeof(u64);
	int i;
	int err;

	err = gk20a_busy(g);
	if (err)
		return err;

	/*
	 * Probe busy state without blocking: if we can take busy_lock it was
	 * free, so the scheduler is not busy.
	 */
	if (nvgpu_mutex_tryacquire(&sched->busy_lock)) {
		sched_busy = false;
		nvgpu_mutex_release(&sched->busy_lock);
	}

	seq_printf(s, "control_locked=%d\n", sched->control_locked);
	seq_printf(s, "busy=%d\n", sched_busy);
	seq_printf(s, "bitmap_size=%zu\n", sched->bitmap_size);

	/* status_lock keeps the two bitmaps mutually consistent */
	nvgpu_mutex_acquire(&sched->status_lock);

	seq_puts(s, "active_tsg_bitmap\n");
	for (i = 0; i < n; i++)
		seq_printf(s, "\t0x%016llx\n", sched->active_tsg_bitmap[i]);

	seq_puts(s, "recent_tsg_bitmap\n");
	for (i = 0; i < n; i++)
		seq_printf(s, "\t0x%016llx\n", sched->recent_tsg_bitmap[i]);

	nvgpu_mutex_release(&sched->status_lock);

	gk20a_idle(g);

	return 0;
}
60
61static int gk20a_sched_debugfs_open(struct inode *inode, struct file *file)
62{
63 return single_open(file, gk20a_sched_debugfs_show, inode->i_private);
64}
65
66static const struct file_operations gk20a_sched_debugfs_fops = {
67 .open = gk20a_sched_debugfs_open,
68 .read = seq_read,
69 .llseek = seq_lseek,
70 .release = single_release,
71};
72
/*
 * Register the read-only "sched_ctrl" debugfs node under the platform
 * debugfs directory.
 */
void gk20a_sched_debugfs_init(struct gk20a *g)
{
	struct gk20a_platform *platform = dev_get_drvdata(g->dev);

	debugfs_create_file("sched_ctrl", S_IRUGO, platform->debugfs,
			g, &gk20a_sched_debugfs_fops);
}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_sched.h b/drivers/gpu/nvgpu/common/linux/debug_sched.h
new file mode 100644
index 00000000..34a8f55f
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_sched.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) 2017 NVIDIA Corporation. All rights reserved.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 */
14
15#ifndef __NVGPU_DEBUG_SCHED_H__
16#define __NVGPU_DEBUG_SCHED_H__
17
18struct gk20a;
19void gk20a_sched_debugfs_init(struct gk20a *g);
20
21#endif /* __NVGPU_DEBUG_SCHED_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/driver_common.c b/drivers/gpu/nvgpu/common/linux/driver_common.c
index 80e7698b..f85016d4 100644
--- a/drivers/gpu/nvgpu/common/linux/driver_common.c
+++ b/drivers/gpu/nvgpu/common/linux/driver_common.c
@@ -21,6 +21,7 @@
21#include <nvgpu/soc.h> 21#include <nvgpu/soc.h>
22#include <nvgpu/bug.h> 22#include <nvgpu/bug.h>
23#include <nvgpu/enabled.h> 23#include <nvgpu/enabled.h>
24#include <nvgpu/debug.h>
24 25
25#include "gk20a/gk20a_scale.h" 26#include "gk20a/gk20a_scale.h"
26#include "gk20a/gk20a.h" 27#include "gk20a/gk20a.h"
@@ -182,7 +183,7 @@ int nvgpu_probe(struct gk20a *g,
182 nvgpu_init_mm_vars(g); 183 nvgpu_init_mm_vars(g);
183 184
184 gk20a_create_sysfs(g->dev); 185 gk20a_create_sysfs(g->dev);
185 gk20a_debug_init(g->dev, debugfs_symlink); 186 gk20a_debug_init(g, debugfs_symlink);
186 187
187 g->dbg_regops_tmp_buf = nvgpu_kzalloc(g, SZ_4K); 188 g->dbg_regops_tmp_buf = nvgpu_kzalloc(g, SZ_4K);
188 if (!g->dbg_regops_tmp_buf) { 189 if (!g->dbg_regops_tmp_buf) {
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
index 2502ff30..d81328f0 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
@@ -26,9 +26,9 @@
26#include <nvgpu/kmem.h> 26#include <nvgpu/kmem.h>
27#include <nvgpu/log.h> 27#include <nvgpu/log.h>
28#include <nvgpu/list.h> 28#include <nvgpu/list.h>
29#include <nvgpu/debug.h>
29 30
30#include "gk20a/gk20a.h" 31#include "gk20a/gk20a.h"
31#include "gk20a/debug_gk20a.h"
32#include "gk20a/ctxsw_trace_gk20a.h" 32#include "gk20a/ctxsw_trace_gk20a.h"
33#include "gk20a/dbg_gpu_gk20a.h" 33#include "gk20a/dbg_gpu_gk20a.h"
34#include "gk20a/fence_gk20a.h" 34#include "gk20a/fence_gk20a.h"
diff --git a/drivers/gpu/nvgpu/common/linux/kmem.c b/drivers/gpu/nvgpu/common/linux/kmem.c
index d058eba5..41aaa729 100644
--- a/drivers/gpu/nvgpu/common/linux/kmem.c
+++ b/drivers/gpu/nvgpu/common/linux/kmem.c
@@ -134,19 +134,19 @@ void __nvgpu_vfree(struct gk20a *g, void *addr)
134 134
135#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE 135#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
136 136
137static void lock_tracker(struct nvgpu_mem_alloc_tracker *tracker) 137void nvgpu_lock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
138{ 138{
139 nvgpu_mutex_acquire(&tracker->lock); 139 nvgpu_mutex_acquire(&tracker->lock);
140} 140}
141 141
142static void unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker) 142void nvgpu_unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
143{ 143{
144 nvgpu_mutex_release(&tracker->lock); 144 nvgpu_mutex_release(&tracker->lock);
145} 145}
146 146
147static void kmem_print_mem_alloc(struct gk20a *g, 147void kmem_print_mem_alloc(struct gk20a *g,
148 struct nvgpu_mem_alloc *alloc, 148 struct nvgpu_mem_alloc *alloc,
149 struct seq_file *s) 149 struct seq_file *s)
150{ 150{
151#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES 151#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES
152 int i; 152 int i;
@@ -231,7 +231,7 @@ static int __nvgpu_save_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
231 alloc->stack_length = stack_trace.nr_entries; 231 alloc->stack_length = stack_trace.nr_entries;
232#endif 232#endif
233 233
234 lock_tracker(tracker); 234 nvgpu_lock_tracker(tracker);
235 tracker->bytes_alloced += size; 235 tracker->bytes_alloced += size;
236 tracker->bytes_alloced_real += real_size; 236 tracker->bytes_alloced_real += real_size;
237 tracker->nr_allocs++; 237 tracker->nr_allocs++;
@@ -246,10 +246,10 @@ static int __nvgpu_save_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
246 if (ret) { 246 if (ret) {
247 WARN(1, "Duplicate alloc??? 0x%llx\n", addr); 247 WARN(1, "Duplicate alloc??? 0x%llx\n", addr);
248 kfree(alloc); 248 kfree(alloc);
249 unlock_tracker(tracker); 249 nvgpu_unlock_tracker(tracker);
250 return ret; 250 return ret;
251 } 251 }
252 unlock_tracker(tracker); 252 nvgpu_unlock_tracker(tracker);
253 253
254 return 0; 254 return 0;
255} 255}
@@ -259,17 +259,17 @@ static int __nvgpu_free_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
259{ 259{
260 struct nvgpu_mem_alloc *alloc; 260 struct nvgpu_mem_alloc *alloc;
261 261
262 lock_tracker(tracker); 262 nvgpu_lock_tracker(tracker);
263 alloc = nvgpu_rem_alloc(tracker, addr); 263 alloc = nvgpu_rem_alloc(tracker, addr);
264 if (WARN(!alloc, "Possible double-free detected: 0x%llx!", addr)) { 264 if (WARN(!alloc, "Possible double-free detected: 0x%llx!", addr)) {
265 unlock_tracker(tracker); 265 nvgpu_unlock_tracker(tracker);
266 return -EINVAL; 266 return -EINVAL;
267 } 267 }
268 268
269 tracker->nr_frees++; 269 tracker->nr_frees++;
270 tracker->bytes_freed += alloc->size; 270 tracker->bytes_freed += alloc->size;
271 tracker->bytes_freed_real += alloc->real_size; 271 tracker->bytes_freed_real += alloc->real_size;
272 unlock_tracker(tracker); 272 nvgpu_unlock_tracker(tracker);
273 273
274 return 0; 274 return 0;
275} 275}
@@ -407,307 +407,6 @@ void __nvgpu_track_kfree(struct gk20a *g, void *addr)
407 __nvgpu_free_kmem_alloc(g->kmallocs, (u64)(uintptr_t)addr); 407 __nvgpu_free_kmem_alloc(g->kmallocs, (u64)(uintptr_t)addr);
408} 408}
409 409
410/**
411 * to_human_readable_bytes - Determine suffix for passed size.
412 *
413 * @bytes - Number of bytes to generate a suffix for.
414 * @hr_bytes [out] - The human readable number of bytes.
415 * @hr_suffix [out] - The suffix for the HR number of bytes.
416 *
417 * Computes a human readable decomposition of the passed number of bytes. The
418 * suffix for the bytes is passed back through the @hr_suffix pointer. The right
419 * number of bytes is then passed back in @hr_bytes. This returns the following
420 * ranges:
421 *
422 * 0 - 1023 B
423 * 1 - 1023 KB
424 * 1 - 1023 MB
425 * 1 - 1023 GB
426 * 1 - 1023 TB
427 * 1 - ... PB
428 */
429static void __to_human_readable_bytes(u64 bytes, u64 *hr_bytes,
430 const char **hr_suffix)
431{
432 static const char *suffixes[] =
433 { "B", "KB", "MB", "GB", "TB", "PB" };
434
435 u64 suffix_ind = 0;
436
437 while (suffix_ind < ARRAY_SIZE(suffixes) && bytes >= 1024) {
438 bytes >>= 10;
439 suffix_ind++;
440 }
441
442 /*
443 * Handle case where bytes > 1023PB.
444 */
445 suffix_ind = suffix_ind < ARRAY_SIZE(suffixes) ?
446 suffix_ind : ARRAY_SIZE(suffixes) - 1;
447
448 *hr_bytes = bytes;
449 *hr_suffix = suffixes[suffix_ind];
450}
451
452/**
453 * print_hr_bytes - Print human readable bytes
454 *
455 * @s - A seq_file to print to. May be NULL.
456 * @msg - A message to print before the bytes.
457 * @bytes - Number of bytes.
458 *
459 * Print @msg followed by the human readable decomposition of the passed number
460 * of bytes.
461 *
462 * If @s is NULL then this prints will be made to the kernel log.
463 */
464static void print_hr_bytes(struct seq_file *s, const char *msg, u64 bytes)
465{
466 u64 hr_bytes;
467 const char *hr_suffix;
468
469 __to_human_readable_bytes(bytes, &hr_bytes, &hr_suffix);
470 __pstat(s, "%s%lld %s\n", msg, hr_bytes, hr_suffix);
471}
472
473/**
474 * print_histogram - Build a histogram of the memory usage.
475 *
476 * @tracker The tracking to pull data from.
477 * @s A seq_file to dump info into.
478 */
479static void print_histogram(struct nvgpu_mem_alloc_tracker *tracker,
480 struct seq_file *s)
481{
482 int i;
483 u64 pot_min, pot_max;
484 u64 nr_buckets;
485 unsigned int *buckets;
486 unsigned int total_allocs;
487 struct nvgpu_rbtree_node *node;
488 static const char histogram_line[] =
489 "++++++++++++++++++++++++++++++++++++++++";
490
491 /*
492 * pot_min is essentially a round down to the nearest power of 2. This
493 * is the start of the histogram. pot_max is just a round up to the
494 * nearest power of two. Each histogram bucket is one power of two so
495 * the histogram buckets are exponential.
496 */
497 pot_min = (u64)rounddown_pow_of_two(tracker->min_alloc);
498 pot_max = (u64)roundup_pow_of_two(tracker->max_alloc);
499
500 nr_buckets = __ffs(pot_max) - __ffs(pot_min);
501
502 buckets = kzalloc(sizeof(*buckets) * nr_buckets, GFP_KERNEL);
503 if (!buckets) {
504 __pstat(s, "OOM: could not allocate bucket storage!?\n");
505 return;
506 }
507
508 /*
509 * Iterate across all of the allocs and determine what bucket they
510 * should go in. Round the size down to the nearest power of two to
511 * find the right bucket.
512 */
513 nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
514 while (node) {
515 int b;
516 u64 bucket_min;
517 struct nvgpu_mem_alloc *alloc =
518 nvgpu_mem_alloc_from_rbtree_node(node);
519
520 bucket_min = (u64)rounddown_pow_of_two(alloc->size);
521 if (bucket_min < tracker->min_alloc)
522 bucket_min = tracker->min_alloc;
523
524 b = __ffs(bucket_min) - __ffs(pot_min);
525
526 /*
527 * Handle the one case were there's an alloc exactly as big as
528 * the maximum bucket size of the largest bucket. Most of the
529 * buckets have an inclusive minimum and exclusive maximum. But
530 * the largest bucket needs to have an _inclusive_ maximum as
531 * well.
532 */
533 if (b == (int)nr_buckets)
534 b--;
535
536 buckets[b]++;
537
538 nvgpu_rbtree_enum_next(&node, node);
539 }
540
541 total_allocs = 0;
542 for (i = 0; i < (int)nr_buckets; i++)
543 total_allocs += buckets[i];
544
545 __pstat(s, "Alloc histogram:\n");
546
547 /*
548 * Actually compute the histogram lines.
549 */
550 for (i = 0; i < (int)nr_buckets; i++) {
551 char this_line[sizeof(histogram_line) + 1];
552 u64 line_length;
553 u64 hr_bytes;
554 const char *hr_suffix;
555
556 memset(this_line, 0, sizeof(this_line));
557
558 /*
559 * Compute the normalized line length. Cant use floating point
560 * so we will just multiply everything by 1000 and use fixed
561 * point.
562 */
563 line_length = (1000 * buckets[i]) / total_allocs;
564 line_length *= sizeof(histogram_line);
565 line_length /= 1000;
566
567 memset(this_line, '+', line_length);
568
569 __to_human_readable_bytes(1 << (__ffs(pot_min) + i),
570 &hr_bytes, &hr_suffix);
571 __pstat(s, " [%-4lld %-4lld] %-2s %5u | %s\n",
572 hr_bytes, hr_bytes << 1,
573 hr_suffix, buckets[i], this_line);
574 }
575}
576
577#ifdef CONFIG_DEBUG_FS
578/**
579 * nvgpu_kmem_print_stats - Print kmem tracking stats.
580 *
581 * @tracker The tracking to pull data from.
582 * @s A seq_file to dump info into.
583 *
584 * Print stats from a tracker. If @s is non-null then seq_printf() will be
585 * used with @s. Otherwise the stats are pr_info()ed.
586 */
587void nvgpu_kmem_print_stats(struct nvgpu_mem_alloc_tracker *tracker,
588 struct seq_file *s)
589{
590 lock_tracker(tracker);
591
592 __pstat(s, "Mem tracker: %s\n\n", tracker->name);
593
594 __pstat(s, "Basic Stats:\n");
595 __pstat(s, " Number of allocs %lld\n",
596 tracker->nr_allocs);
597 __pstat(s, " Number of frees %lld\n",
598 tracker->nr_frees);
599 print_hr_bytes(s, " Smallest alloc ", tracker->min_alloc);
600 print_hr_bytes(s, " Largest alloc ", tracker->max_alloc);
601 print_hr_bytes(s, " Bytes allocated ", tracker->bytes_alloced);
602 print_hr_bytes(s, " Bytes freed ", tracker->bytes_freed);
603 print_hr_bytes(s, " Bytes allocated (real) ",
604 tracker->bytes_alloced_real);
605 print_hr_bytes(s, " Bytes freed (real) ",
606 tracker->bytes_freed_real);
607 __pstat(s, "\n");
608
609 print_histogram(tracker, s);
610
611 unlock_tracker(tracker);
612}
613
614static int __kmem_tracking_show(struct seq_file *s, void *unused)
615{
616 struct nvgpu_mem_alloc_tracker *tracker = s->private;
617
618 nvgpu_kmem_print_stats(tracker, s);
619
620 return 0;
621}
622
623static int __kmem_tracking_open(struct inode *inode, struct file *file)
624{
625 return single_open(file, __kmem_tracking_show, inode->i_private);
626}
627
628static const struct file_operations __kmem_tracking_fops = {
629 .open = __kmem_tracking_open,
630 .read = seq_read,
631 .llseek = seq_lseek,
632 .release = single_release,
633};
634
635static int __kmem_traces_dump_tracker(struct gk20a *g,
636 struct nvgpu_mem_alloc_tracker *tracker,
637 struct seq_file *s)
638{
639 struct nvgpu_rbtree_node *node;
640
641 nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
642 while (node) {
643 struct nvgpu_mem_alloc *alloc =
644 nvgpu_mem_alloc_from_rbtree_node(node);
645
646 kmem_print_mem_alloc(g, alloc, s);
647
648 nvgpu_rbtree_enum_next(&node, node);
649 }
650
651 return 0;
652}
653
654static int __kmem_traces_show(struct seq_file *s, void *unused)
655{
656 struct gk20a *g = s->private;
657
658 lock_tracker(g->vmallocs);
659 seq_puts(s, "Oustanding vmallocs:\n");
660 __kmem_traces_dump_tracker(g, g->vmallocs, s);
661 seq_puts(s, "\n");
662 unlock_tracker(g->vmallocs);
663
664 lock_tracker(g->kmallocs);
665 seq_puts(s, "Oustanding kmallocs:\n");
666 __kmem_traces_dump_tracker(g, g->kmallocs, s);
667 unlock_tracker(g->kmallocs);
668
669 return 0;
670}
671
672static int __kmem_traces_open(struct inode *inode, struct file *file)
673{
674 return single_open(file, __kmem_traces_show, inode->i_private);
675}
676
677static const struct file_operations __kmem_traces_fops = {
678 .open = __kmem_traces_open,
679 .read = seq_read,
680 .llseek = seq_lseek,
681 .release = single_release,
682};
683
684void nvgpu_kmem_debugfs_init(struct device *dev)
685{
686 struct gk20a_platform *plat = dev_get_drvdata(dev);
687 struct gk20a *g = get_gk20a(dev);
688 struct dentry *gpu_root = plat->debugfs;
689 struct dentry *node;
690
691 g->debugfs_kmem = debugfs_create_dir("kmem_tracking", gpu_root);
692 if (IS_ERR_OR_NULL(g->debugfs_kmem))
693 return;
694
695 node = debugfs_create_file(g->vmallocs->name, S_IRUGO,
696 g->debugfs_kmem,
697 g->vmallocs, &__kmem_tracking_fops);
698 node = debugfs_create_file(g->kmallocs->name, S_IRUGO,
699 g->debugfs_kmem,
700 g->kmallocs, &__kmem_tracking_fops);
701 node = debugfs_create_file("traces", S_IRUGO,
702 g->debugfs_kmem,
703 g, &__kmem_traces_fops);
704}
705#else
706void nvgpu_kmem_debugfs_init(struct device *dev)
707{
708}
709#endif
710
711static int __do_check_for_outstanding_allocs( 410static int __do_check_for_outstanding_allocs(
712 struct gk20a *g, 411 struct gk20a *g,
713 struct nvgpu_mem_alloc_tracker *tracker, 412 struct nvgpu_mem_alloc_tracker *tracker,
diff --git a/drivers/gpu/nvgpu/common/linux/kmem_priv.h b/drivers/gpu/nvgpu/common/linux/kmem_priv.h
index d3abb378..a41762af 100644
--- a/drivers/gpu/nvgpu/common/linux/kmem_priv.h
+++ b/drivers/gpu/nvgpu/common/linux/kmem_priv.h
@@ -20,6 +20,8 @@
20#include <nvgpu/rbtree.h> 20#include <nvgpu/rbtree.h>
21#include <nvgpu/lock.h> 21#include <nvgpu/lock.h>
22 22
23struct seq_file;
24
23#define __pstat(s, fmt, msg...) \ 25#define __pstat(s, fmt, msg...) \
24 do { \ 26 do { \
25 if (s) \ 27 if (s) \
@@ -92,6 +94,12 @@ struct nvgpu_mem_alloc_tracker {
92 unsigned long max_alloc; 94 unsigned long max_alloc;
93}; 95};
94 96
97void nvgpu_lock_tracker(struct nvgpu_mem_alloc_tracker *tracker);
98void nvgpu_unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker);
99
100void kmem_print_mem_alloc(struct gk20a *g,
101 struct nvgpu_mem_alloc *alloc,
102 struct seq_file *s);
95#endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */ 103#endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */
96 104
97#endif /* __KMEM_PRIV_H__ */ 105#endif /* __KMEM_PRIV_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index d5fc40de..4f7fc3fa 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -29,6 +29,7 @@
29#include <nvgpu/nvgpu_common.h> 29#include <nvgpu/nvgpu_common.h>
30#include <nvgpu/soc.h> 30#include <nvgpu/soc.h>
31#include <nvgpu/enabled.h> 31#include <nvgpu/enabled.h>
32#include <nvgpu/debug.h>
32 33
33#include "gk20a/gk20a.h" 34#include "gk20a/gk20a.h"
34#include "gk20a/platform_gk20a.h" 35#include "gk20a/platform_gk20a.h"
@@ -970,10 +971,7 @@ static int __exit gk20a_remove(struct platform_device *pdev)
970 971
971 gk20a_user_deinit(dev, &nvgpu_class); 972 gk20a_user_deinit(dev, &nvgpu_class);
972 973
973#ifdef CONFIG_DEBUG_FS 974 gk20a_debug_deinit(g);
974 debugfs_remove_recursive(platform->debugfs);
975 debugfs_remove_recursive(platform->debugfs_alias);
976#endif
977 975
978 gk20a_remove_sysfs(dev); 976 gk20a_remove_sysfs(dev);
979 977
diff --git a/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c b/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c
index 40ee199a..eae0475a 100644
--- a/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c
@@ -411,7 +411,9 @@ int nvgpu_bitmap_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
411 wmb(); 411 wmb();
412 a->inited = true; 412 a->inited = true;
413 413
414#ifdef CONFIG_DEBUG_FS
414 nvgpu_init_alloc_debug(g, __a); 415 nvgpu_init_alloc_debug(g, __a);
416#endif
415 alloc_dbg(__a, "New allocator: type bitmap\n"); 417 alloc_dbg(__a, "New allocator: type bitmap\n");
416 alloc_dbg(__a, " base 0x%llx\n", a->base); 418 alloc_dbg(__a, " base 0x%llx\n", a->base);
417 alloc_dbg(__a, " bit_offs 0x%llx\n", a->bit_offs); 419 alloc_dbg(__a, " bit_offs 0x%llx\n", a->bit_offs);
diff --git a/drivers/gpu/nvgpu/common/mm/buddy_allocator.c b/drivers/gpu/nvgpu/common/mm/buddy_allocator.c
index 34bc51df..0ef94c10 100644
--- a/drivers/gpu/nvgpu/common/mm/buddy_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/buddy_allocator.c
@@ -251,7 +251,9 @@ static void nvgpu_buddy_allocator_destroy(struct nvgpu_allocator *__a)
251 251
252 alloc_lock(__a); 252 alloc_lock(__a);
253 253
254#ifdef CONFIG_DEBUG_FS
254 nvgpu_fini_alloc_debug(__a); 255 nvgpu_fini_alloc_debug(__a);
256#endif
255 257
256 /* 258 /*
257 * Free the fixed allocs first. 259 * Free the fixed allocs first.
@@ -1290,7 +1292,9 @@ int __nvgpu_buddy_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
1290 wmb(); 1292 wmb();
1291 a->initialized = 1; 1293 a->initialized = 1;
1292 1294
1295#ifdef CONFIG_DEBUG_FS
1293 nvgpu_init_alloc_debug(g, __a); 1296 nvgpu_init_alloc_debug(g, __a);
1297#endif
1294 alloc_dbg(__a, "New allocator: type buddy\n"); 1298 alloc_dbg(__a, "New allocator: type buddy\n");
1295 alloc_dbg(__a, " base 0x%llx\n", a->base); 1299 alloc_dbg(__a, " base 0x%llx\n", a->base);
1296 alloc_dbg(__a, " size 0x%llx\n", a->length); 1300 alloc_dbg(__a, " size 0x%llx\n", a->length);
diff --git a/drivers/gpu/nvgpu/common/mm/lockless_allocator.c b/drivers/gpu/nvgpu/common/mm/lockless_allocator.c
index 234ae4a3..944b4b0f 100644
--- a/drivers/gpu/nvgpu/common/mm/lockless_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/lockless_allocator.c
@@ -99,7 +99,9 @@ static void nvgpu_lockless_alloc_destroy(struct nvgpu_allocator *a)
99{ 99{
100 struct nvgpu_lockless_allocator *pa = a->priv; 100 struct nvgpu_lockless_allocator *pa = a->priv;
101 101
102#ifdef CONFIG_DEBUG_FS
102 nvgpu_fini_alloc_debug(a); 103 nvgpu_fini_alloc_debug(a);
104#endif
103 105
104 nvgpu_vfree(a->g, pa->next); 106 nvgpu_vfree(a->g, pa->next);
105 nvgpu_kfree(nvgpu_alloc_to_gpu(a), pa); 107 nvgpu_kfree(nvgpu_alloc_to_gpu(a), pa);
@@ -191,7 +193,9 @@ int nvgpu_lockless_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
191 wmb(); 193 wmb();
192 a->inited = true; 194 a->inited = true;
193 195
196#ifdef CONFIG_DEBUG_FS
194 nvgpu_init_alloc_debug(g, __a); 197 nvgpu_init_alloc_debug(g, __a);
198#endif
195 alloc_dbg(__a, "New allocator: type lockless\n"); 199 alloc_dbg(__a, "New allocator: type lockless\n");
196 alloc_dbg(__a, " base 0x%llx\n", a->base); 200 alloc_dbg(__a, " base 0x%llx\n", a->base);
197 alloc_dbg(__a, " nodes %d\n", a->nr_nodes); 201 alloc_dbg(__a, " nodes %d\n", a->nr_nodes);
diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c b/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c
index 211b353b..1646d2b1 100644
--- a/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c
@@ -20,11 +20,6 @@
20 20
21#include "gk20a/gk20a.h" 21#include "gk20a/gk20a.h"
22#include "gk20a/mm_gk20a.h" 22#include "gk20a/mm_gk20a.h"
23#ifdef CONFIG_DEBUG_FS
24#include "gk20a/platform_gk20a.h"
25#endif
26
27u32 nvgpu_alloc_tracing_on;
28 23
29u64 nvgpu_alloc_length(struct nvgpu_allocator *a) 24u64 nvgpu_alloc_length(struct nvgpu_allocator *a)
30{ 25{
@@ -151,68 +146,3 @@ int __nvgpu_alloc_common_init(struct nvgpu_allocator *a, struct gk20a *g,
151 146
152 return 0; 147 return 0;
153} 148}
154
155#ifdef CONFIG_DEBUG_FS
156void nvgpu_alloc_print_stats(struct nvgpu_allocator *__a,
157 struct seq_file *s, int lock)
158{
159 __a->ops->print_stats(__a, s, lock);
160}
161
162static int __alloc_show(struct seq_file *s, void *unused)
163{
164 struct nvgpu_allocator *a = s->private;
165
166 nvgpu_alloc_print_stats(a, s, 1);
167
168 return 0;
169}
170
171static int __alloc_open(struct inode *inode, struct file *file)
172{
173 return single_open(file, __alloc_show, inode->i_private);
174}
175
176static const struct file_operations __alloc_fops = {
177 .open = __alloc_open,
178 .read = seq_read,
179 .llseek = seq_lseek,
180 .release = single_release,
181};
182#endif
183
184void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a)
185{
186#ifdef CONFIG_DEBUG_FS
187 if (!g->debugfs_allocators)
188 return;
189
190 a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO,
191 g->debugfs_allocators,
192 a, &__alloc_fops);
193#endif
194}
195
196void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a)
197{
198#ifdef CONFIG_DEBUG_FS
199 if (!IS_ERR_OR_NULL(a->debugfs_entry))
200 debugfs_remove(a->debugfs_entry);
201#endif
202}
203
204#ifdef CONFIG_DEBUG_FS
205void nvgpu_alloc_debugfs_init(struct device *dev)
206{
207 struct gk20a_platform *platform = dev_get_drvdata(dev);
208 struct dentry *gpu_root = platform->debugfs;
209 struct gk20a *g = get_gk20a(dev);
210
211 g->debugfs_allocators = debugfs_create_dir("allocators", gpu_root);
212 if (IS_ERR_OR_NULL(g->debugfs_allocators))
213 return;
214
215 debugfs_create_u32("tracing", 0664, g->debugfs_allocators,
216 &nvgpu_alloc_tracing_on);
217}
218#endif
diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c
index 14b5da3c..3f4f3706 100644
--- a/drivers/gpu/nvgpu/common/mm/page_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c
@@ -916,7 +916,9 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
916 if (err) 916 if (err)
917 goto fail; 917 goto fail;
918 918
919#ifdef CONFIG_DEBUG_FS
919 nvgpu_init_alloc_debug(g, __a); 920 nvgpu_init_alloc_debug(g, __a);
921#endif
920 palloc_dbg(a, "New allocator: type page\n"); 922 palloc_dbg(a, "New allocator: type page\n");
921 palloc_dbg(a, " base 0x%llx\n", a->base); 923 palloc_dbg(a, " base 0x%llx\n", a->base);
922 palloc_dbg(a, " size 0x%llx\n", a->length); 924 palloc_dbg(a, " size 0x%llx\n", a->length);
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index a0160274..084f1793 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -18,9 +18,6 @@
18 18
19#include <linux/dma-mapping.h> 19#include <linux/dma-mapping.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#ifdef CONFIG_DEBUG_FS
22#include <linux/debugfs.h>
23#endif
24#include <linux/dma-buf.h> 21#include <linux/dma-buf.h>
25 22
26#include <trace/events/gk20a.h> 23#include <trace/events/gk20a.h>
@@ -40,8 +37,6 @@
40#include "cde_gk20a.h" 37#include "cde_gk20a.h"
41#include "fence_gk20a.h" 38#include "fence_gk20a.h"
42#include "gr_gk20a.h" 39#include "gr_gk20a.h"
43#include "debug_gk20a.h"
44#include "platform_gk20a.h"
45 40
46#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h> 41#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
47#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h> 42#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
@@ -1585,8 +1580,7 @@ int gk20a_prepare_compressible_read(
1585 if (IS_ERR(dmabuf)) 1580 if (IS_ERR(dmabuf))
1586 return -EINVAL; 1581 return -EINVAL;
1587 1582
1588 err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), 1583 err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
1589 offset, &state);
1590 if (err) { 1584 if (err) {
1591 dma_buf_put(dmabuf); 1585 dma_buf_put(dmabuf);
1592 return err; 1586 return err;
@@ -1650,7 +1644,7 @@ int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
1650 return -EINVAL; 1644 return -EINVAL;
1651 } 1645 }
1652 1646
1653 err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state); 1647 err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
1654 if (err) { 1648 if (err) {
1655 nvgpu_err(g, "could not get state from dmabuf"); 1649 nvgpu_err(g, "could not get state from dmabuf");
1656 dma_buf_put(dmabuf); 1650 dma_buf_put(dmabuf);
@@ -1671,38 +1665,3 @@ int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
1671 dma_buf_put(dmabuf); 1665 dma_buf_put(dmabuf);
1672 return 0; 1666 return 0;
1673} 1667}
1674
1675#ifdef CONFIG_DEBUG_FS
1676static ssize_t gk20a_cde_reload_write(struct file *file,
1677 const char __user *userbuf, size_t count, loff_t *ppos)
1678{
1679 struct gk20a *g = file->private_data;
1680 gk20a_cde_reload(g);
1681 return count;
1682}
1683
1684static const struct file_operations gk20a_cde_reload_fops = {
1685 .open = simple_open,
1686 .write = gk20a_cde_reload_write,
1687};
1688
1689void gk20a_cde_debugfs_init(struct device *dev)
1690{
1691 struct gk20a_platform *platform = dev_get_drvdata(dev);
1692 struct gk20a *g = get_gk20a(dev);
1693
1694 if (!platform->has_cde)
1695 return;
1696
1697 debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO,
1698 platform->debugfs, &g->cde_app.shader_parameter);
1699 debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO,
1700 platform->debugfs, &g->cde_app.ctx_count);
1701 debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO,
1702 platform->debugfs, &g->cde_app.ctx_usecount);
1703 debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO,
1704 platform->debugfs, &g->cde_app.ctx_count_top);
1705 debugfs_create_file("reload_cde_firmware", S_IWUSR, platform->debugfs,
1706 g, &gk20a_cde_reload_fops);
1707}
1708#endif
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index ffd55b4d..4f400bf3 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -295,7 +295,6 @@ int gk20a_cde_convert(struct gk20a *g,
295 struct nvgpu_fence *fence, 295 struct nvgpu_fence *fence,
296 u32 __flags, struct gk20a_cde_param *params, 296 u32 __flags, struct gk20a_cde_param *params,
297 int num_params, struct gk20a_fence **fence_out); 297 int num_params, struct gk20a_fence **fence_out);
298void gk20a_cde_debugfs_init(struct device *dev);
299 298
300int gk20a_prepare_compressible_read( 299int gk20a_prepare_compressible_read(
301 struct gk20a *g, u32 buffer_fd, u32 request, u64 offset, 300 struct gk20a *g, u32 buffer_fd, u32 request, u64 offset,
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 1ed90b14..c905bedb 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -13,15 +13,10 @@
13 * more details. 13 * more details.
14 */ 14 */
15 15
16#ifdef CONFIG_DEBUG_FS
17#include <linux/debugfs.h>
18#endif
19
20#include <nvgpu/kmem.h> 16#include <nvgpu/kmem.h>
21#include <nvgpu/dma.h> 17#include <nvgpu/dma.h>
22 18
23#include "gk20a.h" 19#include "gk20a.h"
24#include "debug_gk20a.h"
25 20
26#include <nvgpu/log.h> 21#include <nvgpu/log.h>
27 22
@@ -33,10 +28,6 @@
33#include <nvgpu/hw/gk20a/hw_mc_gk20a.h> 28#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
34#include <nvgpu/hw/gk20a/hw_gr_gk20a.h> 29#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
35 30
36#ifdef CONFIG_DEBUG_FS
37#include "platform_gk20a.h"
38#endif
39
40static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr) 31static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr)
41{ 32{
42 gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n"); 33 gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n");
@@ -728,18 +719,3 @@ void gk20a_ce_delete_context_priv(struct gk20a *g,
728 return; 719 return;
729} 720}
730EXPORT_SYMBOL(gk20a_ce_delete_context); 721EXPORT_SYMBOL(gk20a_ce_delete_context);
731
732#ifdef CONFIG_DEBUG_FS
733void gk20a_ce_debugfs_init(struct device *dev)
734{
735 struct gk20a_platform *platform = dev_get_drvdata(dev);
736 struct gk20a *g = get_gk20a(dev);
737
738 debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
739 platform->debugfs, &g->ce_app.ctx_count);
740 debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
741 platform->debugfs, &g->ce_app.app_state);
742 debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
743 platform->debugfs, &g->ce_app.next_ctx_id);
744}
745#endif
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index dfd19019..f972e175 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -156,10 +156,4 @@ void gk20a_ce_delete_context_priv(struct gk20a *g,
156void gk20a_ce_delete_context(struct gk20a *g, 156void gk20a_ce_delete_context(struct gk20a *g,
157 u32 ce_ctx_id); 157 u32 ce_ctx_id);
158 158
159
160#ifdef CONFIG_DEBUG_FS
161/* CE app debugfs api */
162void gk20a_ce_debugfs_init(struct device *dev);
163#endif
164
165#endif /*__CE2_GK20A_H__*/ 159#endif /*__CE2_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 571570d8..13abed95 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -30,9 +30,9 @@
30#include <nvgpu/circ_buf.h> 30#include <nvgpu/circ_buf.h>
31#include <nvgpu/cond.h> 31#include <nvgpu/cond.h>
32#include <nvgpu/enabled.h> 32#include <nvgpu/enabled.h>
33#include <nvgpu/debug.h>
33 34
34#include "gk20a.h" 35#include "gk20a.h"
35#include "debug_gk20a.h"
36#include "ctxsw_trace_gk20a.h" 36#include "ctxsw_trace_gk20a.h"
37#include "dbg_gpu_gk20a.h" 37#include "dbg_gpu_gk20a.h"
38#include "fence_gk20a.h" 38#include "fence_gk20a.h"
@@ -1403,6 +1403,7 @@ static u32 get_gp_free_count(struct channel_gk20a *c)
1403 return gp_free_count(c); 1403 return gp_free_count(c);
1404} 1404}
1405 1405
1406#ifdef CONFIG_DEBUG_FS
1406static void trace_write_pushbuffer(struct channel_gk20a *c, 1407static void trace_write_pushbuffer(struct channel_gk20a *c,
1407 struct nvgpu_gpfifo *g) 1408 struct nvgpu_gpfifo *g)
1408{ 1409{
@@ -1439,6 +1440,7 @@ static void trace_write_pushbuffer(struct channel_gk20a *c,
1439 dma_buf_vunmap(dmabuf, mem); 1440 dma_buf_vunmap(dmabuf, mem);
1440 } 1441 }
1441} 1442}
1443#endif
1442 1444
1443static void trace_write_pushbuffer_range(struct channel_gk20a *c, 1445static void trace_write_pushbuffer_range(struct channel_gk20a *c,
1444 struct nvgpu_gpfifo *g, 1446 struct nvgpu_gpfifo *g,
@@ -1446,6 +1448,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
1446 int offset, 1448 int offset,
1447 int count) 1449 int count)
1448{ 1450{
1451#ifdef CONFIG_DEBUG_FS
1449 u32 size; 1452 u32 size;
1450 int i; 1453 int i;
1451 struct nvgpu_gpfifo *gp; 1454 struct nvgpu_gpfifo *gp;
@@ -1478,6 +1481,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
1478 1481
1479 if (gpfifo_allocated) 1482 if (gpfifo_allocated)
1480 nvgpu_big_free(c->g, g); 1483 nvgpu_big_free(c->g, g);
1484#endif
1481} 1485}
1482 1486
1483static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) 1487static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
@@ -1629,8 +1633,8 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
1629 nvgpu_err(g, "Job on channel %d timed out", 1633 nvgpu_err(g, "Job on channel %d timed out",
1630 ch->hw_chid); 1634 ch->hw_chid);
1631 1635
1632 gk20a_debug_dump(g->dev); 1636 gk20a_debug_dump(g);
1633 gk20a_gr_debug_dump(g->dev); 1637 gk20a_gr_debug_dump(g);
1634 1638
1635 g->ops.fifo.force_reset_ch(ch, 1639 g->ops.fifo.force_reset_ch(ch,
1636 NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT, true); 1640 NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT, true);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index ac3a3d57..46560a56 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -29,12 +29,11 @@
29#include <nvgpu/log.h> 29#include <nvgpu/log.h>
30#include <nvgpu/soc.h> 30#include <nvgpu/soc.h>
31#include <nvgpu/atomic.h> 31#include <nvgpu/atomic.h>
32#include <nvgpu/sort.h>
33#include <nvgpu/bug.h> 32#include <nvgpu/bug.h>
34#include <nvgpu/log2.h> 33#include <nvgpu/log2.h>
34#include <nvgpu/debug.h>
35 35
36#include "gk20a.h" 36#include "gk20a.h"
37#include "debug_gk20a.h"
38#include "ctxsw_trace_gk20a.h" 37#include "ctxsw_trace_gk20a.h"
39#include "mm_gk20a.h" 38#include "mm_gk20a.h"
40 39
@@ -46,10 +45,6 @@
46#include <nvgpu/hw/gk20a/hw_mc_gk20a.h> 45#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
47#include <nvgpu/hw/gk20a/hw_gr_gk20a.h> 46#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
48 47
49#ifdef CONFIG_DEBUG_FS
50#include "platform_gk20a.h"
51#endif
52
53#define FECS_METHOD_WFI_RESTORE 0x80000 48#define FECS_METHOD_WFI_RESTORE 0x80000
54 49
55static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, 50static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
@@ -57,10 +52,6 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
57 bool wait_for_finish); 52 bool wait_for_finish);
58static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg); 53static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
59 54
60#ifdef CONFIG_DEBUG_FS
61static void __gk20a_fifo_profile_free(struct kref *ref);
62#endif
63
64u32 gk20a_fifo_get_engine_ids(struct gk20a *g, 55u32 gk20a_fifo_get_engine_ids(struct gk20a *g,
65 u32 engine_id[], u32 engine_id_sz, 56 u32 engine_id[], u32 engine_id_sz,
66 u32 engine_enum) 57 u32 engine_enum)
@@ -562,14 +553,6 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
562 f->engine_info = NULL; 553 f->engine_info = NULL;
563 nvgpu_kfree(g, f->active_engines_list); 554 nvgpu_kfree(g, f->active_engines_list);
564 f->active_engines_list = NULL; 555 f->active_engines_list = NULL;
565#ifdef CONFIG_DEBUG_FS
566 nvgpu_mutex_acquire(&f->profile.lock);
567 if (f->profile.enabled) {
568 f->profile.enabled = false;
569 kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
570 }
571 nvgpu_mutex_release(&f->profile.lock);
572#endif
573} 556}
574 557
575/* reads info from hardware and fills in pbmda exception info record */ 558/* reads info from hardware and fills in pbmda exception info record */
@@ -1543,7 +1526,7 @@ static bool gk20a_fifo_handle_mmu_fault(
1543 } else { 1526 } else {
1544 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r()); 1527 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
1545 fake_fault = false; 1528 fake_fault = false;
1546 gk20a_debug_dump(g->dev); 1529 gk20a_debug_dump(g);
1547 } 1530 }
1548 1531
1549 1532
@@ -1833,7 +1816,7 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose)
1833 gk20a_channel_abort(ch, false); 1816 gk20a_channel_abort(ch, false);
1834 1817
1835 if (gk20a_fifo_error_ch(g, ch)) 1818 if (gk20a_fifo_error_ch(g, ch))
1836 gk20a_debug_dump(g->dev); 1819 gk20a_debug_dump(g);
1837 1820
1838 gk20a_channel_put(ch); 1821 gk20a_channel_put(ch);
1839 } 1822 }
@@ -1860,7 +1843,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
1860 struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; 1843 struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
1861 1844
1862 if (gk20a_fifo_error_tsg(g, tsg)) 1845 if (gk20a_fifo_error_tsg(g, tsg))
1863 gk20a_debug_dump(g->dev); 1846 gk20a_debug_dump(g);
1864 1847
1865 gk20a_fifo_abort_tsg(g, tsgid, false); 1848 gk20a_fifo_abort_tsg(g, tsgid, false);
1866 } 1849 }
@@ -1957,7 +1940,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
1957 unsigned int id_type; 1940 unsigned int id_type;
1958 1941
1959 if (verbose) 1942 if (verbose)
1960 gk20a_debug_dump(g->dev); 1943 gk20a_debug_dump(g);
1961 1944
1962 if (g->ops.ltc.flush) 1945 if (g->ops.ltc.flush)
1963 g->ops.ltc.flush(g); 1946 g->ops.ltc.flush(g);
@@ -3441,345 +3424,6 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
3441 return NULL; 3424 return NULL;
3442} 3425}
3443 3426
3444#ifdef CONFIG_DEBUG_FS
3445
3446/* Get the next element in the ring buffer of profile entries
3447 * and grab a reference to the structure
3448 */
3449struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g)
3450{
3451 struct fifo_gk20a *f = &g->fifo;
3452 struct fifo_profile_gk20a *profile;
3453 unsigned int index;
3454
3455 /* If kref is zero, profiling is not enabled */
3456 if (!kref_get_unless_zero(&f->profile.ref))
3457 return NULL;
3458 index = atomic_inc_return(&f->profile.get);
3459 profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
3460
3461 return profile;
3462}
3463
3464/* Free the reference to the structure. This allows deferred cleanups */
3465void gk20a_fifo_profile_release(struct gk20a *g,
3466 struct fifo_profile_gk20a *profile)
3467{
3468 kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
3469}
3470
3471static void *gk20a_fifo_sched_debugfs_seq_start(
3472 struct seq_file *s, loff_t *pos)
3473{
3474 struct gk20a *g = s->private;
3475 struct fifo_gk20a *f = &g->fifo;
3476
3477 if (*pos >= f->num_channels)
3478 return NULL;
3479
3480 return &f->channel[*pos];
3481}
3482
3483static void *gk20a_fifo_sched_debugfs_seq_next(
3484 struct seq_file *s, void *v, loff_t *pos)
3485{
3486 struct gk20a *g = s->private;
3487 struct fifo_gk20a *f = &g->fifo;
3488
3489 ++(*pos);
3490 if (*pos >= f->num_channels)
3491 return NULL;
3492
3493 return &f->channel[*pos];
3494}
3495
3496static void gk20a_fifo_sched_debugfs_seq_stop(
3497 struct seq_file *s, void *v)
3498{
3499}
3500
3501static int gk20a_fifo_sched_debugfs_seq_show(
3502 struct seq_file *s, void *v)
3503{
3504 struct gk20a *g = s->private;
3505 struct fifo_gk20a *f = &g->fifo;
3506 struct channel_gk20a *ch = v;
3507 struct tsg_gk20a *tsg = NULL;
3508
3509 struct fifo_engine_info_gk20a *engine_info;
3510 struct fifo_runlist_info_gk20a *runlist;
3511 u32 runlist_id;
3512 int ret = SEQ_SKIP;
3513 u32 engine_id;
3514
3515 engine_id = gk20a_fifo_get_gr_engine_id(g);
3516 engine_info = (f->engine_info + engine_id);
3517 runlist_id = engine_info->runlist_id;
3518 runlist = &f->runlist_info[runlist_id];
3519
3520 if (ch == f->channel) {
3521 seq_puts(s, "chid tsgid pid timeslice timeout interleave graphics_preempt compute_preempt\n");
3522 seq_puts(s, " (usecs) (msecs)\n");
3523 ret = 0;
3524 }
3525
3526 if (!test_bit(ch->hw_chid, runlist->active_channels))
3527 return ret;
3528
3529 if (gk20a_channel_get(ch)) {
3530 if (gk20a_is_channel_marked_as_tsg(ch))
3531 tsg = &f->tsg[ch->tsgid];
3532
3533 seq_printf(s, "%-8d %-8d %-8d %-9d %-8d %-10d %-8d %-8d\n",
3534 ch->hw_chid,
3535 ch->tsgid,
3536 ch->tgid,
3537 tsg ? tsg->timeslice_us : ch->timeslice_us,
3538 ch->timeout_ms_max,
3539 tsg ? tsg->interleave_level : ch->interleave_level,
3540 ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->graphics_preempt_mode : U32_MAX,
3541 ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->compute_preempt_mode : U32_MAX);
3542 gk20a_channel_put(ch);
3543 }
3544 return 0;
3545}
3546
3547static const struct seq_operations gk20a_fifo_sched_debugfs_seq_ops = {
3548 .start = gk20a_fifo_sched_debugfs_seq_start,
3549 .next = gk20a_fifo_sched_debugfs_seq_next,
3550 .stop = gk20a_fifo_sched_debugfs_seq_stop,
3551 .show = gk20a_fifo_sched_debugfs_seq_show
3552};
3553
3554static int gk20a_fifo_sched_debugfs_open(struct inode *inode,
3555 struct file *file)
3556{
3557 int err;
3558
3559 if (!capable(CAP_SYS_ADMIN))
3560 return -EPERM;
3561
3562 err = seq_open(file, &gk20a_fifo_sched_debugfs_seq_ops);
3563 if (err)
3564 return err;
3565
3566 gk20a_dbg(gpu_dbg_info, "i_private=%p", inode->i_private);
3567
3568 ((struct seq_file *)file->private_data)->private = inode->i_private;
3569 return 0;
3570};
3571
3572/*
3573 * The file operations structure contains our open function along with
3574 * set of the canned seq_ ops.
3575 */
3576static const struct file_operations gk20a_fifo_sched_debugfs_fops = {
3577 .owner = THIS_MODULE,
3578 .open = gk20a_fifo_sched_debugfs_open,
3579 .read = seq_read,
3580 .llseek = seq_lseek,
3581 .release = seq_release
3582};
3583
3584static void __gk20a_fifo_profile_free(struct kref *ref)
3585{
3586 struct fifo_gk20a *f = container_of(ref, struct fifo_gk20a,
3587 profile.ref);
3588 nvgpu_vfree(f->g, f->profile.data);
3589 nvgpu_vfree(f->g, f->profile.sorted);
3590}
3591
3592static int gk20a_fifo_profile_enable(void *data, u64 val)
3593{
3594 struct gk20a *g = (struct gk20a *) data;
3595 struct fifo_gk20a *f = &g->fifo;
3596
3597
3598 nvgpu_mutex_acquire(&f->profile.lock);
3599 if (val == 0) {
3600 if (f->profile.enabled) {
3601 f->profile.enabled = false;
3602 kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
3603 }
3604 } else {
3605 if (!f->profile.enabled) {
3606 /* not kref init as it can have a running condition if
3607 * we enable/disable/enable while kickoff is happening
3608 */
3609 if (!kref_get_unless_zero(&f->profile.ref)) {
3610 f->profile.data = vzalloc(
3611 FIFO_PROFILING_ENTRIES *
3612 sizeof(struct fifo_profile_gk20a));
3613 f->profile.sorted = vzalloc(
3614 FIFO_PROFILING_ENTRIES *
3615 sizeof(u64));
3616 if (!(f->profile.data && f->profile.sorted)) {
3617 nvgpu_vfree(g, f->profile.data);
3618 nvgpu_vfree(g, f->profile.sorted);
3619 nvgpu_mutex_release(&f->profile.lock);
3620 return -ENOMEM;
3621 }
3622 kref_init(&f->profile.ref);
3623 }
3624 atomic_set(&f->profile.get, 0);
3625 f->profile.enabled = true;
3626 }
3627 }
3628 nvgpu_mutex_release(&f->profile.lock);
3629
3630 return 0;
3631}
3632
3633DEFINE_SIMPLE_ATTRIBUTE(
3634 gk20a_fifo_profile_enable_debugfs_fops,
3635 NULL,
3636 gk20a_fifo_profile_enable,
3637 "%llu\n"
3638);
3639
3640static int __profile_cmp(const void *a, const void *b)
3641{
3642 return *((unsigned long long *) a) - *((unsigned long long *) b);
3643}
3644
3645/*
3646 * This uses about 800b in the stack, but the function using it is not part
3647 * of a callstack where much memory is being used, so it is fine
3648 */
3649#define PERCENTILE_WIDTH 5
3650#define PERCENTILE_RANGES (100/PERCENTILE_WIDTH)
3651
3652static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
3653 u64 *percentiles, u32 index_end, u32 index_start)
3654{
3655 unsigned int nelem = 0;
3656 unsigned int index;
3657 struct fifo_profile_gk20a *profile;
3658
3659 for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
3660 profile = &g->fifo.profile.data[index];
3661
3662 if (profile->timestamp[index_end] >
3663 profile->timestamp[index_start]) {
3664 /* This is a valid element */
3665 g->fifo.profile.sorted[nelem] =
3666 profile->timestamp[index_end] -
3667 profile->timestamp[index_start];
3668 nelem++;
3669 }
3670 }
3671
3672 /* sort it */
3673 sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
3674 __profile_cmp, NULL);
3675
3676 /* build ranges */
3677 for (index = 0; index < PERCENTILE_RANGES; index++)
3678 percentiles[index] =
3679 g->fifo.profile.sorted[(PERCENTILE_WIDTH * (index + 1) *
3680 nelem)/100 - 1];
3681 return nelem;
3682}
3683
3684static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
3685{
3686 struct gk20a *g = s->private;
3687 unsigned int get, nelem, index;
3688 /*
3689 * 800B in the stack, but function is declared statically and only
3690 * called from debugfs handler
3691 */
3692 u64 percentiles_ioctl[PERCENTILE_RANGES];
3693 u64 percentiles_kickoff[PERCENTILE_RANGES];
3694 u64 percentiles_jobtracking[PERCENTILE_RANGES];
3695 u64 percentiles_append[PERCENTILE_RANGES];
3696 u64 percentiles_userd[PERCENTILE_RANGES];
3697
3698 if (!kref_get_unless_zero(&g->fifo.profile.ref)) {
3699 seq_printf(s, "Profiling disabled\n");
3700 return 0;
3701 }
3702
3703 get = atomic_read(&g->fifo.profile.get);
3704
3705 __gk20a_fifo_create_stats(g, percentiles_ioctl,
3706 PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
3707 __gk20a_fifo_create_stats(g, percentiles_kickoff,
3708 PROFILE_END, PROFILE_ENTRY);
3709 __gk20a_fifo_create_stats(g, percentiles_jobtracking,
3710 PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
3711 __gk20a_fifo_create_stats(g, percentiles_append,
3712 PROFILE_APPEND, PROFILE_JOB_TRACKING);
3713 nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
3714 PROFILE_END, PROFILE_APPEND);
3715
3716 seq_printf(s, "Number of kickoffs: %d\n", nelem);
3717 seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
3718
3719 for (index = 0; index < PERCENTILE_RANGES; index++)
3720 seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
3721 PERCENTILE_WIDTH * (index+1),
3722 percentiles_ioctl[index],
3723 percentiles_kickoff[index],
3724 percentiles_append[index],
3725 percentiles_jobtracking[index],
3726 percentiles_userd[index]);
3727
3728 kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
3729
3730 return 0;
3731}
3732
3733static int gk20a_fifo_profile_stats_open(struct inode *inode, struct file *file)
3734{
3735 return single_open(file, gk20a_fifo_profile_stats, inode->i_private);
3736}
3737
3738static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
3739 .open = gk20a_fifo_profile_stats_open,
3740 .read = seq_read,
3741 .llseek = seq_lseek,
3742 .release = single_release,
3743};
3744
3745
3746void gk20a_fifo_debugfs_init(struct device *dev)
3747{
3748 struct gk20a_platform *platform = dev_get_drvdata(dev);
3749 struct gk20a *g = get_gk20a(dev);
3750
3751 struct dentry *gpu_root = platform->debugfs;
3752 struct dentry *fifo_root;
3753 struct dentry *profile_root;
3754
3755
3756 fifo_root = debugfs_create_dir("fifo", gpu_root);
3757 if (IS_ERR_OR_NULL(fifo_root))
3758 return;
3759
3760 gk20a_dbg(gpu_dbg_info, "g=%p", g);
3761
3762 debugfs_create_file("sched", 0600, fifo_root, g,
3763 &gk20a_fifo_sched_debugfs_fops);
3764
3765 profile_root = debugfs_create_dir("profile", fifo_root);
3766 if (IS_ERR_OR_NULL(profile_root))
3767 return;
3768
3769 nvgpu_mutex_init(&g->fifo.profile.lock);
3770 g->fifo.profile.enabled = false;
3771 atomic_set(&g->fifo.profile.get, 0);
3772 atomic_set(&g->fifo.profile.ref.refcount, 0);
3773
3774 debugfs_create_file("enable", 0600, profile_root, g,
3775 &gk20a_fifo_profile_enable_debugfs_fops);
3776
3777 debugfs_create_file("stats", 0600, profile_root, g,
3778 &gk20a_fifo_profile_stats_debugfs_fops);
3779
3780}
3781#endif /* CONFIG_DEBUG_FS */
3782
3783static const char * const ccsr_chan_status_str[] = { 3427static const char * const ccsr_chan_status_str[] = {
3784 "idle", 3428 "idle",
3785 "pending", 3429 "pending",
@@ -3901,6 +3545,54 @@ void gk20a_dump_channel_status_ramfc(struct gk20a *g,
3901 gk20a_debug_output(o, "\n"); 3545 gk20a_debug_output(o, "\n");
3902} 3546}
3903 3547
3548void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
3549 struct gk20a_debug_output *o)
3550{
3551 struct fifo_gk20a *f = &g->fifo;
3552 u32 chid;
3553 struct ch_state **ch_state;
3554
3555 ch_state = nvgpu_kzalloc(g, sizeof(*ch_state) * f->num_channels);
3556 if (!ch_state) {
3557 gk20a_debug_output(o, "cannot alloc memory for channels\n");
3558 return;
3559 }
3560
3561 for (chid = 0; chid < f->num_channels; chid++) {
3562 struct channel_gk20a *ch = &f->channel[chid];
3563 if (gk20a_channel_get(ch)) {
3564 ch_state[chid] =
3565 nvgpu_kmalloc(g, sizeof(struct ch_state) +
3566 ram_in_alloc_size_v());
3567 /* ref taken stays to below loop with
3568 * successful allocs */
3569 if (!ch_state[chid])
3570 gk20a_channel_put(ch);
3571 }
3572 }
3573
3574 for (chid = 0; chid < f->num_channels; chid++) {
3575 struct channel_gk20a *ch = &f->channel[chid];
3576 if (!ch_state[chid])
3577 continue;
3578
3579 ch_state[chid]->pid = ch->pid;
3580 ch_state[chid]->refs = atomic_read(&ch->ref_count);
3581 nvgpu_mem_rd_n(g, &ch->inst_block, 0,
3582 &ch_state[chid]->inst_block[0],
3583 ram_in_alloc_size_v());
3584 gk20a_channel_put(ch);
3585 }
3586 for (chid = 0; chid < f->num_channels; chid++) {
3587 if (ch_state[chid]) {
3588 g->ops.fifo.dump_channel_status_ramfc(g, o, chid,
3589 ch_state[chid]);
3590 nvgpu_kfree(g, ch_state[chid]);
3591 }
3592 }
3593 nvgpu_kfree(g, ch_state);
3594}
3595
3904void gk20a_dump_pbdma_status(struct gk20a *g, 3596void gk20a_dump_pbdma_status(struct gk20a *g,
3905 struct gk20a_debug_output *o) 3597 struct gk20a_debug_output *o)
3906{ 3598{
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 6c8868a2..228e5130 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -23,10 +23,11 @@
23 23
24#include "channel_gk20a.h" 24#include "channel_gk20a.h"
25#include "tsg_gk20a.h" 25#include "tsg_gk20a.h"
26#include "debug_gk20a.h"
27 26
28#include <nvgpu/kref.h> 27#include <nvgpu/kref.h>
29 28
29struct gk20a_debug_output;
30
30#define MAX_RUNLIST_BUFFERS 2 31#define MAX_RUNLIST_BUFFERS 2
31 32
32#define FIFO_INVAL_ENGINE_ID ((u32)~0) 33#define FIFO_INVAL_ENGINE_ID ((u32)~0)
@@ -287,8 +288,6 @@ int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
287int gk20a_fifo_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice); 288int gk20a_fifo_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
288 289
289 290
290void gk20a_fifo_debugfs_init(struct device *dev);
291
292const char *gk20a_fifo_interleave_level_name(u32 interleave_level); 291const char *gk20a_fifo_interleave_level_name(u32 interleave_level);
293 292
294int gk20a_fifo_engine_enum_from_type(struct gk20a *g, u32 engine_type, 293int gk20a_fifo_engine_enum_from_type(struct gk20a *g, u32 engine_type,
@@ -341,6 +340,8 @@ void gk20a_dump_channel_status_ramfc(struct gk20a *g,
341 struct gk20a_debug_output *o, 340 struct gk20a_debug_output *o,
342 u32 hw_chid, 341 u32 hw_chid,
343 struct ch_state *ch_state); 342 struct ch_state *ch_state);
343void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
344 struct gk20a_debug_output *o);
344void gk20a_dump_pbdma_status(struct gk20a *g, 345void gk20a_dump_pbdma_status(struct gk20a *g,
345 struct gk20a_debug_output *o); 346 struct gk20a_debug_output *o);
346void gk20a_dump_eng_status(struct gk20a *g, 347void gk20a_dump_eng_status(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 689fafb1..899c1d6a 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -30,6 +30,7 @@ struct acr_desc;
30struct nvgpu_mem_alloc_tracker; 30struct nvgpu_mem_alloc_tracker;
31struct dbg_profiler_object_data; 31struct dbg_profiler_object_data;
32struct ecc_gk20a; 32struct ecc_gk20a;
33struct gk20a_debug_output;
33 34
34#include <linux/sched.h> 35#include <linux/sched.h>
35#include <nvgpu/lock.h> 36#include <nvgpu/lock.h>
@@ -61,7 +62,6 @@ struct ecc_gk20a;
61#include "therm_gk20a.h" 62#include "therm_gk20a.h"
62#include "gm20b/acr_gm20b.h" 63#include "gm20b/acr_gm20b.h"
63#include "cde_gk20a.h" 64#include "cde_gk20a.h"
64#include "debug_gk20a.h"
65#include "sched_gk20a.h" 65#include "sched_gk20a.h"
66#ifdef CONFIG_ARCH_TEGRA_18x_SOC 66#ifdef CONFIG_ARCH_TEGRA_18x_SOC
67#include "clk/clk.h" 67#include "clk/clk.h"
@@ -1544,10 +1544,6 @@ void nvgpu_wait_for_deferred_interrupts(struct gk20a *g);
1544struct gk20a * __must_check gk20a_get(struct gk20a *g); 1544struct gk20a * __must_check gk20a_get(struct gk20a *g);
1545void gk20a_put(struct gk20a *g); 1545void gk20a_put(struct gk20a *g);
1546 1546
1547#ifdef CONFIG_DEBUG_FS
1548int gk20a_railgating_debugfs_init(struct device *dev);
1549#endif
1550
1551static inline bool gk20a_platform_has_syncpoints(struct gk20a *g) 1547static inline bool gk20a_platform_has_syncpoints(struct gk20a *g)
1552{ 1548{
1553#ifdef CONFIG_TEGRA_GK20A_NVHOST 1549#ifdef CONFIG_TEGRA_GK20A_NVHOST
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 2188618c..982cfac8 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -30,6 +30,7 @@
30#include <nvgpu/bug.h> 30#include <nvgpu/bug.h>
31#include <nvgpu/firmware.h> 31#include <nvgpu/firmware.h>
32#include <nvgpu/enabled.h> 32#include <nvgpu/enabled.h>
33#include <nvgpu/debug.h>
33 34
34#include "gk20a.h" 35#include "gk20a.h"
35#include "kind_gk20a.h" 36#include "kind_gk20a.h"
@@ -37,13 +38,8 @@
37#include "gr_pri_gk20a.h" 38#include "gr_pri_gk20a.h"
38#include "regops_gk20a.h" 39#include "regops_gk20a.h"
39#include "dbg_gpu_gk20a.h" 40#include "dbg_gpu_gk20a.h"
40#include "debug_gk20a.h"
41#include "ctxsw_trace_gk20a.h" 41#include "ctxsw_trace_gk20a.h"
42 42
43#ifdef CONFIG_DEBUG_FS
44#include "platform_gk20a.h"
45#endif
46
47#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h> 43#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
48#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h> 44#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
49#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h> 45#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
@@ -514,7 +510,7 @@ int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
514 nvgpu_err(g, 510 nvgpu_err(g,
515 "timeout waiting on ucode response"); 511 "timeout waiting on ucode response");
516 gk20a_fecs_dump_falcon_stats(g); 512 gk20a_fecs_dump_falcon_stats(g);
517 gk20a_gr_debug_dump(g->dev); 513 gk20a_gr_debug_dump(g);
518 return -1; 514 return -1;
519 } else if (check == WAIT_UCODE_ERROR) { 515 } else if (check == WAIT_UCODE_ERROR) {
520 nvgpu_err(g, 516 nvgpu_err(g,
@@ -9032,20 +9028,6 @@ static int gr_gk20a_dump_gr_status_regs(struct gk20a *g,
9032 return 0; 9028 return 0;
9033} 9029}
9034 9030
9035#ifdef CONFIG_DEBUG_FS
9036int gr_gk20a_debugfs_init(struct gk20a *g)
9037{
9038 struct gk20a_platform *platform = dev_get_drvdata(g->dev);
9039
9040 g->debugfs_gr_default_attrib_cb_size =
9041 debugfs_create_u32("gr_default_attrib_cb_size",
9042 S_IRUGO|S_IWUSR, platform->debugfs,
9043 &g->gr.attrib_cb_default_size);
9044
9045 return 0;
9046}
9047#endif
9048
9049static void gr_gk20a_init_cyclestats(struct gk20a *g) 9031static void gr_gk20a_init_cyclestats(struct gk20a *g)
9050{ 9032{
9051#if defined(CONFIG_GK20A_CYCLE_STATS) 9033#if defined(CONFIG_GK20A_CYCLE_STATS)
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 79aeb42f..deb8ea9c 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -653,7 +653,6 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
653void gr_gk20a_free_gr_ctx(struct gk20a *g, 653void gr_gk20a_free_gr_ctx(struct gk20a *g,
654 struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx); 654 struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx);
655int gr_gk20a_halt_pipe(struct gk20a *g); 655int gr_gk20a_halt_pipe(struct gk20a *g);
656int gr_gk20a_debugfs_init(struct gk20a *g);
657 656
658#if defined(CONFIG_GK20A_CYCLE_STATS) 657#if defined(CONFIG_GK20A_CYCLE_STATS)
659int gr_gk20a_css_attach(struct channel_gk20a *ch, /* in - main hw structure */ 658int gr_gk20a_css_attach(struct channel_gk20a *ch, /* in - main hw structure */
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
index 8a3beb39..b19398a6 100644
--- a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -37,6 +37,7 @@
37#include "pramin_gk20a.h" 37#include "pramin_gk20a.h"
38#include "priv_ring_gk20a.h" 38#include "priv_ring_gk20a.h"
39 39
40#include <nvgpu/debug.h>
40#include <nvgpu/log.h> 41#include <nvgpu/log.h>
41#include <nvgpu/bug.h> 42#include <nvgpu/bug.h>
42 43
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 53d22a7d..08e2e9cc 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -2563,13 +2563,13 @@ priv_exist_or_err:
2563 return 0; 2563 return 0;
2564} 2564}
2565 2565
2566int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct device *dev, 2566int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct gk20a *g,
2567 u64 offset, struct gk20a_buffer_state **state) 2567 u64 offset, struct gk20a_buffer_state **state)
2568{ 2568{
2569 int err = 0; 2569 int err = 0;
2570 struct gk20a_dmabuf_priv *priv; 2570 struct gk20a_dmabuf_priv *priv;
2571 struct gk20a_buffer_state *s; 2571 struct gk20a_buffer_state *s;
2572 struct gk20a *g = get_gk20a(dev); 2572 struct device *dev = g->dev;
2573 2573
2574 if (WARN_ON(offset >= (u64)dmabuf->size)) 2574 if (WARN_ON(offset >= (u64)dmabuf->size))
2575 return -EINVAL; 2575 return -EINVAL;
@@ -3123,18 +3123,6 @@ static bool gk20a_mm_is_bar1_supported(struct gk20a *g)
3123 return true; 3123 return true;
3124} 3124}
3125 3125
3126#ifdef CONFIG_DEBUG_FS
3127void gk20a_mm_debugfs_init(struct device *dev)
3128{
3129 struct gk20a_platform *platform = dev_get_drvdata(dev);
3130 struct dentry *gpu_root = platform->debugfs;
3131 struct gk20a *g = gk20a_get_platform(dev)->g;
3132
3133 debugfs_create_bool("force_pramin", 0664, gpu_root,
3134 &g->mm.force_pramin);
3135}
3136#endif
3137
3138void gk20a_init_mm(struct gpu_ops *gops) 3126void gk20a_init_mm(struct gpu_ops *gops)
3139{ 3127{
3140 gops->mm.gmmu_map = gk20a_locked_gmmu_map; 3128 gops->mm.gmmu_map = gk20a_locked_gmmu_map;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 79b55371..5d90cbf6 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -146,7 +146,6 @@ struct channel_gk20a;
146int gk20a_init_mm_support(struct gk20a *g); 146int gk20a_init_mm_support(struct gk20a *g);
147int gk20a_init_mm_setup_sw(struct gk20a *g); 147int gk20a_init_mm_setup_sw(struct gk20a *g);
148int gk20a_init_mm_setup_hw(struct gk20a *g); 148int gk20a_init_mm_setup_hw(struct gk20a *g);
149void gk20a_mm_debugfs_init(struct device *dev);
150void gk20a_init_mm_ce_context(struct gk20a *g); 149void gk20a_init_mm_ce_context(struct gk20a *g);
151 150
152int gk20a_mm_fb_flush(struct gk20a *g); 151int gk20a_mm_fb_flush(struct gk20a *g);
@@ -437,7 +436,7 @@ dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr);
437 436
438int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev); 437int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
439 438
440int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct device *dev, 439int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct gk20a *g,
441 u64 offset, struct gk20a_buffer_state **state); 440 u64 offset, struct gk20a_buffer_state **state);
442 441
443int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry); 442int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry);
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index a9e03943..552d5d73 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -35,12 +35,6 @@
35#include "nvgpu_gpuid_t19x.h" 35#include "nvgpu_gpuid_t19x.h"
36#endif 36#endif
37 37
38#ifdef CONFIG_DEBUG_FS
39#include <linux/debugfs.h>
40#include <linux/uaccess.h>
41#include "platform_gk20a.h"
42#endif
43
44#define GK20A_PMU_UCODE_IMAGE "gpmu_ucode.bin" 38#define GK20A_PMU_UCODE_IMAGE "gpmu_ucode.bin"
45 39
46#define PMU_MEM_SCRUBBING_TIMEOUT_MAX 1000 40#define PMU_MEM_SCRUBBING_TIMEOUT_MAX 1000
@@ -49,7 +43,7 @@
49#define gk20a_dbg_pmu(fmt, arg...) \ 43#define gk20a_dbg_pmu(fmt, arg...) \
50 gk20a_dbg(gpu_dbg_pmu, fmt, ##arg) 44 gk20a_dbg(gpu_dbg_pmu, fmt, ##arg)
51 45
52static int gk20a_pmu_get_pg_stats(struct gk20a *g, 46int gk20a_pmu_get_pg_stats(struct gk20a *g,
53 u32 pg_engine_id, 47 u32 pg_engine_id,
54 struct pmu_pg_stats_data *pg_stat_data); 48 struct pmu_pg_stats_data *pg_stat_data);
55static void ap_callback_init_and_enable_ctrl( 49static void ap_callback_init_and_enable_ctrl(
@@ -281,7 +275,7 @@ static void set_pmu_cmdline_args_falctracesize_v1(
281 pmu->args_v1.falc_trace_size = size; 275 pmu->args_v1.falc_trace_size = size;
282} 276}
283 277
284static bool find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos) 278bool nvgpu_find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos)
285{ 279{
286 u32 i = 0, j = strlen(strings); 280 u32 i = 0, j = strlen(strings);
287 for (; i < j; i++) { 281 for (; i < j; i++) {
@@ -326,7 +320,7 @@ static void printtrace(struct pmu_gk20a *pmu)
326 count = scnprintf(buf, 0x40, "Index %x: ", trace1[(i / 4)]); 320 count = scnprintf(buf, 0x40, "Index %x: ", trace1[(i / 4)]);
327 l = 0; 321 l = 0;
328 m = 0; 322 m = 0;
329 while (find_hex_in_string((trace+i+20+m), g, &k)) { 323 while (nvgpu_find_hex_in_string((trace+i+20+m), g, &k)) {
330 if (k >= 40) 324 if (k >= 40)
331 break; 325 break;
332 strncpy(part_str, (trace+i+20+m), k); 326 strncpy(part_str, (trace+i+20+m), k);
@@ -4141,7 +4135,7 @@ void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
4141 nvgpu_err(g, "ZBC save timeout"); 4135 nvgpu_err(g, "ZBC save timeout");
4142} 4136}
4143 4137
4144static int pmu_perfmon_start_sampling(struct pmu_gk20a *pmu) 4138int nvgpu_pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
4145{ 4139{
4146 struct gk20a *g = gk20a_from_pmu(pmu); 4140 struct gk20a *g = gk20a_from_pmu(pmu);
4147 struct pmu_v *pv = &g->ops.pmu_ver; 4141 struct pmu_v *pv = &g->ops.pmu_ver;
@@ -4185,7 +4179,7 @@ static int pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
4185 return 0; 4179 return 0;
4186} 4180}
4187 4181
4188static int pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu) 4182int nvgpu_pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu)
4189{ 4183{
4190 struct gk20a *g = gk20a_from_pmu(pmu); 4184 struct gk20a *g = gk20a_from_pmu(pmu);
4191 struct pmu_cmd cmd; 4185 struct pmu_cmd cmd;
@@ -4231,7 +4225,7 @@ static int pmu_handle_perfmon_event(struct pmu_gk20a *pmu,
4231 4225
4232 /* restart sampling */ 4226 /* restart sampling */
4233 if (pmu->perfmon_sampling_enabled) 4227 if (pmu->perfmon_sampling_enabled)
4234 return pmu_perfmon_start_sampling(pmu); 4228 return nvgpu_pmu_perfmon_start_sampling(pmu);
4235 return 0; 4229 return 0;
4236} 4230}
4237 4231
@@ -5173,9 +5167,9 @@ int gk20a_pmu_perfmon_enable(struct gk20a *g, bool enable)
5173 gk20a_dbg_fn(""); 5167 gk20a_dbg_fn("");
5174 5168
5175 if (enable) 5169 if (enable)
5176 err = pmu_perfmon_start_sampling(pmu); 5170 err = nvgpu_pmu_perfmon_start_sampling(pmu);
5177 else 5171 else
5178 err = pmu_perfmon_stop_sampling(pmu); 5172 err = nvgpu_pmu_perfmon_stop_sampling(pmu);
5179 5173
5180 return err; 5174 return err;
5181} 5175}
@@ -5293,7 +5287,7 @@ void gk20a_pmu_elpg_statistics(struct gk20a *g, u32 pg_engine_id,
5293 pg_stat_data->avg_exit_latency_us = stats.pg_avg_exit_time_us; 5287 pg_stat_data->avg_exit_latency_us = stats.pg_avg_exit_time_us;
5294} 5288}
5295 5289
5296static int gk20a_pmu_get_pg_stats(struct gk20a *g, 5290int gk20a_pmu_get_pg_stats(struct gk20a *g,
5297 u32 pg_engine_id, 5291 u32 pg_engine_id,
5298 struct pmu_pg_stats_data *pg_stat_data) 5292 struct pmu_pg_stats_data *pg_stat_data)
5299{ 5293{
@@ -5463,466 +5457,3 @@ int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id)
5463 status = gk20a_pmu_ap_send_command(g, &ap_cmd, true); 5457 status = gk20a_pmu_ap_send_command(g, &ap_cmd, true);
5464 return status; 5458 return status;
5465} 5459}
5466
5467#ifdef CONFIG_DEBUG_FS
5468static int lpwr_debug_show(struct seq_file *s, void *data)
5469{
5470 struct gk20a *g = s->private;
5471
5472 if (g->ops.pmu.pmu_pg_engines_feature_list &&
5473 g->ops.pmu.pmu_pg_engines_feature_list(g,
5474 PMU_PG_ELPG_ENGINE_ID_GRAPHICS) !=
5475 PMU_PG_FEATURE_GR_POWER_GATING_ENABLED) {
5476 seq_printf(s, "PSTATE: %u\n"
5477 "RPPG Enabled: %u\n"
5478 "RPPG ref count: %u\n"
5479 "RPPG state: %u\n"
5480 "MSCG Enabled: %u\n"
5481 "MSCG pstate state: %u\n"
5482 "MSCG transition state: %u\n",
5483 g->ops.clk_arb.get_current_pstate(g),
5484 g->elpg_enabled, g->pmu.elpg_refcnt,
5485 g->pmu.elpg_stat, g->mscg_enabled,
5486 g->pmu.mscg_stat, g->pmu.mscg_transition_state);
5487
5488 } else
5489 seq_printf(s, "ELPG Enabled: %u\n"
5490 "ELPG ref count: %u\n"
5491 "ELPG state: %u\n",
5492 g->elpg_enabled, g->pmu.elpg_refcnt,
5493 g->pmu.elpg_stat);
5494
5495 return 0;
5496
5497}
5498
5499static int lpwr_debug_open(struct inode *inode, struct file *file)
5500{
5501 return single_open(file, lpwr_debug_show, inode->i_private);
5502}
5503
5504static const struct file_operations lpwr_debug_fops = {
5505 .open = lpwr_debug_open,
5506 .read = seq_read,
5507 .llseek = seq_lseek,
5508 .release = single_release,
5509};
5510
5511static int mscg_stat_show(struct seq_file *s, void *data)
5512{
5513 struct gk20a *g = s->private;
5514 u64 total_ingating, total_ungating, residency, divisor, dividend;
5515 struct pmu_pg_stats_data pg_stat_data = { 0 };
5516 int err;
5517
5518 /* Don't unnecessarily power on the device */
5519 if (g->power_on) {
5520 err = gk20a_busy(g);
5521 if (err)
5522 return err;
5523
5524 gk20a_pmu_get_pg_stats(g,
5525 PMU_PG_ELPG_ENGINE_ID_MS, &pg_stat_data);
5526 gk20a_idle(g);
5527 }
5528 total_ingating = g->pg_ingating_time_us +
5529 (u64)pg_stat_data.ingating_time;
5530 total_ungating = g->pg_ungating_time_us +
5531 (u64)pg_stat_data.ungating_time;
5532
5533 divisor = total_ingating + total_ungating;
5534
5535 /* We compute the residency on a scale of 1000 */
5536 dividend = total_ingating * 1000;
5537
5538 if (divisor)
5539 residency = div64_u64(dividend, divisor);
5540 else
5541 residency = 0;
5542
5543 seq_printf(s,
5544 "Time in MSCG: %llu us\n"
5545 "Time out of MSCG: %llu us\n"
5546 "MSCG residency ratio: %llu\n"
5547 "MSCG Entry Count: %u\n"
5548 "MSCG Avg Entry latency %u\n"
5549 "MSCG Avg Exit latency %u\n",
5550 total_ingating, total_ungating,
5551 residency, pg_stat_data.gating_cnt,
5552 pg_stat_data.avg_entry_latency_us,
5553 pg_stat_data.avg_exit_latency_us);
5554 return 0;
5555
5556}
5557
5558static int mscg_stat_open(struct inode *inode, struct file *file)
5559{
5560 return single_open(file, mscg_stat_show, inode->i_private);
5561}
5562
5563static const struct file_operations mscg_stat_fops = {
5564 .open = mscg_stat_open,
5565 .read = seq_read,
5566 .llseek = seq_lseek,
5567 .release = single_release,
5568};
5569
5570static int mscg_transitions_show(struct seq_file *s, void *data)
5571{
5572 struct gk20a *g = s->private;
5573 struct pmu_pg_stats_data pg_stat_data = { 0 };
5574 u32 total_gating_cnt;
5575 int err;
5576
5577 if (g->power_on) {
5578 err = gk20a_busy(g);
5579 if (err)
5580 return err;
5581
5582 gk20a_pmu_get_pg_stats(g,
5583 PMU_PG_ELPG_ENGINE_ID_MS, &pg_stat_data);
5584 gk20a_idle(g);
5585 }
5586 total_gating_cnt = g->pg_gating_cnt + pg_stat_data.gating_cnt;
5587
5588 seq_printf(s, "%u\n", total_gating_cnt);
5589 return 0;
5590
5591}
5592
5593static int mscg_transitions_open(struct inode *inode, struct file *file)
5594{
5595 return single_open(file, mscg_transitions_show, inode->i_private);
5596}
5597
5598static const struct file_operations mscg_transitions_fops = {
5599 .open = mscg_transitions_open,
5600 .read = seq_read,
5601 .llseek = seq_lseek,
5602 .release = single_release,
5603};
5604
5605static int elpg_stat_show(struct seq_file *s, void *data)
5606{
5607 struct gk20a *g = s->private;
5608 struct pmu_pg_stats_data pg_stat_data = { 0 };
5609 u64 total_ingating, total_ungating, residency, divisor, dividend;
5610 int err;
5611
5612 /* Don't unnecessarily power on the device */
5613 if (g->power_on) {
5614 err = gk20a_busy(g);
5615 if (err)
5616 return err;
5617
5618 gk20a_pmu_get_pg_stats(g,
5619 PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg_stat_data);
5620 gk20a_idle(g);
5621 }
5622 total_ingating = g->pg_ingating_time_us +
5623 (u64)pg_stat_data.ingating_time;
5624 total_ungating = g->pg_ungating_time_us +
5625 (u64)pg_stat_data.ungating_time;
5626 divisor = total_ingating + total_ungating;
5627
5628 /* We compute the residency on a scale of 1000 */
5629 dividend = total_ingating * 1000;
5630
5631 if (divisor)
5632 residency = div64_u64(dividend, divisor);
5633 else
5634 residency = 0;
5635
5636 seq_printf(s,
5637 "Time in ELPG: %llu us\n"
5638 "Time out of ELPG: %llu us\n"
5639 "ELPG residency ratio: %llu\n"
5640 "ELPG Entry Count: %u\n"
5641 "ELPG Avg Entry latency %u us\n"
5642 "ELPG Avg Exit latency %u us\n",
5643 total_ingating, total_ungating,
5644 residency, pg_stat_data.gating_cnt,
5645 pg_stat_data.avg_entry_latency_us,
5646 pg_stat_data.avg_exit_latency_us);
5647 return 0;
5648
5649}
5650
5651static int elpg_stat_open(struct inode *inode, struct file *file)
5652{
5653 return single_open(file, elpg_stat_show, inode->i_private);
5654}
5655
5656static const struct file_operations elpg_stat_fops = {
5657 .open = elpg_stat_open,
5658 .read = seq_read,
5659 .llseek = seq_lseek,
5660 .release = single_release,
5661};
5662
5663static int elpg_transitions_show(struct seq_file *s, void *data)
5664{
5665 struct gk20a *g = s->private;
5666 struct pmu_pg_stats_data pg_stat_data = { 0 };
5667 u32 total_gating_cnt;
5668 int err;
5669
5670 if (g->power_on) {
5671 err = gk20a_busy(g);
5672 if (err)
5673 return err;
5674
5675 gk20a_pmu_get_pg_stats(g,
5676 PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg_stat_data);
5677 gk20a_idle(g);
5678 }
5679 total_gating_cnt = g->pg_gating_cnt + pg_stat_data.gating_cnt;
5680
5681 seq_printf(s, "%u\n", total_gating_cnt);
5682 return 0;
5683
5684}
5685
5686static int elpg_transitions_open(struct inode *inode, struct file *file)
5687{
5688 return single_open(file, elpg_transitions_show, inode->i_private);
5689}
5690
5691static const struct file_operations elpg_transitions_fops = {
5692 .open = elpg_transitions_open,
5693 .read = seq_read,
5694 .llseek = seq_lseek,
5695 .release = single_release,
5696};
5697
5698static int falc_trace_show(struct seq_file *s, void *data)
5699{
5700 struct gk20a *g = s->private;
5701 struct pmu_gk20a *pmu = &g->pmu;
5702 u32 i = 0, j = 0, k, l, m;
5703 char part_str[40];
5704 void *tracebuffer;
5705 char *trace;
5706 u32 *trace1;
5707
5708 /* allocate system memory to copy pmu trace buffer */
5709 tracebuffer = nvgpu_kzalloc(g, GK20A_PMU_TRACE_BUFSIZE);
5710 if (tracebuffer == NULL)
5711 return -ENOMEM;
5712
5713 /* read pmu traces into system memory buffer */
5714 nvgpu_mem_rd_n(g, &pmu->trace_buf,
5715 0, tracebuffer, GK20A_PMU_TRACE_BUFSIZE);
5716
5717 trace = (char *)tracebuffer;
5718 trace1 = (u32 *)tracebuffer;
5719
5720 for (i = 0; i < GK20A_PMU_TRACE_BUFSIZE; i += 0x40) {
5721 for (j = 0; j < 0x40; j++)
5722 if (trace1[(i / 4) + j])
5723 break;
5724 if (j == 0x40)
5725 break;
5726 seq_printf(s, "Index %x: ", trace1[(i / 4)]);
5727 l = 0;
5728 m = 0;
5729 while (find_hex_in_string((trace+i+20+m), g, &k)) {
5730 if (k >= 40)
5731 break;
5732 strncpy(part_str, (trace+i+20+m), k);
5733 part_str[k] = 0;
5734 seq_printf(s, "%s0x%x", part_str,
5735 trace1[(i / 4) + 1 + l]);
5736 l++;
5737 m += k + 2;
5738 }
5739 seq_printf(s, "%s", (trace+i+20+m));
5740 }
5741
5742 nvgpu_kfree(g, tracebuffer);
5743 return 0;
5744}
5745
5746static int falc_trace_open(struct inode *inode, struct file *file)
5747{
5748 return single_open(file, falc_trace_show, inode->i_private);
5749}
5750
5751static const struct file_operations falc_trace_fops = {
5752 .open = falc_trace_open,
5753 .read = seq_read,
5754 .llseek = seq_lseek,
5755 .release = single_release,
5756};
5757
5758static int perfmon_events_enable_show(struct seq_file *s, void *data)
5759{
5760 struct gk20a *g = s->private;
5761
5762 seq_printf(s, "%u\n", g->pmu.perfmon_sampling_enabled ? 1 : 0);
5763 return 0;
5764
5765}
5766
5767static int perfmon_events_enable_open(struct inode *inode, struct file *file)
5768{
5769 return single_open(file, perfmon_events_enable_show, inode->i_private);
5770}
5771
5772static ssize_t perfmon_events_enable_write(struct file *file,
5773 const char __user *userbuf, size_t count, loff_t *ppos)
5774{
5775 struct seq_file *s = file->private_data;
5776 struct gk20a *g = s->private;
5777 unsigned long val = 0;
5778 char buf[40];
5779 int buf_size;
5780 int err;
5781
5782 memset(buf, 0, sizeof(buf));
5783 buf_size = min(count, (sizeof(buf)-1));
5784
5785 if (copy_from_user(buf, userbuf, buf_size))
5786 return -EFAULT;
5787
5788 if (kstrtoul(buf, 10, &val) < 0)
5789 return -EINVAL;
5790
5791 /* Don't turn on gk20a unnecessarily */
5792 if (g->power_on) {
5793 err = gk20a_busy(g);
5794 if (err)
5795 return err;
5796
5797 if (val && !g->pmu.perfmon_sampling_enabled) {
5798 g->pmu.perfmon_sampling_enabled = true;
5799 pmu_perfmon_start_sampling(&(g->pmu));
5800 } else if (!val && g->pmu.perfmon_sampling_enabled) {
5801 g->pmu.perfmon_sampling_enabled = false;
5802 pmu_perfmon_stop_sampling(&(g->pmu));
5803 }
5804 gk20a_idle(g);
5805 } else {
5806 g->pmu.perfmon_sampling_enabled = val ? true : false;
5807 }
5808
5809 return count;
5810}
5811
5812static const struct file_operations perfmon_events_enable_fops = {
5813 .open = perfmon_events_enable_open,
5814 .read = seq_read,
5815 .write = perfmon_events_enable_write,
5816 .llseek = seq_lseek,
5817 .release = single_release,
5818};
5819
5820static int perfmon_events_count_show(struct seq_file *s, void *data)
5821{
5822 struct gk20a *g = s->private;
5823
5824 seq_printf(s, "%lu\n", g->pmu.perfmon_events_cnt);
5825 return 0;
5826
5827}
5828
5829static int perfmon_events_count_open(struct inode *inode, struct file *file)
5830{
5831 return single_open(file, perfmon_events_count_show, inode->i_private);
5832}
5833
5834static const struct file_operations perfmon_events_count_fops = {
5835 .open = perfmon_events_count_open,
5836 .read = seq_read,
5837 .llseek = seq_lseek,
5838 .release = single_release,
5839};
5840
5841static int security_show(struct seq_file *s, void *data)
5842{
5843 struct gk20a *g = s->private;
5844
5845 seq_printf(s, "%d\n", g->pmu.pmu_mode);
5846 return 0;
5847
5848}
5849
5850static int security_open(struct inode *inode, struct file *file)
5851{
5852 return single_open(file, security_show, inode->i_private);
5853}
5854
5855static const struct file_operations security_fops = {
5856 .open = security_open,
5857 .read = seq_read,
5858 .llseek = seq_lseek,
5859 .release = single_release,
5860};
5861
5862int gk20a_pmu_debugfs_init(struct device *dev)
5863{
5864 struct dentry *d;
5865 struct gk20a_platform *platform = dev_get_drvdata(dev);
5866 struct gk20a *g = get_gk20a(dev);
5867
5868 d = debugfs_create_file(
5869 "lpwr_debug", S_IRUGO|S_IWUSR, platform->debugfs, g,
5870 &lpwr_debug_fops);
5871 if (!d)
5872 goto err_out;
5873
5874 d = debugfs_create_file(
5875 "mscg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
5876 &mscg_stat_fops);
5877 if (!d)
5878 goto err_out;
5879
5880 d = debugfs_create_file(
5881 "mscg_transitions", S_IRUGO, platform->debugfs, g,
5882 &mscg_transitions_fops);
5883 if (!d)
5884 goto err_out;
5885
5886 d = debugfs_create_file(
5887 "elpg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
5888 &elpg_stat_fops);
5889 if (!d)
5890 goto err_out;
5891
5892 d = debugfs_create_file(
5893 "elpg_transitions", S_IRUGO, platform->debugfs, g,
5894 &elpg_transitions_fops);
5895 if (!d)
5896 goto err_out;
5897
5898 d = debugfs_create_file(
5899 "falc_trace", S_IRUGO, platform->debugfs, g,
5900 &falc_trace_fops);
5901 if (!d)
5902 goto err_out;
5903
5904 d = debugfs_create_file(
5905 "perfmon_events_enable", S_IRUGO, platform->debugfs, g,
5906 &perfmon_events_enable_fops);
5907 if (!d)
5908 goto err_out;
5909
5910 d = debugfs_create_file(
5911 "perfmon_events_count", S_IRUGO, platform->debugfs, g,
5912 &perfmon_events_count_fops);
5913 if (!d)
5914 goto err_out;
5915
5916 d = debugfs_create_file(
5917 "pmu_security", S_IRUGO, platform->debugfs, g,
5918 &security_fops);
5919 if (!d)
5920 goto err_out;
5921 return 0;
5922err_out:
5923 pr_err("%s: Failed to make debugfs node\n", __func__);
5924 debugfs_remove_recursive(platform->debugfs);
5925 return -ENOMEM;
5926}
5927
5928#endif
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
index e7a8b7c2..cefb6577 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
@@ -426,7 +426,6 @@ int pmu_mutex_release(struct pmu_gk20a *pmu, u32 id, u32 *token);
426int gk20a_pmu_destroy(struct gk20a *g); 426int gk20a_pmu_destroy(struct gk20a *g);
427int gk20a_pmu_load_norm(struct gk20a *g, u32 *load); 427int gk20a_pmu_load_norm(struct gk20a *g, u32 *load);
428int gk20a_pmu_load_update(struct gk20a *g); 428int gk20a_pmu_load_update(struct gk20a *g);
429int gk20a_pmu_debugfs_init(struct device *dev);
430void gk20a_pmu_reset_load_counters(struct gk20a *g); 429void gk20a_pmu_reset_load_counters(struct gk20a *g);
431void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles, 430void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
432 u32 *total_cycles); 431 u32 *total_cycles);
@@ -468,5 +467,11 @@ int gk20a_pmu_vidmem_surface_alloc(struct gk20a *g, struct nvgpu_mem *mem,
468 u32 size); 467 u32 size);
469int gk20a_pmu_sysmem_surface_alloc(struct gk20a *g, struct nvgpu_mem *mem, 468int gk20a_pmu_sysmem_surface_alloc(struct gk20a *g, struct nvgpu_mem *mem,
470 u32 size); 469 u32 size);
470int gk20a_pmu_get_pg_stats(struct gk20a *g,
471 u32 pg_engine_id, struct pmu_pg_stats_data *pg_stat_data);
472bool nvgpu_find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos);
473
474int nvgpu_pmu_perfmon_start_sampling(struct pmu_gk20a *pmu);
475int nvgpu_pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu);
471 476
472#endif /*__PMU_GK20A_H__*/ 477#endif /*__PMU_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/sched_gk20a.c b/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
index b7edf3f0..3f3119af 100644
--- a/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
@@ -13,10 +13,6 @@
13 13
14#include <asm/barrier.h> 14#include <asm/barrier.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16#ifdef CONFIG_DEBUG_FS
17#include <linux/debugfs.h>
18#include "platform_gk20a.h"
19#endif
20#include <linux/uaccess.h> 16#include <linux/uaccess.h>
21#include <linux/poll.h> 17#include <linux/poll.h>
22#include <uapi/linux/nvgpu.h> 18#include <uapi/linux/nvgpu.h>
@@ -523,69 +519,6 @@ int gk20a_sched_dev_release(struct inode *inode, struct file *filp)
523 return 0; 519 return 0;
524} 520}
525 521
526#ifdef CONFIG_DEBUG_FS
527static int gk20a_sched_debugfs_show(struct seq_file *s, void *unused)
528{
529 struct device *dev = s->private;
530 struct gk20a *g = gk20a_get_platform(dev)->g;
531 struct gk20a_sched_ctrl *sched = &g->sched_ctrl;
532 bool sched_busy = true;
533
534 int n = sched->bitmap_size / sizeof(u64);
535 int i;
536 int err;
537
538 err = gk20a_busy(g);
539 if (err)
540 return err;
541
542 if (nvgpu_mutex_tryacquire(&sched->busy_lock)) {
543 sched_busy = false;
544 nvgpu_mutex_release(&sched->busy_lock);
545 }
546
547 seq_printf(s, "control_locked=%d\n", sched->control_locked);
548 seq_printf(s, "busy=%d\n", sched_busy);
549 seq_printf(s, "bitmap_size=%zu\n", sched->bitmap_size);
550
551 nvgpu_mutex_acquire(&sched->status_lock);
552
553 seq_puts(s, "active_tsg_bitmap\n");
554 for (i = 0; i < n; i++)
555 seq_printf(s, "\t0x%016llx\n", sched->active_tsg_bitmap[i]);
556
557 seq_puts(s, "recent_tsg_bitmap\n");
558 for (i = 0; i < n; i++)
559 seq_printf(s, "\t0x%016llx\n", sched->recent_tsg_bitmap[i]);
560
561 nvgpu_mutex_release(&sched->status_lock);
562
563 gk20a_idle(g);
564
565 return 0;
566}
567
568static int gk20a_sched_debugfs_open(struct inode *inode, struct file *file)
569{
570 return single_open(file, gk20a_sched_debugfs_show, inode->i_private);
571}
572
573static const struct file_operations gk20a_sched_debugfs_fops = {
574 .open = gk20a_sched_debugfs_open,
575 .read = seq_read,
576 .llseek = seq_lseek,
577 .release = single_release,
578};
579
580void gk20a_sched_debugfs_init(struct device *dev)
581{
582 struct gk20a_platform *platform = dev_get_drvdata(dev);
583
584 debugfs_create_file("sched_ctrl", S_IRUGO, platform->debugfs,
585 dev, &gk20a_sched_debugfs_fops);
586}
587#endif /* CONFIG_DEBUG_FS */
588
589void gk20a_sched_ctrl_tsg_added(struct gk20a *g, struct tsg_gk20a *tsg) 522void gk20a_sched_ctrl_tsg_added(struct gk20a *g, struct tsg_gk20a *tsg)
590{ 523{
591 struct gk20a_sched_ctrl *sched = &g->sched_ctrl; 524 struct gk20a_sched_ctrl *sched = &g->sched_ctrl;
diff --git a/drivers/gpu/nvgpu/gk20a/sched_gk20a.h b/drivers/gpu/nvgpu/gk20a/sched_gk20a.h
index 4f6d1510..776f689d 100644
--- a/drivers/gpu/nvgpu/gk20a/sched_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/sched_gk20a.h
@@ -48,7 +48,6 @@ void gk20a_sched_ctrl_tsg_added(struct gk20a *, struct tsg_gk20a *);
48void gk20a_sched_ctrl_tsg_removed(struct gk20a *, struct tsg_gk20a *); 48void gk20a_sched_ctrl_tsg_removed(struct gk20a *, struct tsg_gk20a *);
49int gk20a_sched_ctrl_init(struct gk20a *); 49int gk20a_sched_ctrl_init(struct gk20a *);
50 50
51void gk20a_sched_debugfs_init(struct device *dev);
52void gk20a_sched_ctrl_cleanup(struct gk20a *g); 51void gk20a_sched_ctrl_cleanup(struct gk20a *g);
53 52
54#endif /* __SCHED_GK20A_H */ 53#endif /* __SCHED_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 82c587f9..c6e451e1 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -20,6 +20,7 @@
20#include <nvgpu/kmem.h> 20#include <nvgpu/kmem.h>
21#include <nvgpu/log.h> 21#include <nvgpu/log.h>
22#include <nvgpu/enabled.h> 22#include <nvgpu/enabled.h>
23#include <nvgpu/debug.h>
23 24
24#include "gk20a/gk20a.h" 25#include "gk20a/gk20a.h"
25#include "gk20a/gr_gk20a.h" 26#include "gk20a/gr_gk20a.h"
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index f5328f03..831fd5da 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -33,11 +33,11 @@
33#include "clk_gm20b.h" 33#include "clk_gm20b.h"
34#include "mc_gm20b.h" 34#include "mc_gm20b.h"
35#include "regops_gm20b.h" 35#include "regops_gm20b.h"
36#include "debug_gm20b.h"
37#include "cde_gm20b.h" 36#include "cde_gm20b.h"
38#include "therm_gm20b.h" 37#include "therm_gm20b.h"
39#include "hal_gm20b.h" 38#include "hal_gm20b.h"
40 39
40#include <nvgpu/debug.h>
41#include <nvgpu/bug.h> 41#include <nvgpu/bug.h>
42#include <nvgpu/enabled.h> 42#include <nvgpu/enabled.h>
43 43
@@ -234,7 +234,7 @@ int gm20b_init_hal(struct gk20a *g)
234 gm20b_init_pmu_ops(gops); 234 gm20b_init_pmu_ops(gops);
235 gm20b_init_clk_ops(gops); 235 gm20b_init_clk_ops(gops);
236 gm20b_init_regops(gops); 236 gm20b_init_regops(gops);
237 gm20b_init_debug_ops(gops); 237 gk20a_init_debug_ops(gops);
238 gk20a_init_dbg_session_ops(gops); 238 gk20a_init_dbg_session_ops(gops);
239 gm20b_init_cde_ops(gops); 239 gm20b_init_cde_ops(gops);
240 gm20b_init_therm_ops(gops); 240 gm20b_init_therm_ops(gops);
diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c
index f28ff45f..d923e5e9 100644
--- a/drivers/gpu/nvgpu/gp106/hal_gp106.c
+++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c
@@ -53,6 +53,7 @@
53 53
54#include "hal_gp106.h" 54#include "hal_gp106.h"
55 55
56#include <nvgpu/debug.h>
56#include <nvgpu/bug.h> 57#include <nvgpu/bug.h>
57 58
58#include <nvgpu/hw/gp106/hw_proj_gp106.h> 59#include <nvgpu/hw/gp106/hw_proj_gp106.h>
diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
index 98a8be2f..9a30ad7c 100644
--- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
@@ -23,6 +23,7 @@
23#include <nvgpu/gmmu.h> 23#include <nvgpu/gmmu.h>
24#include <nvgpu/dma.h> 24#include <nvgpu/dma.h>
25#include <nvgpu/bug.h> 25#include <nvgpu/bug.h>
26#include <nvgpu/debug.h>
26 27
27#include "gk20a/gk20a.h" 28#include "gk20a/gk20a.h"
28#include "gk20a/gr_gk20a.h" 29#include "gk20a/gr_gk20a.h"
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index e2a931be..a1906a08 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -44,6 +44,7 @@
44#include "gp10b.h" 44#include "gp10b.h"
45#include "hal_gp10b.h" 45#include "hal_gp10b.h"
46 46
47#include <nvgpu/debug.h>
47#include <nvgpu/bug.h> 48#include <nvgpu/bug.h>
48#include <nvgpu/enabled.h> 49#include <nvgpu/enabled.h>
49 50
diff --git a/drivers/gpu/nvgpu/include/nvgpu/allocator.h b/drivers/gpu/nvgpu/include/nvgpu/allocator.h
index 3579b0fb..567c4422 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/allocator.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/allocator.h
@@ -256,11 +256,13 @@ static inline struct gk20a *nvgpu_alloc_to_gpu(struct nvgpu_allocator *a)
256 return a->g; 256 return a->g;
257} 257}
258 258
259#ifdef CONFIG_DEBUG_FS
259/* 260/*
260 * Common functionality for the internals of the allocators. 261 * Common functionality for the internals of the allocators.
261 */ 262 */
262void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a); 263void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a);
263void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a); 264void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a);
265#endif
264 266
265int __nvgpu_alloc_common_init(struct nvgpu_allocator *a, struct gk20a *g, 267int __nvgpu_alloc_common_init(struct nvgpu_allocator *a, struct gk20a *g,
266 const char *name, void *priv, bool dbg, 268 const char *name, void *priv, bool dbg,
@@ -281,11 +283,6 @@ static inline void nvgpu_alloc_disable_dbg(struct nvgpu_allocator *a)
281 */ 283 */
282extern u32 nvgpu_alloc_tracing_on; 284extern u32 nvgpu_alloc_tracing_on;
283 285
284#ifdef CONFIG_DEBUG_FS
285struct device;
286void nvgpu_alloc_debugfs_init(struct device *dev);
287#endif
288
289#define nvgpu_alloc_trace_func() \ 286#define nvgpu_alloc_trace_func() \
290 do { \ 287 do { \
291 if (nvgpu_alloc_tracing_on) \ 288 if (nvgpu_alloc_tracing_on) \
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/debug.h
index 213922b3..70a03978 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/debug.h
@@ -14,28 +14,42 @@
14 * 14 *
15 */ 15 */
16 16
17#ifndef _DEBUG_GK20A_H_ 17#ifndef __NVGPU_DEBUG_H__
18#define _DEBUG_GK20A_H_ 18#define __NVGPU_DEBUG_H__
19 19
20struct platform_device;
21struct gk20a; 20struct gk20a;
22struct gpu_ops; 21struct gpu_ops;
23 22
24extern unsigned int gk20a_debug_trace_cmdbuf;
25
26struct gk20a_debug_output { 23struct gk20a_debug_output {
27 void (*fn)(void *ctx, const char *str, size_t len); 24 void (*fn)(void *ctx, const char *str, size_t len);
28 void *ctx; 25 void *ctx;
29 char buf[256]; 26 char buf[256];
30}; 27};
31 28
29#ifdef CONFIG_DEBUG_FS
30extern unsigned int gk20a_debug_trace_cmdbuf;
31
32void gk20a_debug_output(struct gk20a_debug_output *o, 32void gk20a_debug_output(struct gk20a_debug_output *o,
33 const char *fmt, ...); 33 const char *fmt, ...);
34 34
35void gk20a_debug_dump(struct device *pdev); 35void gk20a_debug_dump(struct gk20a *g);
36void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o); 36void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o);
37int gk20a_gr_debug_dump(struct device *pdev); 37int gk20a_gr_debug_dump(struct gk20a *g);
38void gk20a_debug_init(struct device *dev, const char *debugfs_symlink);
39void gk20a_init_debug_ops(struct gpu_ops *gops); 38void gk20a_init_debug_ops(struct gpu_ops *gops);
40void gk20a_debug_dump_device(void *dev); 39
40void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink);
41void gk20a_debug_deinit(struct gk20a *g);
42#else
43static inline void gk20a_debug_output(struct gk20a_debug_output *o,
44 const char *fmt, ...) {}
45
46static inline void gk20a_debug_dump(struct gk20a *g) {}
47static inline void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) {}
48static inline int gk20a_gr_debug_dump(struct gk20a *g) { return 0;}
49static inline void gk20a_init_debug_ops(struct gpu_ops *gops) {}
50
51static inline void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink) {}
52static inline void gk20a_debug_deinit(struct gk20a *g) {}
41#endif 53#endif
54
55#endif /* __NVGPU_DEBUG_H__ */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h b/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h
index dc198a04..611854f2 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h
@@ -31,12 +31,6 @@ void *__nvgpu_track_kcalloc(struct gk20a *g, size_t n, size_t size,
31 unsigned long ip); 31 unsigned long ip);
32void __nvgpu_track_vfree(struct gk20a *g, void *addr); 32void __nvgpu_track_vfree(struct gk20a *g, void *addr);
33void __nvgpu_track_kfree(struct gk20a *g, void *addr); 33void __nvgpu_track_kfree(struct gk20a *g, void *addr);
34
35void nvgpu_kmem_debugfs_init(struct device *dev);
36#else
37static inline void nvgpu_kmem_debugfs_init(struct device *dev)
38{
39}
40#endif 34#endif
41 35
42/** 36/**
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index 02cc5b47..cdd0d378 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -22,10 +22,10 @@
22#include <nvgpu/kmem.h> 22#include <nvgpu/kmem.h>
23#include <nvgpu/bug.h> 23#include <nvgpu/bug.h>
24#include <nvgpu/enabled.h> 24#include <nvgpu/enabled.h>
25#include <nvgpu/debug.h>
25 26
26#include "vgpu/vgpu.h" 27#include "vgpu/vgpu.h"
27#include "vgpu/fecs_trace_vgpu.h" 28#include "vgpu/fecs_trace_vgpu.h"
28#include "gk20a/debug_gk20a.h"
29#include "gk20a/hal_gk20a.h" 29#include "gk20a/hal_gk20a.h"
30#include "gk20a/ctxsw_trace_gk20a.h" 30#include "gk20a/ctxsw_trace_gk20a.h"
31#include "gk20a/tsg_gk20a.h" 31#include "gk20a/tsg_gk20a.h"
@@ -667,7 +667,7 @@ int vgpu_probe(struct platform_device *pdev)
667 if (err) 667 if (err)
668 return err; 668 return err;
669 669
670 gk20a_debug_init(dev, "gpu.0"); 670 gk20a_debug_init(gk20a, "gpu.0");
671 671
672 /* Set DMA parameters to allow larger sgt lists */ 672 /* Set DMA parameters to allow larger sgt lists */
673 dev->dma_parms = &gk20a->dma_parms; 673 dev->dma_parms = &gk20a->dma_parms;