Diffstat (limited to 'drivers')
-rw-r--r--  drivers/gpu/nvgpu/Kconfig.nvgpu                 11
-rw-r--r--  drivers/gpu/nvgpu/common/linux/kmem.c          806
-rw-r--r--  drivers/gpu/nvgpu/common/linux/kmem_priv.h      90
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c         12
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c          4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/debug_gk20a.c            4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c                  3
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                  7
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c               6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c               6
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/kmem.h          223
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h    123
-rw-r--r--  drivers/gpu/nvgpu/pci.c                          3
-rw-r--r--  drivers/gpu/nvgpu/vgpu/vgpu.c                    4
14 files changed, 1253 insertions, 49 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig.nvgpu b/drivers/gpu/nvgpu/Kconfig.nvgpu
index 3e3607e0..8baf6897 100644
--- a/drivers/gpu/nvgpu/Kconfig.nvgpu
+++ b/drivers/gpu/nvgpu/Kconfig.nvgpu
@@ -47,6 +47,17 @@ config GK20A_DEVFREQ
47 47
48endchoice 48endchoice
49 49
50config NVGPU_TRACK_MEM_USAGE
51 bool "Track the usage of system memory in nvgpu"
52 depends on GK20A
53 default n
54 help
55 Say Y here to allow nvgpu to track and keep statistics on
56 the system memory used by the driver. This recreates some
57 of the kernel's kmemleak tracking, but it is also applicable
58 to other OSes which do not have Linux's kmemleak.
59
60
50config GK20A_CYCLE_STATS 61config GK20A_CYCLE_STATS
51 bool "Support GK20A GPU CYCLE STATS" 62 bool "Support GK20A GPU CYCLE STATS"
52 depends on GK20A 63 depends on GK20A
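
For illustration, a minimal sketch (not part of this patch) of how the new CONFIG_NVGPU_TRACK_MEM_USAGE symbol typically gates code at compile time; the example_ function is hypothetical:

#include <linux/kernel.h>

#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
/* Tracking builds: record statistics about the allocation. */
static void example_account_alloc(size_t size)
{
	pr_info("nvgpu: tracked alloc of %zu bytes\n", size);
}
#else
/* Non-tracking builds: compiles away to nothing. */
static inline void example_account_alloc(size_t size)
{
}
#endif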
diff --git a/drivers/gpu/nvgpu/common/linux/kmem.c b/drivers/gpu/nvgpu/common/linux/kmem.c
index 24e0ca5d..60e79348 100644
--- a/drivers/gpu/nvgpu/common/linux/kmem.c
+++ b/drivers/gpu/nvgpu/common/linux/kmem.c
@@ -15,11 +15,22 @@
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/mutex.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/atomic.h> 20#include <linux/atomic.h>
21#include <linux/rbtree.h>
22#include <linux/debugfs.h>
23#include <linux/spinlock.h>
24#include <linux/seq_file.h>
25#include <linux/vmalloc.h>
26#include <linux/stacktrace.h>
20 27
21#include <nvgpu/kmem.h> 28#include <nvgpu/kmem.h>
22 29
30#include "gk20a/gk20a.h"
31
32#include "kmem_priv.h"
33
23/* 34/*
24 * Statically declared because this needs to be shared across all nvgpu driver 35 * Statically declared because this needs to be shared across all nvgpu driver
25 * instances. This makes sure that all kmem caches are _definitely_ uniquely 36 * instances. This makes sure that all kmem caches are _definitely_ uniquely
@@ -27,26 +38,793 @@
27 */ 38 */
28static atomic_t kmem_cache_id; 39static atomic_t kmem_cache_id;
29 40
30/* 41#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
31 * Linux specific version of the nvgpu_kmem_cache struct. This type is 42
32 * completely opaque to the rest of the driver. 43static void lock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
44{
45 mutex_lock(&tracker->lock);
46}
47
48static void unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
49{
50 mutex_unlock(&tracker->lock);
51}
52
53static void kmem_print_mem_alloc(struct gk20a *g,
54 struct nvgpu_mem_alloc *alloc,
55 struct seq_file *s)
56{
57#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES
58 int i;
59
60 __pstat(s, "nvgpu-alloc: addr=0x%llx size=%ld\n",
61 alloc->addr, alloc->size);
62 for (i = 0; i < alloc->stack_length; i++)
63 __pstat(s, " %3d [<%p>] %pS\n", i,
64 (void *)alloc->stack[i],
65 (void *)alloc->stack[i]);
66 __pstat(s, "\n");
67#else
68 __pstat(s, "nvgpu-alloc: addr=0x%llx size=%ld src=%pF\n",
69 alloc->addr, alloc->size, alloc->ip);
70#endif
71}
72
73static int nvgpu_add_alloc(struct nvgpu_mem_alloc_tracker *tracker,
74 struct nvgpu_mem_alloc *alloc)
75{
76 struct rb_node **new = &tracker->allocs.rb_node;
77 struct rb_node *parent = NULL;
78
79 while (*new) {
80 struct nvgpu_mem_alloc *tmp = rb_entry(*new,
81 struct nvgpu_mem_alloc,
82 allocs_entry);
83
84 parent = *new;
85
86 if (alloc->addr < tmp->addr)
87 new = &(*new)->rb_left;
88 else if (alloc->addr > tmp->addr)
89 new = &(*new)->rb_right;
90 else
91 return -EINVAL;
92 }
93
94 /* Put the new node there */
95 rb_link_node(&alloc->allocs_entry, parent, new);
96 rb_insert_color(&alloc->allocs_entry, &tracker->allocs);
97
98 return 0;
99}
100
101static struct nvgpu_mem_alloc *nvgpu_rem_alloc(
102 struct nvgpu_mem_alloc_tracker *tracker, u64 alloc_addr)
103{
104 struct rb_node *node = tracker->allocs.rb_node;
105 struct nvgpu_mem_alloc *alloc;
106
107 while (node) {
108 alloc = container_of(node,
109 struct nvgpu_mem_alloc, allocs_entry);
110
111 if (alloc_addr < alloc->addr)
112 node = node->rb_left;
113 else if (alloc_addr > alloc->addr)
114 node = node->rb_right;
115 else
116 break;
117 }
118
119 if (!node)
120 return NULL;
121
122 rb_erase(node, &tracker->allocs);
123
124 return alloc;
125}
126
127static int __nvgpu_save_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
128 unsigned long size, unsigned long real_size,
129 u64 addr, unsigned long ip)
130{
131 int ret;
132 struct nvgpu_mem_alloc *alloc;
133#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES
134 struct stack_trace stack_trace;
135#endif
136
137 alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
138 if (!alloc)
139 return -ENOMEM;
140
141 alloc->owner = tracker;
142 alloc->size = size;
143 alloc->real_size = real_size;
144 alloc->addr = addr;
145 alloc->ip = (void *)(uintptr_t)ip;
146
147#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES
148 stack_trace.max_entries = MAX_STACK_TRACE;
149 stack_trace.nr_entries = 0;
150 stack_trace.entries = alloc->stack;
151 /*
152 * The skip of 4 covers the two function calls that happen for all traced
153 * allocs due to nvgpu:
154 *
155 * __nvgpu_save_kmem_alloc+0x7c/0x128
156 * __nvgpu_track_kzalloc+0xcc/0xf8
157 *
158 * And the function calls that get made by the stack trace code itself.
159 * If the trace saving code changes, this will likely have to change
160 * as well.
161 */
162 stack_trace.skip = 4;
163 save_stack_trace(&stack_trace);
164 alloc->stack_length = stack_trace.nr_entries;
165#endif
166
167 lock_tracker(tracker);
168 tracker->bytes_alloced += size;
169 tracker->bytes_alloced_real += real_size;
170 tracker->nr_allocs++;
171
172 /* Keep track of this for building a histogram later on. */
173 if (tracker->max_alloc < size)
174 tracker->max_alloc = size;
175 if (tracker->min_alloc > size)
176 tracker->min_alloc = size;
177
178 ret = nvgpu_add_alloc(tracker, alloc);
179 if (ret) {
180 WARN(1, "Duplicate alloc??? 0x%llx\n", addr);
181 kfree(alloc);
182 unlock_tracker(tracker);
183 return ret;
184 }
185 unlock_tracker(tracker);
186
187 return 0;
188}
189
190static int __nvgpu_free_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
191 u64 addr)
192{
193 struct nvgpu_mem_alloc *alloc;
194
195 lock_tracker(tracker);
196 alloc = nvgpu_rem_alloc(tracker, addr);
197 if (WARN(!alloc, "Possible double-free detected: 0x%llx!", addr)) {
198 unlock_tracker(tracker);
199 return -EINVAL;
200 }
201
202 tracker->nr_frees++;
203 tracker->bytes_freed += alloc->size;
204 tracker->bytes_freed_real += alloc->real_size;
205 unlock_tracker(tracker);
206
207 return 0;
208}
209
210static void __nvgpu_check_valloc_size(unsigned long size)
211{
212 WARN(size < PAGE_SIZE, "Alloc smaller than page size! (%lu)!\n", size);
213}
214
215static void __nvgpu_check_kalloc_size(size_t size)
216{
217 WARN(size > PAGE_SIZE, "Alloc larger than page size! (%zu)!\n", size);
218}
219
220void *__nvgpu_track_vmalloc(struct gk20a *g, unsigned long size,
221 unsigned long ip)
222{
223 void *alloc = vmalloc(size);
224
225 if (!alloc)
226 return NULL;
227
228 kmem_dbg("vmalloc: size=%-6ld addr=0x%p", size, alloc);
229 __nvgpu_check_valloc_size(size);
230
231 /*
232 * Ignore the return value. If this fails, let's not cause any issues
233 * for the rest of the driver.
234 */
235 __nvgpu_save_kmem_alloc(g->vmallocs, size, roundup_pow_of_two(size),
236 (u64)(uintptr_t)alloc, ip);
237
238 return alloc;
239}
240
241void *__nvgpu_track_vzalloc(struct gk20a *g, unsigned long size,
242 unsigned long ip)
243{
244 void *alloc = vzalloc(size);
245
246 if (!alloc)
247 return NULL;
248
249 kmem_dbg("vzalloc: size=%-6ld addr=0x%p", size, alloc);
250 __nvgpu_check_valloc_size(size);
251
252 /*
253 * Ignore the return value. If this fails, let's not cause any issues
254 * for the rest of the driver.
255 */
256 __nvgpu_save_kmem_alloc(g->vmallocs, size, roundup_pow_of_two(size),
257 (u64)(uintptr_t)alloc, ip);
258
259 return alloc;
260}
261
262void *__nvgpu_track_kmalloc(struct gk20a *g, size_t size, unsigned long ip)
263{
264 void *alloc = kmalloc(size, GFP_KERNEL);
265
266 if (!alloc)
267 return NULL;
268
269 kmem_dbg("kmalloc: size=%-6ld addr=0x%p gfp=0x%08x",
270 size, alloc, GFP_KERNEL);
271 __nvgpu_check_kalloc_size(size);
272
273 __nvgpu_save_kmem_alloc(g->kmallocs, size, roundup_pow_of_two(size),
274 (u64)(uintptr_t)alloc, ip);
275
276 return alloc;
277}
278
279void *__nvgpu_track_kzalloc(struct gk20a *g, size_t size, unsigned long ip)
280{
281 void *alloc = kzalloc(size, GFP_KERNEL);
282
283 if (!alloc)
284 return NULL;
285
286 kmem_dbg("kzalloc: size=%-6ld addr=0x%p gfp=0x%08x",
287 size, alloc, GFP_KERNEL);
288 __nvgpu_check_kalloc_size(size);
289
290 __nvgpu_save_kmem_alloc(g->kmallocs, size, roundup_pow_of_two(size),
291 (u64)(uintptr_t)alloc, ip);
292
293 return alloc;
294}
295
296void *__nvgpu_track_kcalloc(struct gk20a *g, size_t n, size_t size,
297 unsigned long ip)
298{
299 void *alloc = kcalloc(n, size, GFP_KERNEL);
300
301 if (!alloc)
302 return NULL;
303
304 kmem_dbg("kcalloc: size=%-6ld addr=0x%p gfp=0x%08x",
305 n * size, alloc, GFP_KERNEL);
306 __nvgpu_check_kalloc_size(n * size);
307
308 __nvgpu_save_kmem_alloc(g->kmallocs, n * size,
309 roundup_pow_of_two(n * size),
310 (u64)(uintptr_t)alloc, ip);
311
312 return alloc;
313}
314
315void __nvgpu_track_vfree(struct gk20a *g, void *addr)
316{
317 /*
318 * It is accepted practice to pass NULL pointers into free
319 * functions to simplify the calling code.
320 */
321 if (!addr)
322 return;
323
324 vfree(addr);
325
326 kmem_dbg("vfree: addr=0x%p", addr);
327
328 __nvgpu_free_kmem_alloc(g->vmallocs, (u64)(uintptr_t)addr);
329}
330
331void __nvgpu_track_kfree(struct gk20a *g, void *addr)
332{
333 if (!addr)
334 return;
335
336 kfree(addr);
337
338 kmem_dbg("kfree: addr=0x%p", addr);
339
340 __nvgpu_free_kmem_alloc(g->kmallocs, (u64)(uintptr_t)addr);
341}
342
343/**
344 * to_human_readable_bytes - Determine suffix for passed size.
345 *
346 * @bytes - Number of bytes to generate a suffix for.
347 * @hr_bytes [out] - The human readable number of bytes.
348 * @hr_suffix [out] - The suffix for the HR number of bytes.
349 *
350 * Computes a human readable decomposition of the passed number of bytes. The
351 * suffix for the bytes is passed back through the @hr_suffix pointer. The right
352 * number of bytes is then passed back in @hr_bytes. This returns the following
353 * ranges:
354 *
355 * 0 - 1023 B
356 * 1 - 1023 KB
357 * 1 - 1023 MB
358 * 1 - 1023 GB
359 * 1 - 1023 TB
360 * 1 - ... PB
361 */
362static void __to_human_readable_bytes(u64 bytes, u64 *hr_bytes,
363 const char **hr_suffix)
364{
365 static const char *suffixes[] =
366 { "B", "KB", "MB", "GB", "TB", "PB" };
367
368 u64 suffix_ind = 0;
369
370 while (suffix_ind < ARRAY_SIZE(suffixes) && bytes >= 1024) {
371 bytes >>= 10;
372 suffix_ind++;
373 }
374
375 /*
376 * Handle case where bytes > 1023PB.
377 */
378 suffix_ind = suffix_ind < ARRAY_SIZE(suffixes) ?
379 suffix_ind : ARRAY_SIZE(suffixes) - 1;
380
381 *hr_bytes = bytes;
382 *hr_suffix = suffixes[suffix_ind];
383}
384
385/**
386 * print_hr_bytes - Print human readable bytes
387 *
388 * @s - A seq_file to print to. May be NULL.
389 * @msg - A message to print before the bytes.
390 * @bytes - Number of bytes.
391 *
392 * Print @msg followed by the human readable decomposition of the passed number
393 * of bytes.
394 *
395 * If @s is NULL then the prints will be made to the kernel log.
396 */
397static void print_hr_bytes(struct seq_file *s, const char *msg, u64 bytes)
398{
399 u64 hr_bytes;
400 const char *hr_suffix;
401
402 __to_human_readable_bytes(bytes, &hr_bytes, &hr_suffix);
403 __pstat(s, "%s%lld %s\n", msg, hr_bytes, hr_suffix);
404}
405
406/**
407 * print_histogram - Build a histogram of the memory usage.
408 *
409 * @tracker The tracker to pull data from.
410 * @s A seq_file to dump info into.
33 */ 411 */
34struct nvgpu_kmem_cache { 412static void print_histogram(struct nvgpu_mem_alloc_tracker *tracker,
35 struct gk20a *g; 413 struct seq_file *s)
36 struct kmem_cache *cache; 414{
415 int i;
416 u64 pot_min, pot_max;
417 u64 nr_buckets;
418 unsigned int *buckets;
419 unsigned int total_allocs;
420 struct rb_node *node;
421 static const char histogram_line[] =
422 "++++++++++++++++++++++++++++++++++++++++";
423
424 /*
425 * pot_min is essentially a round down to the nearest power of 2. This
426 * is the start of the histogram. pot_max is just a round up to the
427 * nearest power of two. Each histogram bucket is one power of two so
428 * the histogram buckets are exponential.
429 */
430 pot_min = (u64)rounddown_pow_of_two(tracker->min_alloc);
431 pot_max = (u64)roundup_pow_of_two(tracker->max_alloc);
432
433 nr_buckets = __ffs(pot_max) - __ffs(pot_min);
434
435 buckets = kzalloc(sizeof(*buckets) * nr_buckets, GFP_KERNEL);
436 if (!buckets) {
437 __pstat(s, "OOM: could not allocate bucket storage!?\n");
438 return;
439 }
37 440
38 /* 441 /*
39 * Memory to hold the kmem_cache unique name. Only necessary on our 442 * Iterate across all of the allocs and determine what bucket they
40 * k3.10 kernel when not using the SLUB allocator but it's easier to 443 * should go in. Round the size down to the nearest power of two to
41 * just carry this on to newer kernels. 444 * find the right bucket.
42 */ 445 */
43 char name[128]; 446 for (node = rb_first(&tracker->allocs);
447 node != NULL;
448 node = rb_next(node)) {
449 int b;
450 u64 bucket_min;
451 struct nvgpu_mem_alloc *alloc;
452
453 alloc = container_of(node, struct nvgpu_mem_alloc,
454 allocs_entry);
455 bucket_min = (u64)rounddown_pow_of_two(alloc->size);
456 if (bucket_min < tracker->min_alloc)
457 bucket_min = tracker->min_alloc;
458
459 b = __ffs(bucket_min) - __ffs(pot_min);
460
461 /*
462 * Handle the one case where there's an alloc exactly as big as
463 * the maximum bucket size of the largest bucket. Most of the
464 * buckets have an inclusive minimum and exclusive maximum. But
465 * the largest bucket needs to have an _inclusive_ maximum as
466 * well.
467 */
468 if (b == (int)nr_buckets)
469 b--;
470
471 buckets[b]++;
472 }
473
474 total_allocs = 0;
475 for (i = 0; i < (int)nr_buckets; i++)
476 total_allocs += buckets[i];
477
478 __pstat(s, "Alloc histogram:\n");
479
480 /*
481 * Actually compute the histogram lines.
482 */
483 for (i = 0; i < (int)nr_buckets; i++) {
484 char this_line[sizeof(histogram_line) + 1];
485 u64 line_length;
486 u64 hr_bytes;
487 const char *hr_suffix;
488
489 memset(this_line, 0, sizeof(this_line));
490
491 /*
492 * Compute the normalized line length. Can't use floating point,
493 * so we will just multiply everything by 1000 and use fixed
494 * point.
495 */
496 line_length = (1000 * buckets[i]) / total_allocs;
497 line_length *= sizeof(histogram_line);
498 line_length /= 1000;
499
500 memset(this_line, '+', line_length);
501
502 __to_human_readable_bytes(1 << (__ffs(pot_min) + i),
503 &hr_bytes, &hr_suffix);
504 __pstat(s, " [%-4lld %-4lld] %-2s %5u | %s\n",
505 hr_bytes, hr_bytes << 1,
506 hr_suffix, buckets[i], this_line);
507 }
508}
509
510/**
511 * nvgpu_kmem_print_stats - Print kmem tracking stats.
512 *
513 * @tracker The tracker to pull data from.
514 * @s A seq_file to dump info into.
515 *
516 * Print stats from a tracker. If @s is non-null then seq_printf() will be
517 * used with @s. Otherwise the stats are pr_info()ed.
518 */
519void nvgpu_kmem_print_stats(struct nvgpu_mem_alloc_tracker *tracker,
520 struct seq_file *s)
521{
522 lock_tracker(tracker);
523
524 __pstat(s, "Mem tracker: %s\n\n", tracker->name);
525
526 __pstat(s, "Basic Stats:\n");
527 __pstat(s, " Number of allocs %lld\n",
528 tracker->nr_allocs);
529 __pstat(s, " Number of frees %lld\n",
530 tracker->nr_frees);
531 print_hr_bytes(s, " Smallest alloc ", tracker->min_alloc);
532 print_hr_bytes(s, " Largest alloc ", tracker->max_alloc);
533 print_hr_bytes(s, " Bytes allocated ", tracker->bytes_alloced);
534 print_hr_bytes(s, " Bytes freed ", tracker->bytes_freed);
535 print_hr_bytes(s, " Bytes allocated (real) ",
536 tracker->bytes_alloced_real);
537 print_hr_bytes(s, " Bytes freed (real) ",
538 tracker->bytes_freed_real);
539 __pstat(s, "\n");
540
541 print_histogram(tracker, s);
542
543 unlock_tracker(tracker);
544}
545
546#if defined(CONFIG_DEBUG_FS)
547static int __kmem_tracking_show(struct seq_file *s, void *unused)
548{
549 struct nvgpu_mem_alloc_tracker *tracker = s->private;
550
551 nvgpu_kmem_print_stats(tracker, s);
552
553 return 0;
554}
555
556static int __kmem_tracking_open(struct inode *inode, struct file *file)
557{
558 return single_open(file, __kmem_tracking_show, inode->i_private);
559}
560
561static const struct file_operations __kmem_tracking_fops = {
562 .open = __kmem_tracking_open,
563 .read = seq_read,
564 .llseek = seq_lseek,
565 .release = single_release,
566};
567
568static int __kmem_traces_dump_tracker(struct gk20a *g,
569 struct nvgpu_mem_alloc_tracker *tracker,
570 struct seq_file *s)
571{
572 struct rb_node *node;
573
574 for (node = rb_first(&tracker->allocs);
575 node != NULL;
576 node = rb_next(node)) {
577 struct nvgpu_mem_alloc *alloc;
578
579 alloc = container_of(node, struct nvgpu_mem_alloc,
580 allocs_entry);
581
582 kmem_print_mem_alloc(g, alloc, s);
583 }
584
585 return 0;
586}
587
588static int __kmem_traces_show(struct seq_file *s, void *unused)
589{
590 struct gk20a *g = s->private;
591
592 lock_tracker(g->vmallocs);
593 seq_puts(s, "Oustanding vmallocs:\n");
594 __kmem_traces_dump_tracker(g, g->vmallocs, s);
595 seq_puts(s, "\n");
596 unlock_tracker(g->vmallocs);
597
598 lock_tracker(g->kmallocs);
599 seq_puts(s, "Oustanding kmallocs:\n");
600 __kmem_traces_dump_tracker(g, g->kmallocs, s);
601 unlock_tracker(g->kmallocs);
602
603 return 0;
604}
605
606static int __kmem_traces_open(struct inode *inode, struct file *file)
607{
608 return single_open(file, __kmem_traces_show, inode->i_private);
609}
610
611static const struct file_operations __kmem_traces_fops = {
612 .open = __kmem_traces_open,
613 .read = seq_read,
614 .llseek = seq_lseek,
615 .release = single_release,
44}; 616};
45 617
618void nvgpu_kmem_debugfs_init(struct device *dev)
619{
620 struct gk20a_platform *plat = dev_get_drvdata(dev);
621 struct gk20a *g = get_gk20a(dev);
622 struct dentry *gpu_root = plat->debugfs;
623 struct dentry *node;
624
625 g->debugfs_kmem = debugfs_create_dir("kmem_tracking", gpu_root);
626 if (IS_ERR_OR_NULL(g->debugfs_kmem))
627 return;
628
629 node = debugfs_create_file(g->vmallocs->name, S_IRUGO,
630 g->debugfs_kmem,
631 g->vmallocs, &__kmem_tracking_fops);
632 node = debugfs_create_file(g->kmallocs->name, S_IRUGO,
633 g->debugfs_kmem,
634 g->kmallocs, &__kmem_tracking_fops);
635 node = debugfs_create_file("traces", S_IRUGO,
636 g->debugfs_kmem,
637 g, &__kmem_traces_fops);
638}
639#else
640void nvgpu_kmem_debugfs_init(struct device *dev)
641{
642}
643#endif
644
645static int __do_check_for_outstanding_allocs(
646 struct gk20a *g,
647 struct nvgpu_mem_alloc_tracker *tracker,
648 const char *type, bool silent)
649{
650 struct rb_node *node;
651 int count = 0;
652
653 for (node = rb_first(&tracker->allocs);
654 node != NULL;
655 node = rb_next(node)) {
656 struct nvgpu_mem_alloc *alloc;
657
658 alloc = container_of(node, struct nvgpu_mem_alloc,
659 allocs_entry);
660
661 if (!silent)
662 kmem_print_mem_alloc(g, alloc, NULL);
663
664 count++;
665 }
666
667 return count;
668}
669
670/**
671 * check_for_outstanding_allocs - Count and display outstanding allocs
672 *
673 * @g - The GPU.
674 * @silent - If set don't print anything about the allocs.
675 *
676 * Dump (or just count) the number of allocations left outstanding.
677 */
678static int check_for_outstanding_allocs(struct gk20a *g, bool silent)
679{
680 int count = 0;
681
682 count += __do_check_for_outstanding_allocs(g, g->kmallocs, "kmalloc",
683 silent);
684 count += __do_check_for_outstanding_allocs(g, g->vmallocs, "vmalloc",
685 silent);
686
687 return count;
688}
689
690static void do_nvgpu_kmem_cleanup(struct nvgpu_mem_alloc_tracker *tracker,
691 void (*force_free_func)(const void *))
692{
693 struct rb_node *node;
694
695 while ((node = rb_first(&tracker->allocs)) != NULL) {
696 struct nvgpu_mem_alloc *alloc;
697
698 alloc = container_of(node, struct nvgpu_mem_alloc,
699 allocs_entry);
700 if (force_free_func)
701 force_free_func((void *)(uintptr_t)alloc->addr);
702
703 kfree(alloc);
704 }
705}
706
707/**
708 * nvgpu_kmem_cleanup - Cleanup the kmem tracking
709 *
710 * @g - The GPU.
711 * @force_free - If set will also free leaked objects if possible.
712 *
713 * Cleanup all of the allocs made by nvgpu_kmem tracking code. If @force_free
714 * is non-zero then the allocations made by nvgpu are also freed. This is risky,
715 * though, as it is possible that the memory is still in use by other parts of
716 * the GPU driver not aware that this has happened.
717 *
718 * In theory it should be fine if the GPU driver has been deinitialized and
719 * there are no bugs in that code. However, if there are any bugs in that code
720 * then they could likely manifest as odd crashes at indeterminate times
721 * in the future. So use @force_free at your own risk.
722 */
723static void nvgpu_kmem_cleanup(struct gk20a *g, bool force_free)
724{
725 do_nvgpu_kmem_cleanup(g->kmallocs, force_free ? kfree : NULL);
726 do_nvgpu_kmem_cleanup(g->vmallocs, force_free ? vfree : NULL);
727}
728
729void nvgpu_kmem_fini(struct gk20a *g, int flags)
730{
731 int count;
732 bool silent, force_free;
733
734 if (!flags)
735 return;
736
737 silent = !(flags & NVGPU_KMEM_FINI_DUMP_ALLOCS);
738 force_free = !!(flags & NVGPU_KMEM_FINI_FORCE_CLEANUP);
739
740 count = check_for_outstanding_allocs(g, silent);
741 nvgpu_kmem_cleanup(g, force_free);
742
743 /*
744 * If we leak objects we can either BUG() out or just WARN(). In general
745 * it doesn't make sense to BUG() here since leaking a few objects
746 * won't crash the kernel but it can be helpful for development.
747 *
748 * If neither flag is set then we just silently do nothing.
749 */
750 if (count > 0) {
751 if (flags & NVGPU_KMEM_FINI_WARN) {
752 WARN(1, "Letting %d allocs leak!!\n", count);
753 } else if (flags & NVGPU_KMEM_FINI_BUG) {
754 gk20a_err(g->dev, "Letting %d allocs leak!!\n", count);
755 BUG();
756 }
757 }
758}
759
760int nvgpu_kmem_init(struct gk20a *g)
761{
762 int err;
763
764 g->vmallocs = kzalloc(sizeof(*g->vmallocs), GFP_KERNEL);
765 g->kmallocs = kzalloc(sizeof(*g->kmallocs), GFP_KERNEL);
766
767 if (!g->vmallocs || !g->kmallocs) {
768 err = -ENOMEM;
769 goto fail;
770 }
771
772 g->vmallocs->name = "vmalloc";
773 g->kmallocs->name = "kmalloc";
774
775 g->vmallocs->allocs = RB_ROOT;
776 g->kmallocs->allocs = RB_ROOT;
777
778 mutex_init(&g->vmallocs->lock);
779 mutex_init(&g->kmallocs->lock);
780
781 g->vmallocs->min_alloc = PAGE_SIZE;
782 g->kmallocs->min_alloc = KMALLOC_MIN_SIZE;
783
784 /*
785 * This needs to go after all the other initialization since they use
786 * the nvgpu_kzalloc() API.
787 */
788 g->vmallocs->allocs_cache = nvgpu_kmem_cache_create(g,
789 sizeof(struct nvgpu_mem_alloc));
790 g->kmallocs->allocs_cache = nvgpu_kmem_cache_create(g,
791 sizeof(struct nvgpu_mem_alloc));
792
793 if (!g->vmallocs->allocs_cache || !g->kmallocs->allocs_cache) {
794 err = -ENOMEM;
795 if (g->vmallocs->allocs_cache)
796 nvgpu_kmem_cache_destroy(g->vmallocs->allocs_cache);
797 if (g->kmallocs->allocs_cache)
798 nvgpu_kmem_cache_destroy(g->kmallocs->allocs_cache);
799 goto fail;
800 }
801
802 return 0;
803
804fail:
805 if (g->vmallocs)
806 kfree(g->vmallocs);
807 if (g->kmallocs)
808 kfree(g->kmallocs);
809 return err;
810}
811
812#else /* !CONFIG_NVGPU_TRACK_MEM_USAGE */
813
814int nvgpu_kmem_init(struct gk20a *g)
815{
816 return 0;
817}
818
819void nvgpu_kmem_fini(struct gk20a *g, int flags)
820{
821}
822#endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */
823
46struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size) 824struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size)
47{ 825{
48 struct nvgpu_kmem_cache *cache = 826 struct nvgpu_kmem_cache *cache =
49 kzalloc(sizeof(struct nvgpu_kmem_cache), GFP_KERNEL); 827 nvgpu_kzalloc(g, sizeof(struct nvgpu_kmem_cache));
50 828
51 if (!cache) 829 if (!cache)
52 return NULL; 830 return NULL;
@@ -59,7 +837,7 @@ struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size)
59 cache->cache = kmem_cache_create(cache->name, 837 cache->cache = kmem_cache_create(cache->name,
60 size, size, 0, NULL); 838 size, size, 0, NULL);
61 if (!cache->cache) { 839 if (!cache->cache) {
62 kfree(cache); 840 nvgpu_kfree(g, cache);
63 return NULL; 841 return NULL;
64 } 842 }
65 843
@@ -68,8 +846,10 @@ struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size)
68 846
69void nvgpu_kmem_cache_destroy(struct nvgpu_kmem_cache *cache) 847void nvgpu_kmem_cache_destroy(struct nvgpu_kmem_cache *cache)
70{ 848{
849 struct gk20a *g = cache->g;
850
71 kmem_cache_destroy(cache->cache); 851 kmem_cache_destroy(cache->cache);
72 kfree(cache); 852 nvgpu_kfree(g, cache);
73} 853}
74 854
75void *nvgpu_kmem_cache_alloc(struct nvgpu_kmem_cache *cache) 855void *nvgpu_kmem_cache_alloc(struct nvgpu_kmem_cache *cache)
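
To show how the tracked allocators above are meant to be consumed, here is a hedged sketch of a caller going through the nvgpu_kzalloc()/nvgpu_kfree() macros from <nvgpu/kmem.h>; struct example_entry and the function are hypothetical:

#include <nvgpu/kmem.h>

/* Hypothetical bookkeeping struct allocated through the wrappers. */
struct example_entry {
	int id;
	u64 payload;
};

static int example_use_tracked_kmem(struct gk20a *g)
{
	struct example_entry *e;

	/*
	 * With CONFIG_NVGPU_TRACK_MEM_USAGE=y this expands to
	 * __nvgpu_kzalloc(g, size, _THIS_IP_), which calls kzalloc() and
	 * then records size, address and call site in g->kmallocs.
	 */
	e = nvgpu_kzalloc(g, sizeof(*e));
	if (!e)
		return -ENOMEM;

	e->id = 42;

	/* Freeing removes the rb-tree entry and updates the free stats. */
	nvgpu_kfree(g, e);
	return 0;
}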
diff --git a/drivers/gpu/nvgpu/common/linux/kmem_priv.h b/drivers/gpu/nvgpu/common/linux/kmem_priv.h
new file mode 100644
index 00000000..5e38ad5d
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/kmem_priv.h
@@ -0,0 +1,90 @@
1/*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#ifndef __KMEM_PRIV_H__
18#define __KMEM_PRIV_H__
19
20#include <linux/rbtree.h>
21
22#define __pstat(s, fmt, msg...) \
23 do { \
24 if (s) \
25 seq_printf(s, fmt, ##msg); \
26 else \
27 pr_info(fmt, ##msg); \
28 } while (0)
29
30#define MAX_STACK_TRACE 20
31
32/*
33 * Linux specific version of the nvgpu_kmem_cache struct. This type is
34 * completely opaque to the rest of the driver.
35 */
36struct nvgpu_kmem_cache {
37 struct gk20a *g;
38 struct kmem_cache *cache;
39
40 /*
41 * Memory to hold the kmem_cache unique name. Only necessary on our
42 * k3.10 kernel when not using the SLUB allocator but it's easier to
43 * just carry this on to newer kernels.
44 */
45 char name[128];
46};
47
48#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
49
50struct nvgpu_mem_alloc {
51 struct nvgpu_mem_alloc_tracker *owner;
52
53 void *ip;
54#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES
55 unsigned long stack[MAX_STACK_TRACE];
56 int stack_length;
57#endif
58
59 u64 addr;
60
61 unsigned long size;
62 unsigned long real_size;
63
64 /* Ugh - linux specific. Will need to be abstracted. */
65 struct rb_node allocs_entry;
66};
67
68/*
69 * Linux specific tracking of vmalloc, kmalloc, etc.
70 */
71struct nvgpu_mem_alloc_tracker {
72 const char *name;
73 struct nvgpu_kmem_cache *allocs_cache;
74 struct rb_root allocs;
75 struct mutex lock;
76
77 u64 bytes_alloced;
78 u64 bytes_freed;
79 u64 bytes_alloced_real;
80 u64 bytes_freed_real;
81 u64 nr_allocs;
82 u64 nr_frees;
83
84 unsigned long min_alloc;
85 unsigned long max_alloc;
86};
87
88#endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */
89
90#endif /* __KMEM_PRIV_H__ */
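
The allocs rb_root above is walked with the kernel's standard rb_first()/rb_next() iteration, as the tracking code in kmem.c does. A hedged sketch of that pattern (the function is hypothetical; the caller must hold tracker->lock):

#include <linux/rbtree.h>

#include "kmem_priv.h"

/* Sum the sizes of all outstanding allocations in a tracker. */
static u64 example_outstanding_bytes(struct nvgpu_mem_alloc_tracker *tracker)
{
	struct rb_node *node;
	u64 bytes = 0;

	/* rb_first()/rb_next() visit entries in ascending addr order. */
	for (node = rb_first(&tracker->allocs); node; node = rb_next(node)) {
		struct nvgpu_mem_alloc *alloc =
			rb_entry(node, struct nvgpu_mem_alloc, allocs_entry);

		bytes += alloc->size;
	}

	return bytes;
}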
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index f228110e..68e43259 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -986,7 +986,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
986 memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub)); 986 memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub));
987 987
988 gk20a_gmmu_unmap_free(ch_vm, &ch->gpfifo.mem); 988 gk20a_gmmu_unmap_free(ch_vm, &ch->gpfifo.mem);
989 nvgpu_big_free(ch->gpfifo.pipe); 989 nvgpu_big_free(g, ch->gpfifo.pipe);
990 memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc)); 990 memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
991 991
992#if defined(CONFIG_GK20A_CYCLE_STATS) 992#if defined(CONFIG_GK20A_CYCLE_STATS)
@@ -1856,7 +1856,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
1856 } 1856 }
1857 1857
1858 if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) { 1858 if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
1859 c->gpfifo.pipe = nvgpu_big_malloc( 1859 c->gpfifo.pipe = nvgpu_big_malloc(g,
1860 gpfifo_size * sizeof(struct nvgpu_gpfifo)); 1860 gpfifo_size * sizeof(struct nvgpu_gpfifo));
1861 if (!c->gpfifo.pipe) { 1861 if (!c->gpfifo.pipe) {
1862 err = -ENOMEM; 1862 err = -ENOMEM;
@@ -1927,7 +1927,7 @@ clean_up_sync:
1927 c->sync = NULL; 1927 c->sync = NULL;
1928 } 1928 }
1929clean_up_unmap: 1929clean_up_unmap:
1930 nvgpu_big_free(c->gpfifo.pipe); 1930 nvgpu_big_free(g, c->gpfifo.pipe);
1931 gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem); 1931 gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem);
1932clean_up: 1932clean_up:
1933 memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc)); 1933 memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
@@ -2057,12 +2057,12 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
2057 if (!g) { 2057 if (!g) {
2058 size = count * sizeof(struct nvgpu_gpfifo); 2058 size = count * sizeof(struct nvgpu_gpfifo);
2059 if (size) { 2059 if (size) {
2060 g = nvgpu_big_malloc(size); 2060 g = nvgpu_big_malloc(c->g, size);
2061 if (!g) 2061 if (!g)
2062 return; 2062 return;
2063 2063
2064 if (copy_from_user(g, user_gpfifo, size)) { 2064 if (copy_from_user(g, user_gpfifo, size)) {
2065 nvgpu_big_free(g); 2065 nvgpu_big_free(c->g, g);
2066 return; 2066 return;
2067 } 2067 }
2068 } 2068 }
@@ -2074,7 +2074,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
2074 trace_write_pushbuffer(c, gp); 2074 trace_write_pushbuffer(c, gp);
2075 2075
2076 if (gpfifo_allocated) 2076 if (gpfifo_allocated)
2077 nvgpu_big_free(g); 2077 nvgpu_big_free(c->g, g);
2078} 2078}
2079 2079
2080static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) 2080static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
index 4a42e03f..0a0aada7 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -819,7 +819,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s,
819 goto fail_dmabuf_put; 819 goto fail_dmabuf_put;
820 } 820 }
821 821
822 buffer = nvgpu_big_zalloc(access_limit_size); 822 buffer = nvgpu_big_zalloc(g, access_limit_size);
823 if (!buffer) { 823 if (!buffer) {
824 err = -ENOMEM; 824 err = -ENOMEM;
825 goto fail_dmabuf_put; 825 goto fail_dmabuf_put;
@@ -865,7 +865,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s,
865fail_idle: 865fail_idle:
866 gk20a_idle(g->dev); 866 gk20a_idle(g->dev);
867fail_free_buffer: 867fail_free_buffer:
868 nvgpu_big_free(buffer); 868 nvgpu_big_free(g, buffer);
869fail_dmabuf_put: 869fail_dmabuf_put:
870 dma_buf_put(dmabuf); 870 dma_buf_put(dmabuf);
871 871
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
index 67f9b532..6341a962 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
@@ -21,6 +21,7 @@
21#include <linux/io.h> 21#include <linux/io.h>
22 22
23#include <nvgpu/semaphore.h> 23#include <nvgpu/semaphore.h>
24#include <nvgpu/kmem.h>
24 25
25#include "gk20a.h" 26#include "gk20a.h"
26#include "debug_gk20a.h" 27#include "debug_gk20a.h"
@@ -485,6 +486,9 @@ void gk20a_debug_init(struct device *dev, const char *debugfs_symlink)
485 gk20a_mm_debugfs_init(g->dev); 486 gk20a_mm_debugfs_init(g->dev);
486 gk20a_fifo_debugfs_init(g->dev); 487 gk20a_fifo_debugfs_init(g->dev);
487 gk20a_sched_debugfs_init(g->dev); 488 gk20a_sched_debugfs_init(g->dev);
489#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
490 nvgpu_kmem_debugfs_init(g->dev);
491#endif
488#endif 492#endif
489 493
490} 494}
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 3504a32f..6b026ee2 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -43,6 +43,7 @@
43#include <linux/version.h> 43#include <linux/version.h>
44 44
45#include <nvgpu/nvgpu_common.h> 45#include <nvgpu/nvgpu_common.h>
46#include <nvgpu/kmem.h>
46#include <nvgpu/allocator.h> 47#include <nvgpu/allocator.h>
47#include <nvgpu/timers.h> 48#include <nvgpu/timers.h>
48 49
@@ -1598,6 +1599,8 @@ static int gk20a_probe(struct platform_device *dev)
1598 set_gk20a(dev, gk20a); 1599 set_gk20a(dev, gk20a);
1599 gk20a->dev = &dev->dev; 1600 gk20a->dev = &dev->dev;
1600 1601
1602 nvgpu_kmem_init(gk20a);
1603
1601 gk20a->irq_stall = platform_get_irq(dev, 0); 1604 gk20a->irq_stall = platform_get_irq(dev, 0);
1602 gk20a->irq_nonstall = platform_get_irq(dev, 1); 1605 gk20a->irq_nonstall = platform_get_irq(dev, 1);
1603 if (gk20a->irq_stall < 0 || gk20a->irq_nonstall < 0) 1606 if (gk20a->irq_stall < 0 || gk20a->irq_nonstall < 0)
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 8006a4fe..69528c1f 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -27,6 +27,7 @@ struct gk20a_ctxsw_ucode_segments;
27struct gk20a_fecs_trace; 27struct gk20a_fecs_trace;
28struct gk20a_ctxsw_trace; 28struct gk20a_ctxsw_trace;
29struct acr_desc; 29struct acr_desc;
30struct nvgpu_mem_alloc_tracker;
30 31
31#include <linux/sched.h> 32#include <linux/sched.h>
32#include <nvgpu/lock.h> 33#include <nvgpu/lock.h>
@@ -915,6 +916,7 @@ struct gk20a {
915 struct dentry *debugfs_runlist_interleave; 916 struct dentry *debugfs_runlist_interleave;
916 struct dentry *debugfs_allocators; 917 struct dentry *debugfs_allocators;
917 struct dentry *debugfs_xve; 918 struct dentry *debugfs_xve;
919 struct dentry *debugfs_kmem;
918#endif 920#endif
919 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; 921 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
920 922
@@ -1055,6 +1057,10 @@ struct gk20a {
1055 /* Check if msi is enabled */ 1057 /* Check if msi is enabled */
1056 bool msi_enabled; 1058 bool msi_enabled;
1057#endif 1059#endif
1060#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
1061 struct nvgpu_mem_alloc_tracker *vmallocs;
1062 struct nvgpu_mem_alloc_tracker *kmallocs;
1063#endif
1058}; 1064};
1059 1065
1060static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g) 1066static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g)
@@ -1131,6 +1137,7 @@ enum gk20a_dbg_categories {
1131 gpu_dbg_pmu_pstate = BIT(17), /* p state controlled by pmu */ 1137 gpu_dbg_pmu_pstate = BIT(17), /* p state controlled by pmu */
1132 gpu_dbg_xv = BIT(18), /* XVE debugging */ 1138 gpu_dbg_xv = BIT(18), /* XVE debugging */
1133 gpu_dbg_shutdown = BIT(19), /* GPU shutdown tracing */ 1139 gpu_dbg_shutdown = BIT(19), /* GPU shutdown tracing */
1140 gpu_dbg_kmem = BIT(20), /* Kmem tracking debugging */
1134 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ 1141 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
1135}; 1142};
1136 1143
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 36b85f3b..e695f02e 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -3424,7 +3424,7 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
3424 gr->ctx_vars.local_golden_image = NULL; 3424 gr->ctx_vars.local_golden_image = NULL;
3425 3425
3426 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) 3426 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map)
3427 nvgpu_big_free(gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); 3427 nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map);
3428 gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL; 3428 gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL;
3429 3429
3430 gk20a_comptag_allocator_destroy(&gr->comp_tags); 3430 gk20a_comptag_allocator_destroy(&gr->comp_tags);
@@ -8055,7 +8055,7 @@ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g)
8055 hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2; 8055 hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2;
8056 map_size = hwpm_ctxsw_reg_count_max * sizeof(*map); 8056 map_size = hwpm_ctxsw_reg_count_max * sizeof(*map);
8057 8057
8058 map = nvgpu_big_zalloc(map_size); 8058 map = nvgpu_big_zalloc(g, map_size);
8059 if (!map) 8059 if (!map)
8060 return -ENOMEM; 8060 return -ENOMEM;
8061 8061
@@ -8145,7 +8145,7 @@ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g)
8145 return 0; 8145 return 0;
8146cleanup: 8146cleanup:
8147 gk20a_err(dev_from_gk20a(g), "Failed to create HWPM buffer offset map"); 8147 gk20a_err(dev_from_gk20a(g), "Failed to create HWPM buffer offset map");
8148 nvgpu_big_free(map); 8148 nvgpu_big_free(g, map);
8149 return -EINVAL; 8149 return -EINVAL;
8150} 8150}
8151 8151
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 7a64f79b..2ff54653 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1487,8 +1487,8 @@ int gk20a_vm_get_buffers(struct vm_gk20a *vm,
1487 1487
1488 nvgpu_mutex_acquire(&vm->update_gmmu_lock); 1488 nvgpu_mutex_acquire(&vm->update_gmmu_lock);
1489 1489
1490 buffer_list = nvgpu_big_zalloc(sizeof(*buffer_list) * 1490 buffer_list = nvgpu_big_zalloc(vm->mm->g, sizeof(*buffer_list) *
1491 vm->num_user_mapped_buffers); 1491 vm->num_user_mapped_buffers);
1492 if (!buffer_list) { 1492 if (!buffer_list) {
1493 nvgpu_mutex_release(&vm->update_gmmu_lock); 1493 nvgpu_mutex_release(&vm->update_gmmu_lock);
1494 return -ENOMEM; 1494 return -ENOMEM;
@@ -1572,7 +1572,7 @@ void gk20a_vm_put_buffers(struct vm_gk20a *vm,
1572 gk20a_vm_mapping_batch_finish_locked(vm, &batch); 1572 gk20a_vm_mapping_batch_finish_locked(vm, &batch);
1573 nvgpu_mutex_release(&vm->update_gmmu_lock); 1573 nvgpu_mutex_release(&vm->update_gmmu_lock);
1574 1574
1575 nvgpu_big_free(mapped_buffers); 1575 nvgpu_big_free(vm->mm->g, mapped_buffers);
1576} 1576}
1577 1577
1578static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset, 1578static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/kmem.h b/drivers/gpu/nvgpu/include/nvgpu/kmem.h
index c08e40a6..59192525 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/kmem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/kmem.h
@@ -14,18 +14,21 @@
14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */ 15 */
16 16
17#ifndef NVGPU_KMEM_H 17#ifndef __NVGPU_KMEM_H__
18#define NVGPU_KMEM_H 18#define __NVGPU_KMEM_H__
19 19
20#include <linux/mm.h> 20/*
21#include <linux/slab.h> 21 * Incase this isn't defined already.
22#include <linux/vmalloc.h> 22 */
23 23#ifndef _THIS_IP_
24#include <asm/page.h> 24#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
25#endif
25 26
26struct gk20a; 27struct gk20a;
27 28
28/* 29/**
30 * DOC: Kmem cache support
31 *
29 * In Linux there is support for the notion of a kmem_cache. It gives better 32 * In Linux there is support for the notion of a kmem_cache. It gives better
30 * memory usage characteristics for lots of allocations of the same size. Think 33 * memory usage characteristics for lots of allocations of the same size. Think
31 * structs that get allocated over and over. Normal kmalloc() type routines 34 * structs that get allocated over and over. Normal kmalloc() type routines
@@ -37,26 +40,200 @@ struct gk20a;
37 */ 40 */
38struct nvgpu_kmem_cache; 41struct nvgpu_kmem_cache;
39 42
43#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
44/*
45 * Uncomment this if you want to enable stack traces in the memory profiling.
46 * Since this is a fairly high overhead operation and is only necessary for
47 * debugging actual bugs, it's left here for developers to enable.
48 */
49/* #define __NVGPU_SAVE_KALLOC_STACK_TRACES */
50
51/*
52 * Defined per-OS.
53 */
54struct nvgpu_mem_alloc_tracker;
55#endif
56
57
58/**
59 * nvgpu_kmem_cache_create - create an nvgpu kernel memory cache.
60 *
61 * @g The GPU driver struct using this cache.
62 * @size Size of the object allocated by the cache.
63 *
64 * This cache can be used to allocate objects of size @size. Common usage would
65 * be for a struct that gets allocated a lot. In that case @size should be
66 * sizeof(struct my_struct).
67 *
68 * A given implementation of this need not do anything special. The allocation
69 * routines can simply be passed on to nvgpu_kzalloc() if desired, so packing
70 * and alignment of the structs cannot be assumed.
71 */
40struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size); 72struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size);
73
74/**
75 * nvgpu_kmem_cache_destroy - destroy a cache created by
76 * nvgpu_kmem_cache_create().
77 *
78 * @cache The cache to destroy.
79 */
41void nvgpu_kmem_cache_destroy(struct nvgpu_kmem_cache *cache); 80void nvgpu_kmem_cache_destroy(struct nvgpu_kmem_cache *cache);
42 81
82/**
83 * nvgpu_kmem_cache_alloc - Allocate an object from the cache
84 *
85 * @cache The cache to alloc from.
86 */
43void *nvgpu_kmem_cache_alloc(struct nvgpu_kmem_cache *cache); 87void *nvgpu_kmem_cache_alloc(struct nvgpu_kmem_cache *cache);
88
89/**
90 * nvgpu_kmem_cache_free - Free an object back to a cache
91 *
92 * @cache The cache to return the object to.
93 * @ptr Pointer to the object to free.
94 */
44void nvgpu_kmem_cache_free(struct nvgpu_kmem_cache *cache, void *ptr); 95void nvgpu_kmem_cache_free(struct nvgpu_kmem_cache *cache, void *ptr);
45 96
46static inline void *__nvgpu_big_alloc(size_t size, bool clear) 97/**
98 * nvgpu_kmalloc - Allocate from the kernel's allocator.
99 *
100 * @g: Current GPU.
101 * @size: Size of the allocation.
102 *
103 * Allocate a chunk of system memory from the kernel. Allocations larger than 1
104 * page may fail even when there appears to be enough memory.
105 *
106 * This function may sleep, so it cannot be used in IRQ context.
107 */
108#define nvgpu_kmalloc(g, size) __nvgpu_kmalloc(g, size, _THIS_IP_)
109
110/**
111 * nvgpu_kzalloc - Allocate from the kernel's allocator.
112 *
113 * @g: Current GPU.
114 * @size: Size of the allocation.
115 *
116 * Identical to nvgpu_kmalloc() except the memory will be zeroed before being
117 * returned.
118 */
119#define nvgpu_kzalloc(g, size) __nvgpu_kzalloc(g, size, _THIS_IP_)
120
121/**
122 * nvgpu_kcalloc - Allocate from the kernel's allocator.
123 *
124 * @g: Current GPU.
125 * @n: Number of objects.
126 * @size: Size of each object.
127 *
128 * Identical to nvgpu_kmalloc() except the size of the memory chunk returned is
129 * @n * @size.
130 */
131#define nvgpu_kcalloc(g, n, size) __nvgpu_kcalloc(g, n, size, _THIS_IP_)
132
133/**
134 * nvgpu_vmalloc - Allocate memory and return a map to it.
135 *
136 * @g: Current GPU.
137 * @size: Size of the allocation.
138 *
139 * Allocate some memory and return a pointer to a virtual memory mapping of
140 * that memory in the kernel's virtual address space. The underlying physical
141 * memory is not guaranteed to be contiguous (and indeed likely isn't). This
142 * allows for much larger allocations to be done without worrying as much
143 * about physical memory fragmentation.
144 *
145 * This function may sleep.
146 */
147#define nvgpu_vmalloc(g, size) __nvgpu_vmalloc(g, size, _THIS_IP_)
148
149/**
150 * nvgpu_vzalloc - Allocate memory and return a map to it.
151 *
152 * @g: Current GPU.
153 * @size: Size of the allocation.
154 *
155 * Identical to nvgpu_vmalloc() except this will return zeroed memory.
156 */
157#define nvgpu_vzalloc(g, size) __nvgpu_vzalloc(g, size, _THIS_IP_)
158
159/**
160 * nvgpu_kfree - Frees an alloc from nvgpu_kmalloc, nvgpu_kzalloc,
161 * nvgpu_kcalloc.
162 *
163 * @g: Current GPU.
164 * @addr: Address of object to free.
165 */
166#define nvgpu_kfree(g, addr) __nvgpu_kfree(g, addr)
167
168/**
169 * nvgpu_vfree - Frees an alloc from nvgpu_vmalloc, nvgpu_vzalloc.
170 *
171 * @g: Current GPU.
172 * @addr: Address of object to free.
173 */
174#define nvgpu_vfree(g, addr) __nvgpu_vfree(g, addr)
175
176#define kmem_dbg(fmt, args...) \
177 gk20a_dbg(gpu_dbg_kmem, fmt, ##args)
178
179/**
180 * nvgpu_kmem_init - Initialize the kmem tracking code.
181 *
182 * @g: The driver to init.
183 *
184 * Returns non-zero on failure.
185 */
186int nvgpu_kmem_init(struct gk20a *g);
187
188/**
189 * nvgpu_kmem_fini - Finalize the kmem tracking code
190 *
191 * @g - The GPU.
192 * @flags - Flags that control operation of this finalization.
193 *
194 * Cleanup resources used by nvgpu_kmem. Available flags for cleanup are:
195 *
196 * %NVGPU_KMEM_FINI_DO_NOTHING
197 * %NVGPU_KMEM_FINI_FORCE_CLEANUP
198 * %NVGPU_KMEM_FINI_DUMP_ALLOCS
199 * %NVGPU_KMEM_FINI_WARN
200 * %NVGPU_KMEM_FINI_BUG
201 *
202 * %NVGPU_KMEM_FINI_DO_NOTHING will be overridden by anything else specified.
203 * Put another way: don't just add %NVGPU_KMEM_FINI_DO_NOTHING and expect that
204 * to suppress other flags from doing anything.
205 */
206void nvgpu_kmem_fini(struct gk20a *g, int flags);
207
208/*
209 * These will simply be ignored if CONFIG_NVGPU_TRACK_MEM_USAGE is not defined.
210 */
211#define NVGPU_KMEM_FINI_DO_NOTHING 0
212#define NVGPU_KMEM_FINI_FORCE_CLEANUP (1 << 0)
213#define NVGPU_KMEM_FINI_DUMP_ALLOCS (1 << 1)
214#define NVGPU_KMEM_FINI_WARN (1 << 2)
215#define NVGPU_KMEM_FINI_BUG (1 << 3)
216
217/*
218 * When there's other implementations make sure they are included instead of
219 * Linux when not compiling on Linux!
220 */
221#include <nvgpu/kmem_linux.h>
222
223static inline void *__nvgpu_big_alloc(struct gk20a *g, size_t size, bool clear)
47{ 224{
48 void *p; 225 void *p;
49 226
50 if (size > PAGE_SIZE) { 227 if (size > PAGE_SIZE) {
51 if (clear) 228 if (clear)
52 p = vzalloc(size); 229 p = nvgpu_vzalloc(g, size);
53 else 230 else
54 p = vmalloc(size); 231 p = nvgpu_vmalloc(g, size);
55 } else { 232 } else {
56 if (clear) 233 if (clear)
57 p = kzalloc(size, GFP_KERNEL); 234 p = nvgpu_kzalloc(g, size);
58 else 235 else
59 p = kmalloc(size, GFP_KERNEL); 236 p = nvgpu_kmalloc(g, size);
60 } 237 }
61 238
62 return p; 239 return p;
@@ -65,6 +242,7 @@ static inline void *__nvgpu_big_alloc(size_t size, bool clear)
65/** 242/**
66 * nvgpu_big_malloc - Pick virtual or physical alloc based on @size 243 * nvgpu_big_malloc - Pick virtual or physical alloc based on @size
67 * 244 *
245 * @g - The GPU.
68 * @size - Size of the allocation. 246 * @size - Size of the allocation.
69 * 247 *
70 * On some platforms (i.e Linux) it is possible to allocate memory directly 248 * On some platforms (i.e Linux) it is possible to allocate memory directly
@@ -83,30 +261,31 @@ static inline void *__nvgpu_big_alloc(size_t size, bool clear)
83 * Returns a pointer to a virtual address range that the kernel can access or 261 * Returns a pointer to a virtual address range that the kernel can access or
84 * %NULL on failure. 262 * %NULL on failure.
85 */ 263 */
86static inline void *nvgpu_big_malloc(size_t size) 264static inline void *nvgpu_big_malloc(struct gk20a *g, size_t size)
87{ 265{
88 return __nvgpu_big_alloc(size, false); 266 return __nvgpu_big_alloc(g, size, false);
89} 267}
90 268
91/** 269/**
92 * nvgpu_big_zalloc - Pick virtual or physical alloc based on @size 270
93 * 271 *
272 * @g - The GPU.
94 * @size - Size of the allocation. 273 * @size - Size of the allocation.
95 * 274 *
96 * Zeroed memory version of nvgpu_big_malloc(). 275 * Zeroed memory version of nvgpu_big_malloc().
97 */ 276 */
98static inline void *nvgpu_big_zalloc(size_t size) 277static inline void *nvgpu_big_zalloc(struct gk20a *g, size_t size)
99{ 278{
100 return __nvgpu_big_alloc(size, true); 279 return __nvgpu_big_alloc(g, size, true);
101} 280}
102 281
103/** 282/**
104 * nvgpu_big_free - Free and alloc from nvgpu_big_zalloc() or 283 * nvgpu_big_free - Free and alloc from nvgpu_big_zalloc() or
105 * nvgpu_big_malloc(). 284 * nvgpu_big_malloc().
106 * 285 * @g - The GPU.
107 * @p - A pointer allocated by nvgpu_big_zalloc() or nvgpu_big_malloc(). 286 * @p - A pointer allocated by nvgpu_big_zalloc() or nvgpu_big_malloc().
108 */ 287 */
109static inline void nvgpu_big_free(void *p) 288static inline void nvgpu_big_free(struct gk20a *g, void *p)
110{ 289{
111 /* 290 /*
112 * This will have to be fixed eventually. Allocs that use 291 * This will have to be fixed eventually. Allocs that use
@@ -114,9 +293,9 @@ static inline void nvgpu_big_free(void *p)
114 * when freeing. 293 * when freeing.
115 */ 294 */
116 if (virt_addr_valid(p)) 295 if (virt_addr_valid(p))
117 kfree(p); 296 nvgpu_kfree(g, p);
118 else 297 else
119 vfree(p); 298 nvgpu_vfree(g, p);
120} 299}
121 300
122#endif 301#endif /* __NVGPU_KMEM_H__ */
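
A hedged usage sketch of the nvgpu_big_*() helpers documented above, showing the size-based dispatch between the kernel and virtual allocators; the function and its nr parameter are hypothetical:

/* Allocate a table whose size may or may not exceed PAGE_SIZE. */
static int example_big_alloc(struct gk20a *g, unsigned long nr)
{
	u64 *table;

	/*
	 * Sizes <= PAGE_SIZE land in nvgpu_kzalloc(); larger sizes fall
	 * back to nvgpu_vzalloc() transparently.
	 */
	table = nvgpu_big_zalloc(g, nr * sizeof(*table));
	if (!table)
		return -ENOMEM;

	table[0] = 1;

	/* nvgpu_big_free() uses virt_addr_valid() to pick kfree()/vfree(). */
	nvgpu_big_free(g, table);
	return 0;
}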
diff --git a/drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h b/drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h
new file mode 100644
index 00000000..d1cd27f3
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h
@@ -0,0 +1,123 @@
1/*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#ifndef __NVGPU_KMEM_LINUX_H__
18#define __NVGPU_KMEM_LINUX_H__
19
20#include <linux/mm.h>
21#include <linux/slab.h>
22#include <linux/vmalloc.h>
23
24#include <asm/page.h>
25
26struct gk20a;
27struct device;
28
29#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
30void *__nvgpu_track_vmalloc(struct gk20a *g, unsigned long size,
31 unsigned long ip);
32void *__nvgpu_track_vzalloc(struct gk20a *g, unsigned long size,
33 unsigned long ip);
34void *__nvgpu_track_kmalloc(struct gk20a *g, size_t size, unsigned long ip);
35void *__nvgpu_track_kzalloc(struct gk20a *g, size_t size, unsigned long ip);
36void *__nvgpu_track_kcalloc(struct gk20a *g, size_t n, size_t size,
37 unsigned long ip);
38void __nvgpu_track_vfree(struct gk20a *g, void *addr);
39void __nvgpu_track_kfree(struct gk20a *g, void *addr);
40
41void nvgpu_kmem_debugfs_init(struct device *dev);
42#else
43static inline void nvgpu_kmem_debugfs_init(struct device *dev)
44{
45}
46#endif
47
48/**
49 * DOC: Linux pass through kmem implementation.
50 *
51 * These are the Linux implementations of the various kmem functions defined by
52 * nvgpu. This should not be included directly - instead include <nvgpu/kmem.h>.
53 */
54
55static inline void *__nvgpu_kmalloc(struct gk20a *g, unsigned long size,
56 unsigned long ip)
57{
58#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
59 return __nvgpu_track_kmalloc(g, size, ip);
60#else
61 return kmalloc(size, GFP_KERNEL);
62#endif
63}
64
65static inline void *__nvgpu_kzalloc(struct gk20a *g, size_t size,
66 unsigned long ip)
67{
68#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
69 return __nvgpu_track_kzalloc(g, size, ip);
70#else
71 return kzalloc(size, GFP_KERNEL);
72#endif
73}
74
75static inline void *__nvgpu_kcalloc(struct gk20a *g, size_t n, size_t size,
76 unsigned long ip)
77{
78#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
79 return __nvgpu_track_kcalloc(g, n, size, ip);
80#else
81 return kcalloc(n, size, GFP_KERNEL);
82#endif
83}
84
85static inline void *__nvgpu_vmalloc(struct gk20a *g, unsigned long size,
86 unsigned long ip)
87{
88#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
89 return __nvgpu_track_vmalloc(g, size, ip);
90#else
91 return vmalloc(size);
92#endif
93}
94
95static inline void *__nvgpu_vzalloc(struct gk20a *g, unsigned long size,
96 unsigned long ip)
97{
98#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
99 return __nvgpu_track_vzalloc(g, size, ip);
100#else
101 return vzalloc(size);
102#endif
103}
104
105static inline void __nvgpu_kfree(struct gk20a *g, void *addr)
106{
107#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
108 __nvgpu_track_kfree(g, addr);
109#else
110 kfree(addr);
111#endif
112}
113
114static inline void __nvgpu_vfree(struct gk20a *g, void *addr)
115{
116#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
117 __nvgpu_track_vfree(g, addr);
118#else
119 vfree(addr);
120#endif
121}
122
123#endif
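
As a rough illustration of the layering in this header, this is approximately what a single nvgpu_kmalloc() call resolves to in each configuration, and how _THIS_IP_ feeds call-site attribution (expansion written out by hand; the example_ function is hypothetical):

#include <linux/kernel.h>

/*
 * nvgpu_kmalloc(g, 64)
 *   => __nvgpu_kmalloc(g, 64, _THIS_IP_)
 *
 * CONFIG_NVGPU_TRACK_MEM_USAGE=y:
 *   => __nvgpu_track_kmalloc(g, 64, ip)   (kmalloc() plus rb-tree bookkeeping)
 *
 * CONFIG_NVGPU_TRACK_MEM_USAGE=n:
 *   => kmalloc(64, GFP_KERNEL)            (zero-overhead pass-through)
 */
static inline void example_show_call_site(void)
{
	/*
	 * _THIS_IP_ evaluates to the address of this code location; the
	 * tracker saves it so leaks can be printed with %pF/%pS.
	 */
	unsigned long ip = _THIS_IP_;

	pr_info("called from %pS\n", (void *)ip);
}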
diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c
index 3677b02d..39559dac 100644
--- a/drivers/gpu/nvgpu/pci.c
+++ b/drivers/gpu/nvgpu/pci.c
@@ -19,6 +19,7 @@
19#include <linux/pm_runtime.h> 19#include <linux/pm_runtime.h>
20 20
21#include <nvgpu/nvgpu_common.h> 21#include <nvgpu/nvgpu_common.h>
22#include <nvgpu/kmem.h>
22 23
23#include "gk20a/gk20a.h" 24#include "gk20a/gk20a.h"
24#include "gk20a/platform_gk20a.h" 25#include "gk20a/platform_gk20a.h"
@@ -358,6 +359,8 @@ static int nvgpu_pci_probe(struct pci_dev *pdev,
358 platform->g = g; 359 platform->g = g;
359 g->dev = &pdev->dev; 360 g->dev = &pdev->dev;
360 361
362 nvgpu_kmem_init(g);
363
361 err = pci_enable_device(pdev); 364 err = pci_enable_device(pdev);
362 if (err) 365 if (err)
363 return err; 366 return err;
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index d8e0dfa1..37b4633b 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -19,6 +19,8 @@
19#include <linux/pm_runtime.h> 19#include <linux/pm_runtime.h>
20#include <linux/pm_qos.h> 20#include <linux/pm_qos.h>
21 21
22#include <nvgpu/kmem.h>
23
22#include "vgpu/vgpu.h" 24#include "vgpu/vgpu.h"
23#include "vgpu/fecs_trace_vgpu.h" 25#include "vgpu/fecs_trace_vgpu.h"
24#include "gk20a/debug_gk20a.h" 26#include "gk20a/debug_gk20a.h"
@@ -562,6 +564,8 @@ int vgpu_probe(struct platform_device *pdev)
562 platform->vgpu_priv = priv; 564 platform->vgpu_priv = priv;
563 gk20a->dev = dev; 565 gk20a->dev = dev;
564 566
567 nvgpu_kmem_init(gk20a);
568
565 err = gk20a_user_init(dev, INTERFACE_NAME, &nvgpu_class); 569 err = gk20a_user_init(dev, INTERFACE_NAME, &nvgpu_class);
566 if (err) 570 if (err)
567 return err; 571 return err;
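
Finally, a hedged sketch of the init/fini lifecycle that the probe hooks above establish; the example_ functions and the chosen fini flags are illustrative, not mandated by the patch:

#include <nvgpu/kmem.h>

/* Probe: set up tracking before any nvgpu_k*alloc() calls are made. */
static int example_probe(struct gk20a *g)
{
	return nvgpu_kmem_init(g);
}

/* Remove: warn about leaks and dump the outstanding allocations. */
static void example_remove(struct gk20a *g)
{
	nvgpu_kmem_fini(g, NVGPU_KMEM_FINI_WARN |
			   NVGPU_KMEM_FINI_DUMP_ALLOCS);
}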