From 4d2d890c01b94d10ad55643a4c2c159a98419efe Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Wed, 25 Oct 2017 09:56:09 -0700 Subject: gpu: nvgpu: Move ctxsw_trace_gk20a.c to common/linux Migrate ctxsw_trace_gk20a.c to common/linux/ctxsw_trace.c. This has been done because the ctxsw tracing code is currently too tightly tied to the Linux OS due to usage of a couple system calls: - poll() - mmap() And general Linux driver framework code. As a result pulling the logic out of the FECS tracing code is simply too large a scope for the time being. Instead the code was just copied as much as possible. The HAL ops for the FECS code were hidden behind the FECS tracing config so that the vm_area_struct is not used when QNX does not define said config. All other non-HAL functions called by the FECS ctxsw tracing code have now also been hidden by this config. This is not pretty but for the time being it seems like the way to go. JIRA NVGPU-287 Change-Id: Ib880ab237f4abd330dc66998692c86c4507149c2 Signed-off-by: Alex Waterman Reviewed-on: https://git-master.nvidia.com/r/1586547 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/Makefile | 4 +- drivers/gpu/nvgpu/common/linux/ctxsw_trace.c | 727 +++++++++++++++++++++++++++ drivers/gpu/nvgpu/common/linux/module.c | 2 + drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 2 + drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c | 727 --------------------------- drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h | 18 +- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 27 +- drivers/gpu/nvgpu/gk20a/gk20a.c | 2 + drivers/gpu/nvgpu/gk20a/gk20a.h | 14 +- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 5 +- drivers/gpu/nvgpu/gp106/hal_gp106.c | 2 + drivers/gpu/nvgpu/gp10b/hal_gp10b.c | 2 + 12 files changed, 784 insertions(+), 748 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/linux/ctxsw_trace.c delete mode 100644 drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/Makefile 
b/drivers/gpu/nvgpu/Makefile index af7a8af5..f1a6f267 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -105,7 +105,6 @@ nvgpu-y := \ gk20a/fb_gk20a.o \ gk20a/hal.o \ gk20a/tsg_gk20a.o \ - gk20a/ctxsw_trace_gk20a.o \ gk20a/fecs_trace_gk20a.o \ gk20a/mc_gk20a.o \ gk20a/sim_gk20a.o \ @@ -152,6 +151,9 @@ nvgpu-$(CONFIG_DEBUG_FS) += \ common/linux/debug_kmem.o endif +nvgpu-$(CONFIG_GK20A_CTXSW_TRACE) += \ + common/linux/ctxsw_trace.o + nvgpu-$(CONFIG_TEGRA_GK20A) += common/linux/platform_gk20a_tegra.o nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o nvgpu-$(CONFIG_GK20A_PCI) += common/linux/pci.o diff --git a/drivers/gpu/nvgpu/common/linux/ctxsw_trace.c b/drivers/gpu/nvgpu/common/linux/ctxsw_trace.c new file mode 100644 index 00000000..81a54b7e --- /dev/null +++ b/drivers/gpu/nvgpu/common/linux/ctxsw_trace.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gk20a/gk20a.h" +#include "gk20a/gr_gk20a.h" +#include "gk20a/ctxsw_trace_gk20a.h" +#include "gk20a/platform_gk20a.h" + +#include +#include +#include +#include + +#include "os_linux.h" + +#include +#include + +#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE (128*PAGE_SIZE) + +/* Userland-facing FIFO (one global + eventually one per VM) */ +struct gk20a_ctxsw_dev { + struct gk20a *g; + + struct nvgpu_ctxsw_ring_header *hdr; + struct nvgpu_ctxsw_trace_entry *ents; + struct nvgpu_ctxsw_trace_filter filter; + bool write_enabled; + struct nvgpu_cond readout_wq; + size_t size; + u32 num_ents; + + nvgpu_atomic_t vma_ref; + + struct nvgpu_mutex write_lock; +}; + + +struct gk20a_ctxsw_trace { + struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS]; +}; + +static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr) +{ + return (hdr->write_idx == hdr->read_idx); +} + +static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr) +{ + return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx; +} + +static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr) +{ + return (hdr->write_idx - hdr->read_idx) % hdr->num_ents; +} + +ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size, + loff_t *off) +{ + struct gk20a_ctxsw_dev *dev = filp->private_data; + struct nvgpu_ctxsw_ring_header *hdr = dev->hdr; + struct nvgpu_ctxsw_trace_entry __user *entry = + (struct nvgpu_ctxsw_trace_entry *) buf; + size_t copied = 0; + int err; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, + "filp=%p buf=%p size=%zu", filp, buf, size); + + nvgpu_mutex_acquire(&dev->write_lock); + while (ring_is_empty(hdr)) { + 
nvgpu_mutex_release(&dev->write_lock); + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + err = NVGPU_COND_WAIT_INTERRUPTIBLE(&dev->readout_wq, + !ring_is_empty(hdr), 0); + if (err) + return err; + nvgpu_mutex_acquire(&dev->write_lock); + } + + while (size >= sizeof(struct nvgpu_ctxsw_trace_entry)) { + if (ring_is_empty(hdr)) + break; + + if (copy_to_user(entry, &dev->ents[hdr->read_idx], + sizeof(*entry))) { + nvgpu_mutex_release(&dev->write_lock); + return -EFAULT; + } + + hdr->read_idx++; + if (hdr->read_idx >= hdr->num_ents) + hdr->read_idx = 0; + + entry++; + copied += sizeof(*entry); + size -= sizeof(*entry); + } + + gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied, + hdr->read_idx); + + *off = hdr->read_idx; + nvgpu_mutex_release(&dev->write_lock); + + return copied; +} + +static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev) +{ + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled"); + nvgpu_mutex_acquire(&dev->write_lock); + dev->write_enabled = true; + nvgpu_mutex_release(&dev->write_lock); + dev->g->ops.fecs_trace.enable(dev->g); + return 0; +} + +static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev) +{ + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled"); + dev->g->ops.fecs_trace.disable(dev->g); + nvgpu_mutex_acquire(&dev->write_lock); + dev->write_enabled = false; + nvgpu_mutex_release(&dev->write_lock); + return 0; +} + +static int gk20a_ctxsw_dev_alloc_buffer(struct gk20a_ctxsw_dev *dev, + size_t size) +{ + struct gk20a *g = dev->g; + void *buf; + int err; + + if ((dev->write_enabled) || (nvgpu_atomic_read(&dev->vma_ref))) + return -EBUSY; + + err = g->ops.fecs_trace.alloc_user_buffer(g, &buf, &size); + if (err) + return err; + + + dev->hdr = buf; + dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1); + dev->size = size; + dev->num_ents = dev->hdr->num_ents; + + gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d", + dev->size, dev->hdr, dev->ents, dev->hdr->num_ents); + 
return 0; +} + +int gk20a_ctxsw_dev_ring_alloc(struct gk20a *g, + void **buf, size_t *size) +{ + struct nvgpu_ctxsw_ring_header *hdr; + + *size = roundup(*size, PAGE_SIZE); + hdr = vmalloc_user(*size); + if (!hdr) + return -ENOMEM; + + hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC; + hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION; + hdr->num_ents = (*size - sizeof(struct nvgpu_ctxsw_ring_header)) + / sizeof(struct nvgpu_ctxsw_trace_entry); + hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry); + hdr->drop_count = 0; + hdr->read_idx = 0; + hdr->write_idx = 0; + hdr->write_seqno = 0; + + *buf = hdr; + return 0; +} + +int gk20a_ctxsw_dev_ring_free(struct gk20a *g) +{ + struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[0]; + + nvgpu_vfree(g, dev->hdr); + return 0; +} + +static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev, + struct nvgpu_ctxsw_ring_setup_args *args) +{ + size_t size = args->size; + int ret; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size); + + if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE) + return -EINVAL; + + nvgpu_mutex_acquire(&dev->write_lock); + ret = gk20a_ctxsw_dev_alloc_buffer(dev, size); + nvgpu_mutex_release(&dev->write_lock); + + return ret; +} + +static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev, + struct nvgpu_ctxsw_trace_filter_args *args) +{ + struct gk20a *g = dev->g; + + nvgpu_mutex_acquire(&dev->write_lock); + dev->filter = args->filter; + nvgpu_mutex_release(&dev->write_lock); + + if (g->ops.fecs_trace.set_filter) + g->ops.fecs_trace.set_filter(g, &dev->filter); + return 0; +} + +static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev, + struct nvgpu_ctxsw_trace_filter_args *args) +{ + nvgpu_mutex_acquire(&dev->write_lock); + args->filter = dev->filter; + nvgpu_mutex_release(&dev->write_lock); + + return 0; +} + +static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev) +{ + struct gk20a *g = dev->g; + int err; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, ""); + 
+ err = gk20a_busy(g); + if (err) + return err; + + if (g->ops.fecs_trace.flush) + err = g->ops.fecs_trace.flush(g); + + if (likely(!err)) + err = g->ops.fecs_trace.poll(g); + + gk20a_idle(g); + return err; +} + +int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp) +{ + struct nvgpu_os_linux *l; + struct gk20a *g; + struct gk20a_ctxsw_trace *trace; + struct gk20a_ctxsw_dev *dev; + int err; + size_t size; + u32 n; + + /* only one VM for now */ + const int vmid = 0; + + l = container_of(inode->i_cdev, struct nvgpu_os_linux, ctxsw.cdev); + g = gk20a_get(&l->g); + if (!g) + return -ENODEV; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g); + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto free_ref; + } + + err = gk20a_busy(g); + if (err) + goto free_ref; + + trace = g->ctxsw_trace; + if (!trace) { + err = -ENODEV; + goto idle; + } + + /* Allow only one user for this device */ + dev = &trace->devs[vmid]; + nvgpu_mutex_acquire(&dev->write_lock); + if (dev->hdr) { + err = -EBUSY; + goto done; + } + + /* By default, allocate ring buffer big enough to accommodate + * FECS records with default event filter */ + + /* enable all traces by default */ + NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter); + + /* compute max number of entries generated with this filter */ + n = g->ops.fecs_trace.max_entries(g, &dev->filter); + + size = sizeof(struct nvgpu_ctxsw_ring_header) + + n * sizeof(struct nvgpu_ctxsw_trace_entry); + gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu", + size, n, sizeof(struct nvgpu_ctxsw_trace_entry)); + + err = gk20a_ctxsw_dev_alloc_buffer(dev, size); + if (!err) { + filp->private_data = dev; + gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu", + filp, dev, size); + } + +done: + nvgpu_mutex_release(&dev->write_lock); + +idle: + gk20a_idle(g); +free_ref: + if (err) + gk20a_put(g); + return err; +} + +int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp) +{ + struct gk20a_ctxsw_dev *dev = filp->private_data; + struct gk20a 
*g = dev->g; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev); + + g->ops.fecs_trace.disable(g); + + nvgpu_mutex_acquire(&dev->write_lock); + dev->write_enabled = false; + nvgpu_mutex_release(&dev->write_lock); + + if (dev->hdr) { + dev->g->ops.fecs_trace.free_user_buffer(dev->g); + dev->hdr = NULL; + } + gk20a_put(g); + return 0; +} + +long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct gk20a_ctxsw_dev *dev = filp->private_data; + struct gk20a *g = dev->g; + u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE]; + int err = 0; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd)); + + if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || + (_IOC_NR(cmd) == 0) || + (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST) || + (_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE)) + return -EINVAL; + + memset(buf, 0, sizeof(buf)); + if (_IOC_DIR(cmd) & _IOC_WRITE) { + if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd))) + return -EFAULT; + } + + switch (cmd) { + case NVGPU_CTXSW_IOCTL_TRACE_ENABLE: + err = gk20a_ctxsw_dev_ioctl_trace_enable(dev); + break; + case NVGPU_CTXSW_IOCTL_TRACE_DISABLE: + err = gk20a_ctxsw_dev_ioctl_trace_disable(dev); + break; + case NVGPU_CTXSW_IOCTL_RING_SETUP: + err = gk20a_ctxsw_dev_ioctl_ring_setup(dev, + (struct nvgpu_ctxsw_ring_setup_args *) buf); + break; + case NVGPU_CTXSW_IOCTL_SET_FILTER: + err = gk20a_ctxsw_dev_ioctl_set_filter(dev, + (struct nvgpu_ctxsw_trace_filter_args *) buf); + break; + case NVGPU_CTXSW_IOCTL_GET_FILTER: + err = gk20a_ctxsw_dev_ioctl_get_filter(dev, + (struct nvgpu_ctxsw_trace_filter_args *) buf); + break; + case NVGPU_CTXSW_IOCTL_POLL: + err = gk20a_ctxsw_dev_ioctl_poll(dev); + break; + default: + dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", + cmd); + err = -ENOTTY; + } + + if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) + err = copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd)); + + return err; +} + +unsigned int gk20a_ctxsw_dev_poll(struct file *filp, 
poll_table *wait) +{ + struct gk20a_ctxsw_dev *dev = filp->private_data; + struct nvgpu_ctxsw_ring_header *hdr = dev->hdr; + unsigned int mask = 0; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, ""); + + nvgpu_mutex_acquire(&dev->write_lock); + poll_wait(filp, &dev->readout_wq.wq, wait); + if (!ring_is_empty(hdr)) + mask |= POLLIN | POLLRDNORM; + nvgpu_mutex_release(&dev->write_lock); + + return mask; +} + +static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma) +{ + struct gk20a_ctxsw_dev *dev = vma->vm_private_data; + + nvgpu_atomic_inc(&dev->vma_ref); + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d", + nvgpu_atomic_read(&dev->vma_ref)); +} + +static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma) +{ + struct gk20a_ctxsw_dev *dev = vma->vm_private_data; + + nvgpu_atomic_dec(&dev->vma_ref); + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d", + nvgpu_atomic_read(&dev->vma_ref)); +} + +static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = { + .open = gk20a_ctxsw_dev_vma_open, + .close = gk20a_ctxsw_dev_vma_close, +}; + +int gk20a_ctxsw_dev_mmap_buffer(struct gk20a *g, + struct vm_area_struct *vma) +{ + return remap_vmalloc_range(vma, g->ctxsw_trace->devs[0].hdr, 0); +} + +int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct gk20a_ctxsw_dev *dev = filp->private_data; + int ret; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + + ret = dev->g->ops.fecs_trace.mmap_user_buffer(dev->g, vma); + if (likely(!ret)) { + vma->vm_private_data = dev; + vma->vm_ops = &gk20a_ctxsw_dev_vma_ops; + vma->vm_ops->open(vma); + } + + return ret; +} + +#ifdef CONFIG_GK20A_CTXSW_TRACE +static int gk20a_ctxsw_init_devs(struct gk20a *g) +{ + struct gk20a_ctxsw_trace *trace = g->ctxsw_trace; + struct gk20a_ctxsw_dev *dev = trace->devs; + int err; + int i; + + for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) { + dev->g = g; + dev->hdr = NULL; + dev->write_enabled = false; + 
nvgpu_cond_init(&dev->readout_wq); + err = nvgpu_mutex_init(&dev->write_lock); + if (err) + return err; + nvgpu_atomic_set(&dev->vma_ref, 0); + dev++; + } + return 0; +} +#endif + +int gk20a_ctxsw_trace_init(struct gk20a *g) +{ +#ifdef CONFIG_GK20A_CTXSW_TRACE + struct gk20a_ctxsw_trace *trace = g->ctxsw_trace; + int err; + + gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace); + + /* if tracing is not supported, skip this */ + if (!g->ops.fecs_trace.init) + return 0; + + if (likely(trace)) + return 0; + + trace = nvgpu_kzalloc(g, sizeof(*trace)); + if (unlikely(!trace)) + return -ENOMEM; + g->ctxsw_trace = trace; + + err = gk20a_ctxsw_init_devs(g); + if (err) + goto fail; + + err = g->ops.fecs_trace.init(g); + if (unlikely(err)) + goto fail; + + return 0; + +fail: + memset(&g->ops.fecs_trace, 0, sizeof(g->ops.fecs_trace)); + nvgpu_kfree(g, trace); + g->ctxsw_trace = NULL; + return err; +#else + return 0; +#endif +} + +void gk20a_ctxsw_trace_cleanup(struct gk20a *g) +{ +#ifdef CONFIG_GK20A_CTXSW_TRACE + struct gk20a_ctxsw_trace *trace; + struct gk20a_ctxsw_dev *dev; + int i; + + if (!g->ctxsw_trace) + return; + + trace = g->ctxsw_trace; + dev = trace->devs; + + for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) { + nvgpu_mutex_destroy(&dev->write_lock); + dev++; + } + + nvgpu_kfree(g, g->ctxsw_trace); + g->ctxsw_trace = NULL; + + g->ops.fecs_trace.deinit(g); +#endif +} + +int gk20a_ctxsw_trace_write(struct gk20a *g, + struct nvgpu_ctxsw_trace_entry *entry) +{ + struct nvgpu_ctxsw_ring_header *hdr; + struct gk20a_ctxsw_dev *dev; + int ret = 0; + const char *reason; + u32 write_idx; + + if (!g->ctxsw_trace) + return 0; + + if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS)) + return -ENODEV; + + dev = &g->ctxsw_trace->devs[entry->vmid]; + hdr = dev->hdr; + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, + "dev=%p hdr=%p", dev, hdr); + + nvgpu_mutex_acquire(&dev->write_lock); + + if (unlikely(!hdr)) { + /* device has been released */ + ret = -ENODEV; + goto 
done; + } + + write_idx = hdr->write_idx; + if (write_idx >= dev->num_ents) { + nvgpu_err(dev->g, + "write_idx=%u out of range [0..%u]", + write_idx, dev->num_ents); + ret = -ENOSPC; + reason = "write_idx out of range"; + goto disable; + } + + entry->seqno = hdr->write_seqno++; + + if (!dev->write_enabled) { + ret = -EBUSY; + reason = "write disabled"; + goto drop; + } + + if (unlikely(ring_is_full(hdr))) { + ret = -ENOSPC; + reason = "user fifo full"; + goto drop; + } + + if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) { + reason = "filtered out"; + goto filter; + } + + gk20a_dbg(gpu_dbg_ctxsw, + "seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx", + entry->seqno, entry->context_id, entry->pid, + entry->tag, entry->timestamp); + + dev->ents[write_idx] = *entry; + + /* ensure record is written before updating write index */ + nvgpu_smp_wmb(); + + write_idx++; + if (unlikely(write_idx >= hdr->num_ents)) + write_idx = 0; + hdr->write_idx = write_idx; + gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d", + hdr->read_idx, hdr->write_idx, ring_len(hdr)); + + nvgpu_mutex_release(&dev->write_lock); + return ret; + +disable: + g->ops.fecs_trace.disable(g); + +drop: + hdr->drop_count++; + +filter: + gk20a_dbg(gpu_dbg_ctxsw, + "dropping seqno=%d context_id=%08x pid=%lld " + "tag=%x time=%llx (%s)", + entry->seqno, entry->context_id, entry->pid, + entry->tag, entry->timestamp, reason); + +done: + nvgpu_mutex_release(&dev->write_lock); + return ret; +} + +void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid) +{ + struct gk20a_ctxsw_dev *dev; + + if (!g->ctxsw_trace) + return; + + dev = &g->ctxsw_trace->devs[vmid]; + nvgpu_cond_signal_interruptible(&dev->readout_wq); +} + +void gk20a_ctxsw_trace_channel_reset(struct gk20a *g, struct channel_gk20a *ch) +{ +#ifdef CONFIG_GK20A_CTXSW_TRACE + struct nvgpu_ctxsw_trace_entry entry = { + .vmid = 0, + .tag = NVGPU_CTXSW_TAG_ENGINE_RESET, + .context_id = 0, + .pid = ch->tgid, + }; + + if (!g->ctxsw_trace) + 
return; + + g->ops.bus.read_ptimer(g, &entry.timestamp); + gk20a_ctxsw_trace_write(g, &entry); + gk20a_ctxsw_trace_wake_up(g, 0); +#endif + trace_gk20a_channel_reset(ch->chid, ch->tsgid); +} + +void gk20a_ctxsw_trace_tsg_reset(struct gk20a *g, struct tsg_gk20a *tsg) +{ +#ifdef CONFIG_GK20A_CTXSW_TRACE + struct nvgpu_ctxsw_trace_entry entry = { + .vmid = 0, + .tag = NVGPU_CTXSW_TAG_ENGINE_RESET, + .context_id = 0, + .pid = tsg->tgid, + }; + + if (!g->ctxsw_trace) + return; + + g->ops.bus.read_ptimer(g, &entry.timestamp); + gk20a_ctxsw_trace_write(g, &entry); + gk20a_ctxsw_trace_wake_up(g, 0); +#endif + trace_gk20a_channel_reset(~0, tsg->tsgid); +} diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c index 68ae1a66..fb5d3614 100644 --- a/drivers/gpu/nvgpu/common/linux/module.c +++ b/drivers/gpu/nvgpu/common/linux/module.c @@ -1104,7 +1104,9 @@ int nvgpu_remove(struct device *dev, struct class *class) if (platform->has_cde) gk20a_cde_destroy(l); +#ifdef CONFIG_GK20A_CTXSW_TRACE gk20a_ctxsw_trace_cleanup(g); +#endif gk20a_sched_ctrl_cleanup(g); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 0d011b06..546f4164 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -553,8 +553,10 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force) gk20a_dbg_info("freeing bound channel context, timeout=%ld", timeout); +#ifdef CONFIG_GK20A_CTXSW_TRACE if (g->ops.fecs_trace.unbind_channel && !ch->vpr) g->ops.fecs_trace.unbind_channel(g, ch); +#endif /* release channel ctx */ g->ops.gr.free_channel_ctx(ch, was_tsg); diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c deleted file mode 100644 index fb33de23..00000000 --- a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c +++ /dev/null @@ -1,727 +0,0 @@ -/* - * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ctxsw_trace_gk20a.h" -#include "gk20a.h" -#include "platform_gk20a.h" -#include "gr_gk20a.h" -#include "common/linux/os_linux.h" - -#include -#include -#include - -#include -#include - -#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE (128*PAGE_SIZE) - -/* Userland-facing FIFO (one global + eventually one per VM) */ -struct gk20a_ctxsw_dev { - struct gk20a *g; - - struct nvgpu_ctxsw_ring_header *hdr; - struct nvgpu_ctxsw_trace_entry *ents; - struct nvgpu_ctxsw_trace_filter filter; - bool write_enabled; - struct nvgpu_cond readout_wq; - size_t size; - u32 num_ents; - - nvgpu_atomic_t vma_ref; - - struct nvgpu_mutex write_lock; -}; - - -struct gk20a_ctxsw_trace { - struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS]; -}; - -static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr) -{ - return (hdr->write_idx == hdr->read_idx); -} - -static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr) -{ - return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx; -} - -static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr) -{ - return (hdr->write_idx - hdr->read_idx) % hdr->num_ents; -} - -ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size, - loff_t *off) -{ - struct gk20a_ctxsw_dev *dev = filp->private_data; - struct nvgpu_ctxsw_ring_header *hdr = dev->hdr; - struct nvgpu_ctxsw_trace_entry __user *entry = - (struct nvgpu_ctxsw_trace_entry *) buf; - size_t copied = 0; - int err; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, - "filp=%p buf=%p size=%zu", filp, buf, size); - - nvgpu_mutex_acquire(&dev->write_lock); - while (ring_is_empty(hdr)) { - nvgpu_mutex_release(&dev->write_lock); - if (filp->f_flags & O_NONBLOCK) - return -EAGAIN; - err = NVGPU_COND_WAIT_INTERRUPTIBLE(&dev->readout_wq, - !ring_is_empty(hdr), 0); - if (err) - return err; - nvgpu_mutex_acquire(&dev->write_lock); - } - - while (size >= sizeof(struct 
nvgpu_ctxsw_trace_entry)) { - if (ring_is_empty(hdr)) - break; - - if (copy_to_user(entry, &dev->ents[hdr->read_idx], - sizeof(*entry))) { - nvgpu_mutex_release(&dev->write_lock); - return -EFAULT; - } - - hdr->read_idx++; - if (hdr->read_idx >= hdr->num_ents) - hdr->read_idx = 0; - - entry++; - copied += sizeof(*entry); - size -= sizeof(*entry); - } - - gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied, - hdr->read_idx); - - *off = hdr->read_idx; - nvgpu_mutex_release(&dev->write_lock); - - return copied; -} - -static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev) -{ - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled"); - nvgpu_mutex_acquire(&dev->write_lock); - dev->write_enabled = true; - nvgpu_mutex_release(&dev->write_lock); - dev->g->ops.fecs_trace.enable(dev->g); - return 0; -} - -static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev) -{ - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled"); - dev->g->ops.fecs_trace.disable(dev->g); - nvgpu_mutex_acquire(&dev->write_lock); - dev->write_enabled = false; - nvgpu_mutex_release(&dev->write_lock); - return 0; -} - -static int gk20a_ctxsw_dev_alloc_buffer(struct gk20a_ctxsw_dev *dev, - size_t size) -{ - struct gk20a *g = dev->g; - void *buf; - int err; - - if ((dev->write_enabled) || (nvgpu_atomic_read(&dev->vma_ref))) - return -EBUSY; - - err = g->ops.fecs_trace.alloc_user_buffer(g, &buf, &size); - if (err) - return err; - - - dev->hdr = buf; - dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1); - dev->size = size; - dev->num_ents = dev->hdr->num_ents; - - gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d", - dev->size, dev->hdr, dev->ents, dev->hdr->num_ents); - return 0; -} - -int gk20a_ctxsw_dev_ring_alloc(struct gk20a *g, - void **buf, size_t *size) -{ - struct nvgpu_ctxsw_ring_header *hdr; - - *size = roundup(*size, PAGE_SIZE); - hdr = vmalloc_user(*size); - if (!hdr) - return -ENOMEM; - - hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC; 
- hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION; - hdr->num_ents = (*size - sizeof(struct nvgpu_ctxsw_ring_header)) - / sizeof(struct nvgpu_ctxsw_trace_entry); - hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry); - hdr->drop_count = 0; - hdr->read_idx = 0; - hdr->write_idx = 0; - hdr->write_seqno = 0; - - *buf = hdr; - return 0; -} - -int gk20a_ctxsw_dev_ring_free(struct gk20a *g) -{ - struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[0]; - - nvgpu_vfree(g, dev->hdr); - return 0; -} - -static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev, - struct nvgpu_ctxsw_ring_setup_args *args) -{ - size_t size = args->size; - int ret; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size); - - if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE) - return -EINVAL; - - nvgpu_mutex_acquire(&dev->write_lock); - ret = gk20a_ctxsw_dev_alloc_buffer(dev, size); - nvgpu_mutex_release(&dev->write_lock); - - return ret; -} - -static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev, - struct nvgpu_ctxsw_trace_filter_args *args) -{ - struct gk20a *g = dev->g; - - nvgpu_mutex_acquire(&dev->write_lock); - dev->filter = args->filter; - nvgpu_mutex_release(&dev->write_lock); - - if (g->ops.fecs_trace.set_filter) - g->ops.fecs_trace.set_filter(g, &dev->filter); - return 0; -} - -static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev, - struct nvgpu_ctxsw_trace_filter_args *args) -{ - nvgpu_mutex_acquire(&dev->write_lock); - args->filter = dev->filter; - nvgpu_mutex_release(&dev->write_lock); - - return 0; -} - -static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev) -{ - struct gk20a *g = dev->g; - int err; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, ""); - - err = gk20a_busy(g); - if (err) - return err; - - if (g->ops.fecs_trace.flush) - err = g->ops.fecs_trace.flush(g); - - if (likely(!err)) - err = g->ops.fecs_trace.poll(g); - - gk20a_idle(g); - return err; -} - -int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp) 
-{ - struct nvgpu_os_linux *l; - struct gk20a *g; - struct gk20a_ctxsw_trace *trace; - struct gk20a_ctxsw_dev *dev; - int err; - size_t size; - u32 n; - - /* only one VM for now */ - const int vmid = 0; - - l = container_of(inode->i_cdev, struct nvgpu_os_linux, ctxsw.cdev); - g = gk20a_get(&l->g); - if (!g) - return -ENODEV; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g); - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto free_ref; - } - - err = gk20a_busy(g); - if (err) - goto free_ref; - - trace = g->ctxsw_trace; - if (!trace) { - err = -ENODEV; - goto idle; - } - - /* Allow only one user for this device */ - dev = &trace->devs[vmid]; - nvgpu_mutex_acquire(&dev->write_lock); - if (dev->hdr) { - err = -EBUSY; - goto done; - } - - /* By default, allocate ring buffer big enough to accommodate - * FECS records with default event filter */ - - /* enable all traces by default */ - NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter); - - /* compute max number of entries generated with this filter */ - n = g->ops.fecs_trace.max_entries(g, &dev->filter); - - size = sizeof(struct nvgpu_ctxsw_ring_header) + - n * sizeof(struct nvgpu_ctxsw_trace_entry); - gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu", - size, n, sizeof(struct nvgpu_ctxsw_trace_entry)); - - err = gk20a_ctxsw_dev_alloc_buffer(dev, size); - if (!err) { - filp->private_data = dev; - gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu", - filp, dev, size); - } - -done: - nvgpu_mutex_release(&dev->write_lock); - -idle: - gk20a_idle(g); -free_ref: - if (err) - gk20a_put(g); - return err; -} - -int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp) -{ - struct gk20a_ctxsw_dev *dev = filp->private_data; - struct gk20a *g = dev->g; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev); - - g->ops.fecs_trace.disable(g); - - nvgpu_mutex_acquire(&dev->write_lock); - dev->write_enabled = false; - nvgpu_mutex_release(&dev->write_lock); - - if (dev->hdr) { - 
dev->g->ops.fecs_trace.free_user_buffer(dev->g); - dev->hdr = NULL; - } - gk20a_put(g); - return 0; -} - -long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - struct gk20a_ctxsw_dev *dev = filp->private_data; - struct gk20a *g = dev->g; - u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE]; - int err = 0; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd)); - - if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || - (_IOC_NR(cmd) == 0) || - (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST) || - (_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE)) - return -EINVAL; - - memset(buf, 0, sizeof(buf)); - if (_IOC_DIR(cmd) & _IOC_WRITE) { - if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd))) - return -EFAULT; - } - - switch (cmd) { - case NVGPU_CTXSW_IOCTL_TRACE_ENABLE: - err = gk20a_ctxsw_dev_ioctl_trace_enable(dev); - break; - case NVGPU_CTXSW_IOCTL_TRACE_DISABLE: - err = gk20a_ctxsw_dev_ioctl_trace_disable(dev); - break; - case NVGPU_CTXSW_IOCTL_RING_SETUP: - err = gk20a_ctxsw_dev_ioctl_ring_setup(dev, - (struct nvgpu_ctxsw_ring_setup_args *) buf); - break; - case NVGPU_CTXSW_IOCTL_SET_FILTER: - err = gk20a_ctxsw_dev_ioctl_set_filter(dev, - (struct nvgpu_ctxsw_trace_filter_args *) buf); - break; - case NVGPU_CTXSW_IOCTL_GET_FILTER: - err = gk20a_ctxsw_dev_ioctl_get_filter(dev, - (struct nvgpu_ctxsw_trace_filter_args *) buf); - break; - case NVGPU_CTXSW_IOCTL_POLL: - err = gk20a_ctxsw_dev_ioctl_poll(dev); - break; - default: - dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", - cmd); - err = -ENOTTY; - } - - if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) - err = copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd)); - - return err; -} - -unsigned int gk20a_ctxsw_dev_poll(struct file *filp, poll_table *wait) -{ - struct gk20a_ctxsw_dev *dev = filp->private_data; - struct nvgpu_ctxsw_ring_header *hdr = dev->hdr; - unsigned int mask = 0; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, ""); - - nvgpu_mutex_acquire(&dev->write_lock); - 
poll_wait(filp, &dev->readout_wq.wq, wait); - if (!ring_is_empty(hdr)) - mask |= POLLIN | POLLRDNORM; - nvgpu_mutex_release(&dev->write_lock); - - return mask; -} - -static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma) -{ - struct gk20a_ctxsw_dev *dev = vma->vm_private_data; - - nvgpu_atomic_inc(&dev->vma_ref); - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d", - nvgpu_atomic_read(&dev->vma_ref)); -} - -static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma) -{ - struct gk20a_ctxsw_dev *dev = vma->vm_private_data; - - nvgpu_atomic_dec(&dev->vma_ref); - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d", - nvgpu_atomic_read(&dev->vma_ref)); -} - -static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = { - .open = gk20a_ctxsw_dev_vma_open, - .close = gk20a_ctxsw_dev_vma_close, -}; - -int gk20a_ctxsw_dev_mmap_buffer(struct gk20a *g, - struct vm_area_struct *vma) -{ - return remap_vmalloc_range(vma, g->ctxsw_trace->devs[0].hdr, 0); -} - -int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct gk20a_ctxsw_dev *dev = filp->private_data; - int ret; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx", - vma->vm_start, vma->vm_end); - - ret = dev->g->ops.fecs_trace.mmap_user_buffer(dev->g, vma); - if (likely(!ret)) { - vma->vm_private_data = dev; - vma->vm_ops = &gk20a_ctxsw_dev_vma_ops; - vma->vm_ops->open(vma); - } - - return ret; -} - -#ifdef CONFIG_GK20A_CTXSW_TRACE -static int gk20a_ctxsw_init_devs(struct gk20a *g) -{ - struct gk20a_ctxsw_trace *trace = g->ctxsw_trace; - struct gk20a_ctxsw_dev *dev = trace->devs; - int err; - int i; - - for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) { - dev->g = g; - dev->hdr = NULL; - dev->write_enabled = false; - nvgpu_cond_init(&dev->readout_wq); - err = nvgpu_mutex_init(&dev->write_lock); - if (err) - return err; - nvgpu_atomic_set(&dev->vma_ref, 0); - dev++; - } - return 0; -} -#endif - -int gk20a_ctxsw_trace_init(struct gk20a *g) -{ -#ifdef 
CONFIG_GK20A_CTXSW_TRACE - struct gk20a_ctxsw_trace *trace = g->ctxsw_trace; - int err; - - gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace); - - /* if tracing is not supported, skip this */ - if (!g->ops.fecs_trace.init) - return 0; - - if (likely(trace)) - return 0; - - trace = nvgpu_kzalloc(g, sizeof(*trace)); - if (unlikely(!trace)) - return -ENOMEM; - g->ctxsw_trace = trace; - - err = gk20a_ctxsw_init_devs(g); - if (err) - goto fail; - - err = g->ops.fecs_trace.init(g); - if (unlikely(err)) - goto fail; - - return 0; - -fail: - memset(&g->ops.fecs_trace, 0, sizeof(g->ops.fecs_trace)); - nvgpu_kfree(g, trace); - g->ctxsw_trace = NULL; - return err; -#else - return 0; -#endif -} - -void gk20a_ctxsw_trace_cleanup(struct gk20a *g) -{ -#ifdef CONFIG_GK20A_CTXSW_TRACE - struct gk20a_ctxsw_trace *trace; - struct gk20a_ctxsw_dev *dev; - int i; - - if (!g->ctxsw_trace) - return; - - trace = g->ctxsw_trace; - dev = trace->devs; - - for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) { - nvgpu_mutex_destroy(&dev->write_lock); - dev++; - } - - nvgpu_kfree(g, g->ctxsw_trace); - g->ctxsw_trace = NULL; - - g->ops.fecs_trace.deinit(g); -#endif -} - -int gk20a_ctxsw_trace_write(struct gk20a *g, - struct nvgpu_ctxsw_trace_entry *entry) -{ - struct nvgpu_ctxsw_ring_header *hdr; - struct gk20a_ctxsw_dev *dev; - int ret = 0; - const char *reason; - u32 write_idx; - - if (!g->ctxsw_trace) - return 0; - - if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS)) - return -ENODEV; - - dev = &g->ctxsw_trace->devs[entry->vmid]; - hdr = dev->hdr; - - gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, - "dev=%p hdr=%p", dev, hdr); - - nvgpu_mutex_acquire(&dev->write_lock); - - if (unlikely(!hdr)) { - /* device has been released */ - ret = -ENODEV; - goto done; - } - - write_idx = hdr->write_idx; - if (write_idx >= dev->num_ents) { - nvgpu_err(dev->g, - "write_idx=%u out of range [0..%u]", - write_idx, dev->num_ents); - ret = -ENOSPC; - reason = "write_idx out of range"; - goto disable; - } 
- - entry->seqno = hdr->write_seqno++; - - if (!dev->write_enabled) { - ret = -EBUSY; - reason = "write disabled"; - goto drop; - } - - if (unlikely(ring_is_full(hdr))) { - ret = -ENOSPC; - reason = "user fifo full"; - goto drop; - } - - if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) { - reason = "filtered out"; - goto filter; - } - - gk20a_dbg(gpu_dbg_ctxsw, - "seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx", - entry->seqno, entry->context_id, entry->pid, - entry->tag, entry->timestamp); - - dev->ents[write_idx] = *entry; - - /* ensure record is written before updating write index */ - nvgpu_smp_wmb(); - - write_idx++; - if (unlikely(write_idx >= hdr->num_ents)) - write_idx = 0; - hdr->write_idx = write_idx; - gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d", - hdr->read_idx, hdr->write_idx, ring_len(hdr)); - - nvgpu_mutex_release(&dev->write_lock); - return ret; - -disable: - g->ops.fecs_trace.disable(g); - -drop: - hdr->drop_count++; - -filter: - gk20a_dbg(gpu_dbg_ctxsw, - "dropping seqno=%d context_id=%08x pid=%lld " - "tag=%x time=%llx (%s)", - entry->seqno, entry->context_id, entry->pid, - entry->tag, entry->timestamp, reason); - -done: - nvgpu_mutex_release(&dev->write_lock); - return ret; -} - -void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid) -{ - struct gk20a_ctxsw_dev *dev; - - if (!g->ctxsw_trace) - return; - - dev = &g->ctxsw_trace->devs[vmid]; - nvgpu_cond_signal_interruptible(&dev->readout_wq); -} - -void gk20a_ctxsw_trace_channel_reset(struct gk20a *g, struct channel_gk20a *ch) -{ -#ifdef CONFIG_GK20A_CTXSW_TRACE - struct nvgpu_ctxsw_trace_entry entry = { - .vmid = 0, - .tag = NVGPU_CTXSW_TAG_ENGINE_RESET, - .context_id = 0, - .pid = ch->tgid, - }; - - if (!g->ctxsw_trace) - return; - - g->ops.bus.read_ptimer(g, &entry.timestamp); - gk20a_ctxsw_trace_write(g, &entry); - gk20a_ctxsw_trace_wake_up(g, 0); -#endif - trace_gk20a_channel_reset(ch->chid, ch->tsgid); -} - -void gk20a_ctxsw_trace_tsg_reset(struct gk20a *g, 
struct tsg_gk20a *tsg) -{ -#ifdef CONFIG_GK20A_CTXSW_TRACE - struct nvgpu_ctxsw_trace_entry entry = { - .vmid = 0, - .tag = NVGPU_CTXSW_TAG_ENGINE_RESET, - .context_id = 0, - .pid = tsg->tgid, - }; - - if (!g->ctxsw_trace) - return; - - g->ops.bus.read_ptimer(g, &entry.timestamp); - gk20a_ctxsw_trace_write(g, &entry); - gk20a_ctxsw_trace_wake_up(g, 0); -#endif - trace_gk20a_channel_reset(~0, tsg->tsgid); -} diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h index b270581b..dddb8603 100644 --- a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h @@ -23,6 +23,8 @@ #ifndef __CTXSW_TRACE_GK20A_H #define __CTXSW_TRACE_GK20A_H +#include + #define GK20A_CTXSW_TRACE_NUM_DEVS 1 struct file; @@ -41,20 +43,22 @@ int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp); int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp); long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -ssize_t gk20a_ctxsw_dev_read(struct file *, char __user *, size_t, loff_t *); -unsigned int gk20a_ctxsw_dev_poll(struct file *, struct poll_table_struct *); -int gk20a_ctxsw_dev_mmap(struct file *, struct vm_area_struct *); +ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, + size_t size, loff_t *offs); +unsigned int gk20a_ctxsw_dev_poll(struct file *filp, + struct poll_table_struct *pts); +int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma); int gk20a_ctxsw_dev_ring_alloc(struct gk20a *g, void **buf, size_t *size); int gk20a_ctxsw_dev_ring_free(struct gk20a *g); int gk20a_ctxsw_dev_mmap_buffer(struct gk20a *g, struct vm_area_struct *vma); -int gk20a_ctxsw_trace_init(struct gk20a *); -void gk20a_ctxsw_trace_cleanup(struct gk20a *); -int gk20a_ctxsw_trace_write(struct gk20a *, struct nvgpu_ctxsw_trace_entry *); +int gk20a_ctxsw_trace_init(struct gk20a *g); +void gk20a_ctxsw_trace_cleanup(struct gk20a *g); +int 
gk20a_ctxsw_trace_write(struct gk20a *g, + struct nvgpu_ctxsw_trace_entry *entry); void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid); void gk20a_ctxsw_trace_channel_reset(struct gk20a *g, struct channel_gk20a *ch); void gk20a_ctxsw_trace_tsg_reset(struct gk20a *g, struct tsg_gk20a *tsg); - #endif /* __CTXSW_TRACE_GK20A_H */ diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index d10af9e9..17ae626b 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -1228,16 +1228,24 @@ void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id) if (nvgpu_pmu_disable_elpg(g)) nvgpu_err(g, "failed to set disable elpg"); } - /* resetting engine will alter read/write index. - * need to flush circular buffer before re-enabling FECS. + +#ifdef CONFIG_GK20A_CTXSW_TRACE + /* + * Resetting engine will alter read/write index. Need to flush + * circular buffer before re-enabling FECS. */ if (g->ops.fecs_trace.reset) g->ops.fecs_trace.reset(g); - /*HALT_PIPELINE method, halt GR engine*/ +#endif + + /* HALT_PIPELINE method, halt GR engine. */ if (gr_gk20a_halt_pipe(g)) nvgpu_err(g, "failed to HALT gr pipe"); - /* resetting engine using mc_enable_r() is not - enough, we do full init sequence */ + + /* + * Resetting engine using mc_enable_r() is not enough; we must + * do full init sequence. + */ gk20a_gr_reset(g); if (g->support_pmu && g->can_elpg) nvgpu_pmu_enable_elpg(g); @@ -1618,6 +1626,8 @@ static bool gk20a_fifo_handle_mmu_fault( } } } + +#ifdef CONFIG_GK20A_CTXSW_TRACE /* * For non fake mmu fault, both tsg and ch pointers * could be valid. Check tsg first. @@ -1626,10 +1636,11 @@ static bool gk20a_fifo_handle_mmu_fault( gk20a_ctxsw_trace_tsg_reset(g, tsg); else if (ch) gk20a_ctxsw_trace_channel_reset(g, ch); +#endif - /* disable the channel/TSG from hw and increment - * syncpoints */ - + /* + * Disable the channel/TSG from hw and increment syncpoints. 
+ */ if (tsg) { if (!g->fifo.deferred_reset_pending) { if (!fake_fault) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 47f6c56c..703a7c0c 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -295,9 +295,11 @@ int gk20a_finalize_poweron(struct gk20a *g) goto done; } +#ifdef CONFIG_GK20A_CTXSW_TRACE err = gk20a_ctxsw_trace_init(g); if (err) nvgpu_warn(g, "could not initialize ctxsw tracing"); +#endif err = gk20a_sched_ctrl_init(g); if (err) { diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index d7fdffb0..a34f06b2 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -689,18 +689,25 @@ struct gpu_ops { int (*get_netlist_name)(struct gk20a *g, int index, char *name); bool (*is_fw_defined)(void); } gr_ctx; +#ifdef CONFIG_GK20A_CTXSW_TRACE + /* + * Currently only supported on Linux due to the extremely tight + * integration with Linux device driver structure (in particular + * mmap). 
+ */ struct { int (*init)(struct gk20a *g); int (*max_entries)(struct gk20a *, - struct nvgpu_ctxsw_trace_filter *); + struct nvgpu_ctxsw_trace_filter *filter); int (*flush)(struct gk20a *g); int (*poll)(struct gk20a *g); int (*enable)(struct gk20a *g); int (*disable)(struct gk20a *g); bool (*is_enabled)(struct gk20a *g); int (*reset)(struct gk20a *g); - int (*bind_channel)(struct gk20a *, struct channel_gk20a *); - int (*unbind_channel)(struct gk20a *, struct channel_gk20a *); + int (*bind_channel)(struct gk20a *g, struct channel_gk20a *ch); + int (*unbind_channel)(struct gk20a *g, + struct channel_gk20a *ch); int (*deinit)(struct gk20a *g); int (*alloc_user_buffer)(struct gk20a *g, void **buf, size_t *size); @@ -710,6 +717,7 @@ struct gpu_ops { int (*set_filter)(struct gk20a *g, struct nvgpu_ctxsw_trace_filter *filter); } fecs_trace; +#endif struct { bool (*support_sparse)(struct gk20a *g); u64 (*gmmu_map)(struct vm_gk20a *vm, diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 1ea59a9d..f78d862c 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -3070,13 +3070,14 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, "fail to load golden ctx image"); goto out; } +#ifdef CONFIG_GK20A_CTXSW_TRACE if (g->ops.fecs_trace.bind_channel && !c->vpr) { err = g->ops.fecs_trace.bind_channel(g, c); - if (err) { + if (err) nvgpu_warn(g, "fail to bind channel for ctxsw trace"); - } } +#endif c->first_init = true; } diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c index f576278d..c5b66201 100644 --- a/drivers/gpu/nvgpu/gp106/hal_gp106.c +++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c @@ -714,7 +714,9 @@ int gp106_init_hal(struct gk20a *g) gops->clock_gating = gp106_ops.clock_gating; gops->fifo = gp106_ops.fifo; gops->gr_ctx = gp106_ops.gr_ctx; +#ifdef CONFIG_GK20A_CTXSW_TRACE gops->fecs_trace = gp106_ops.fecs_trace; +#endif gops->mm = gp106_ops.mm; gops->pramin = 
gp106_ops.pramin; gops->therm = gp106_ops.therm; diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index cbec89bc..bb95f6db 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -619,7 +619,9 @@ int gp10b_init_hal(struct gk20a *g) gops->clock_gating = gp10b_ops.clock_gating; gops->fifo = gp10b_ops.fifo; gops->gr_ctx = gp10b_ops.gr_ctx; +#ifdef CONFIG_GK20A_CTXSW_TRACE gops->fecs_trace = gp10b_ops.fecs_trace; +#endif gops->mm = gp10b_ops.mm; gops->pramin = gp10b_ops.pramin; gops->therm = gp10b_ops.therm; -- cgit v1.2.2