Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c | 586
1 file changed, 586 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
new file mode 100644
index 00000000..9e7c04ad
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
@@ -0,0 +1,586 @@
/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <asm/barrier.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/ktime.h>
#include <linux/nvgpu.h>
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
/* used directly below: vmalloc_user()/vfree(), copy_{to,from}_user(), poll_wait() */
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <uapi/linux/nvgpu.h>
#include "ctxsw_trace_gk20a.h"
#include "gk20a.h"
#include "gr_gk20a.h"
#include "hw_ctxsw_prog_gk20a.h"
#include "hw_gr_gk20a.h"

#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE	(128*PAGE_SIZE)

/* Userland-facing FIFO (one global + eventually one per VM) */
struct gk20a_ctxsw_dev {
	struct gk20a *g;

	struct nvgpu_ctxsw_ring_header *hdr;
	struct nvgpu_ctxsw_trace_entry *ents;
	struct nvgpu_ctxsw_trace_filter filter;
	bool write_enabled;
	wait_queue_head_t readout_wq;
	size_t size;

	atomic_t vma_ref;

	struct mutex lock;
};


struct gk20a_ctxsw_trace {
	struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS];
};

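/*
 * Ring conventions: the buffer shared with userspace is a plain FIFO.
 * The producer only advances write_idx, the consumer only advances
 * read_idx, both wrap at num_ents, and one slot is always left unused so
 * that a full ring can be told apart from an empty one.
 */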
static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr)
{
	return (hdr->write_idx == hdr->read_idx);
}

static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr)
{
	return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx;
}

static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr)
{
	return (hdr->write_idx - hdr->read_idx) % hdr->num_ents;
}

static inline int ring_space(struct nvgpu_ctxsw_ring_header *hdr)
{
	return (hdr->read_idx - hdr->write_idx - 1) % hdr->num_ents;
}

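/*
 * read() hands out whole entries only and advances read_idx as it copies.
 * On an empty ring it blocks on readout_wq, or returns -EAGAIN when the
 * file was opened with O_NONBLOCK.
 */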
ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size,
	loff_t *off)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
	struct nvgpu_ctxsw_trace_entry __user *entry =
		(struct nvgpu_ctxsw_trace_entry __user *) buf;
	size_t copied = 0;
	int err;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
		"filp=%p buf=%p size=%zu", filp, buf, size);

	mutex_lock(&dev->lock);
	while (ring_is_empty(hdr)) {
		mutex_unlock(&dev->lock);
		if (filp->f_flags & O_NONBLOCK)
			return -EAGAIN;
		err = wait_event_interruptible(dev->readout_wq,
			!ring_is_empty(hdr));
		if (err)
			return err;
		mutex_lock(&dev->lock);
	}

	while (size >= sizeof(struct nvgpu_ctxsw_trace_entry)) {
		if (ring_is_empty(hdr))
			break;

		if (copy_to_user(entry, &dev->ents[hdr->read_idx],
			sizeof(*entry))) {
			mutex_unlock(&dev->lock);
			return -EFAULT;
		}

		hdr->read_idx++;
		if (hdr->read_idx >= hdr->num_ents)
			hdr->read_idx = 0;

		entry++;
		copied += sizeof(*entry);
		size -= sizeof(*entry);
	}

	gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied,
		hdr->read_idx);

	*off = hdr->read_idx;
	mutex_unlock(&dev->lock);

	return copied;
}

static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev)
{
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled");
	dev->write_enabled = true;
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev)
{
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled");
	dev->write_enabled = false;
	return 0;
}

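/*
 * The ring is a single vmalloc_user() allocation: the header sits at the
 * start and the entry array follows right behind it (dev->ents = hdr + 1),
 * so the whole buffer can later be handed to userspace with one mmap().
 */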
static int gk20a_ctxsw_dev_ring_alloc(struct gk20a_ctxsw_dev *dev,
		size_t size)
{
	struct nvgpu_ctxsw_ring_header *hdr;

	/* the ring cannot be replaced while it is mapped or tracing is on */
	if (dev->write_enabled || atomic_read(&dev->vma_ref))
		return -EBUSY;

	size = roundup(size, PAGE_SIZE);
	hdr = vmalloc_user(size);
	if (!hdr)
		return -ENOMEM;

	if (dev->hdr)
		vfree(dev->hdr);

	dev->hdr = hdr;
	dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1);
	dev->size = size;

	hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC;
	hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION;
	hdr->num_ents = (size - sizeof(struct nvgpu_ctxsw_ring_header))
		/ sizeof(struct nvgpu_ctxsw_trace_entry);
	hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry);
	hdr->drop_count = 0;
	hdr->read_idx = 0;
	hdr->write_idx = 0;
	hdr->write_seqno = 0;

	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d",
		dev->size, dev->hdr, dev->ents, hdr->num_ents);
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev,
	struct nvgpu_ctxsw_ring_setup_args *args)
{
	size_t size = args->size;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size);

	if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE)
		return -EINVAL;

	return gk20a_ctxsw_dev_ring_alloc(dev, size);
}

static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev,
	struct nvgpu_ctxsw_trace_filter_args *args)
{
	dev->filter = args->filter;
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev,
	struct nvgpu_ctxsw_trace_filter_args *args)
{
	args->filter = dev->filter;
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev)
{
	struct gk20a *g = dev->g;
	int err;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");

	err = gk20a_busy(g->dev);
	if (err)
		return err;

	/* flush is an optional per-chip hook; only call it when it is set */
	if (g->ops.fecs_trace.flush)
		err = g->ops.fecs_trace.flush(g);

	if (likely(!err))
		err = g->ops.fecs_trace.poll(g);

	gk20a_idle(g->dev);
	return err;
}

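/*
 * open() enables all trace events, sizes the default ring from
 * ops.fecs_trace.max_entries() for that filter, allocates it and turns on
 * FECS tracing. Only one opener per device: a non-NULL hdr means -EBUSY.
 */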
int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp)
{
	struct gk20a *g;
	struct gk20a_ctxsw_trace *trace;
	struct gk20a_ctxsw_dev *dev;
	int err;
	size_t size;
	u32 n;

	/* only one VM for now */
	const int vmid = 0;

	g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = gk20a_busy(g->dev);
	if (err)
		return err;

	trace = g->ctxsw_trace;
	if (!trace) {
		err = -ENODEV;
		goto idle;
	}

	/* Allow only one user for this device */
	dev = &trace->devs[vmid];
	mutex_lock(&dev->lock);
	if (dev->hdr) {
		err = -EBUSY;
		goto done;
	}

	/* By default, allocate ring buffer big enough to accommodate
	 * FECS records with default event filter */

	/* enable all traces by default */
	NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter);

	/* compute max number of entries generated with this filter */
	n = g->ops.fecs_trace.max_entries(g, &dev->filter);

	size = sizeof(struct nvgpu_ctxsw_ring_header) +
		n * sizeof(struct nvgpu_ctxsw_trace_entry);
	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu",
		size, n, sizeof(struct nvgpu_ctxsw_trace_entry));

	err = gk20a_ctxsw_dev_ring_alloc(dev, size);
	if (err)
		goto done;

	filp->private_data = dev;
	gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu",
		filp, dev, size);

	err = g->ops.fecs_trace.enable(g);

done:
	mutex_unlock(&dev->lock);

idle:
	gk20a_idle(g->dev);

	return err;
}

int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct gk20a *g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev);

	mutex_lock(&dev->lock);
	dev->write_enabled = false;
	if (dev->hdr) {
		vfree(dev->hdr);
		dev->hdr = NULL;
	}

	g->ops.fecs_trace.disable(g);

	mutex_unlock(&dev->lock);

	return 0;
}

long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd,
	unsigned long arg)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct gk20a *g = dev->g;
	u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE];
	int err = 0;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd));

	if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || (_IOC_NR(cmd) == 0)
		|| (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST))
		return -EINVAL;

	BUG_ON(_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE);

	memset(buf, 0, sizeof(buf));
	if (_IOC_DIR(cmd) & _IOC_WRITE) {
		if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd)))
			return -EFAULT;
	}

	mutex_lock(&dev->lock);

	switch (cmd) {
	case NVGPU_CTXSW_IOCTL_TRACE_ENABLE:
		err = gk20a_ctxsw_dev_ioctl_trace_enable(dev);
		break;
	case NVGPU_CTXSW_IOCTL_TRACE_DISABLE:
		err = gk20a_ctxsw_dev_ioctl_trace_disable(dev);
		break;
	case NVGPU_CTXSW_IOCTL_RING_SETUP:
		err = gk20a_ctxsw_dev_ioctl_ring_setup(dev,
			(struct nvgpu_ctxsw_ring_setup_args *) buf);
		break;
	case NVGPU_CTXSW_IOCTL_SET_FILTER:
		err = gk20a_ctxsw_dev_ioctl_set_filter(dev,
			(struct nvgpu_ctxsw_trace_filter_args *) buf);
		break;
	case NVGPU_CTXSW_IOCTL_GET_FILTER:
		err = gk20a_ctxsw_dev_ioctl_get_filter(dev,
			(struct nvgpu_ctxsw_trace_filter_args *) buf);
		break;
	case NVGPU_CTXSW_IOCTL_POLL:
		/* drop dev->lock: the poll path can call back into
		 * gk20a_ctxsw_trace_write(), which takes it again */
		mutex_unlock(&dev->lock);
		err = gk20a_ctxsw_dev_ioctl_poll(dev);
		mutex_lock(&dev->lock);
		break;
	default:
		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x",
			cmd);
		err = -ENOTTY;
	}

	mutex_unlock(&dev->lock);

	if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) {
		if (copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd)))
			err = -EFAULT;
	}

	return err;
}

unsigned int gk20a_ctxsw_dev_poll(struct file *filp, poll_table *wait)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
	unsigned int mask = 0;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");

	mutex_lock(&dev->lock);
	poll_wait(filp, &dev->readout_wq, wait);
	if (!ring_is_empty(hdr))
		mask |= POLLIN | POLLRDNORM;
	mutex_unlock(&dev->lock);

	return mask;
}

static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma)
{
	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;

	atomic_inc(&dev->vma_ref);
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
		atomic_read(&dev->vma_ref));
}

static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma)
{
	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;

	atomic_dec(&dev->vma_ref);
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
		atomic_read(&dev->vma_ref));
}

static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = {
	.open = gk20a_ctxsw_dev_vma_open,
	.close = gk20a_ctxsw_dev_vma_close,
};

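/*
 * mmap() maps the vmalloc'd ring straight into the caller's address space.
 * vma_ref counts live mappings (ring_alloc() refuses to replace a mapped
 * ring); the vm_ops above keep the count in sync across fork()/munmap(),
 * and the initial mapping invokes .open by hand since the core mmap path
 * does not do that for a freshly created vma.
 */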
int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	int ret;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx",
		vma->vm_start, vma->vm_end);

	ret = remap_vmalloc_range(vma, dev->hdr, 0);
	if (likely(!ret)) {
		vma->vm_private_data = dev;
		vma->vm_ops = &gk20a_ctxsw_dev_vma_ops;
		vma->vm_ops->open(vma);
	}

	return ret;
}

#ifdef CONFIG_GK20A_CTXSW_TRACE
static int gk20a_ctxsw_init_devs(struct gk20a *g)
{
	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
	struct gk20a_ctxsw_dev *dev = trace->devs;
	int i;

	for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) {
		dev->g = g;
		dev->hdr = NULL;
		dev->write_enabled = false;
		init_waitqueue_head(&dev->readout_wq);
		mutex_init(&dev->lock);
		atomic_set(&dev->vma_ref, 0);
		dev++;
	}
	return 0;
}
#endif

int gk20a_ctxsw_trace_init(struct gk20a *g)
{
#ifdef CONFIG_GK20A_CTXSW_TRACE
	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
	int err;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace);

	if (likely(trace))
		return 0;

	trace = kzalloc(sizeof(*trace), GFP_KERNEL);
	if (unlikely(!trace))
		return -ENOMEM;
	g->ctxsw_trace = trace;

	err = gk20a_ctxsw_init_devs(g);
	if (err)
		goto fail;

	err = g->ops.fecs_trace.init(g);
	if (unlikely(err))
		goto fail;

	return 0;

fail:
	kfree(trace);
	g->ctxsw_trace = NULL;
	return err;
#else
	return 0;
#endif
}

void gk20a_ctxsw_trace_cleanup(struct gk20a *g)
{
#ifdef CONFIG_GK20A_CTXSW_TRACE
	kfree(g->ctxsw_trace);
	g->ctxsw_trace = NULL;

	g->ops.fecs_trace.deinit(g);
#endif
}

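/*
 * Producer side, fed by the FECS trace poll path: stamp a sequence number,
 * drop the record if writes are disabled, the ring is full or the tag is
 * filtered out, otherwise copy it into the ring and publish the new
 * write_idx (smp_wmb() orders the copy against the index update seen by a
 * concurrent mmap reader).
 */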
int gk20a_ctxsw_trace_write(struct gk20a *g,
		struct nvgpu_ctxsw_trace_entry *entry)
{
	struct nvgpu_ctxsw_ring_header *hdr;
	struct gk20a_ctxsw_dev *dev;
	int ret = 0;
	const char *reason;

	if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS))
		return -ENODEV;

	dev = &g->ctxsw_trace->devs[entry->vmid];
	hdr = dev->hdr;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"dev=%p hdr=%p", dev, hdr);

	mutex_lock(&dev->lock);

	if (unlikely(!hdr)) {
		/* device has been released */
		ret = -ENODEV;
		goto done;
	}

	entry->seqno = hdr->write_seqno++;

	if (!dev->write_enabled) {
		ret = -EBUSY;
		reason = "write disabled";
		goto drop;
	}

	if (unlikely(ring_is_full(hdr))) {
		ret = -ENOSPC;
		reason = "user fifo full";
		goto drop;
	}

	if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) {
		reason = "filtered out";
		goto filter;
	}

	gk20a_dbg(gpu_dbg_ctxsw,
		"seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx",
		entry->seqno, entry->context_id, entry->pid,
		entry->tag, entry->timestamp);

	dev->ents[hdr->write_idx] = *entry;

	/* ensure record is written before updating write index */
	smp_wmb();

	hdr->write_idx++;
	if (unlikely(hdr->write_idx >= hdr->num_ents))
		hdr->write_idx = 0;
	gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d",
		hdr->read_idx, hdr->write_idx, ring_len(hdr));

	mutex_unlock(&dev->lock);
	return ret;

drop:
	hdr->drop_count++;

filter:
	gk20a_dbg(gpu_dbg_ctxsw,
		"dropping seqno=%d context_id=%08x pid=%lld "
		"tag=%x time=%llx (%s)",
		entry->seqno, entry->context_id, entry->pid,
		entry->tag, entry->timestamp, reason);

done:
	mutex_unlock(&dev->lock);
	return ret;
}

void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid)
{
	struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[vmid];

	wake_up_interruptible(&dev->readout_wq);
}