author    Anton Vorontsov <avorontsov@nvidia.com>    2015-08-19 17:27:51 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>    2016-03-23 10:48:47 -0400
commit    1c40d09c4c9c011c1318c328c0b4b6b17d1f537e (patch)
tree      8b93fcd00739f9ada9302f06175278c9cb1d6785 /drivers
parent    82da6ed595a87c8a3038eecd75880ab21dd4c5de (diff)
gpu: nvgpu: Add support for FECS ctxsw tracing
bug 1648908

This commit adds support for FECS ctxsw tracing. Code is compiled
conditionally under CONFIG_GK20A_CTXSW_TRACE.

This feature requires an updated FECS ucode that writes one record to a
ring buffer on each context switch. On the RM/kernel side, the GPU driver
reads records from the master ring buffer and generates trace entries in
a user-facing VM ring buffer. For each record in the master ring buffer,
the RM/kernel has to retrieve the vmid+pid of the user process that
submitted the related work.

Features currently implemented:
- master ring buffer allocation
- debugfs to dump master ring buffer
- FECS record per context switch (with both current and new contexts)
- dedicated device for ctxsw tracing (access to VM ring buffer)
- SOF generation (and access to PTIMER)
- VM ring buffer allocation and reconfiguration
- enable/disable tracing at user level
- event-based trace filtering
- context_ptr to vmid+pid mapping
- read system call for ctxsw dev
- mmap system call for ctxsw dev (direct access to VM ring buffer)
- poll system call for ctxsw dev
- save/restore register on ELPG/CG6
- separate user ring from FECS ring handling

Features requiring ucode changes:
- enable/disable tracing at FECS level
- actual busy time on engine (bug 1642354)
- master ring buffer threshold interrupt (P1)
- API for GPU to CPU timestamp conversion (P1)
- vmid/pid/uid based filtering (P1)

Change-Id: I8e39c648221ee0fa09d5df8524b03dca83fe24f3
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1022737
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
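For illustration, a minimal userspace consumer of the new ctxsw device
could look like the sketch below. This is a sketch only: the device node
name (/dev/nvhost-ctxsw-gpu) is inferred from INTERFACE_NAME and the
"-ctxsw" suffix used in gk20a_user_init(), the header location depends on
the installed kernel headers, and the entry fields shown are the ones
used by the driver code in this patch. mmap() on the same fd gives
direct access to the VM ring instead of read().

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvgpu.h>	/* NVGPU_CTXSW_IOCTL_*, trace entry layout */

int main(void)
{
	struct nvgpu_ctxsw_trace_entry e;
	int fd = open("/dev/nvhost-ctxsw-gpu", O_RDONLY);

	if (fd < 0)
		return 1;
	/* let the driver queue entries into the VM ring buffer */
	ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_ENABLE);
	/* read() returns whole entries; loop ends on error or interruption */
	while (read(fd, &e, sizeof(e)) == sizeof(e))
		printf("tag=%x pid=%lld ts=%llx\n", e.tag,
			(long long)e.pid, (unsigned long long)e.timestamp);
	ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_DISABLE);
	close(fd);
	return 0;
}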
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/gpu/nvgpu/Kconfig                      |  10
-rw-r--r--  drivers/gpu/nvgpu/Makefile                     |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c        |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c    | 586
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h    |  41
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c     | 763
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h     |  20
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c           |  19
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c                |  49
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                |  26
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c             |   8
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.c            |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h  | 190
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h       |   2
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c       |  21
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h       |  20
-rw-r--r--  drivers/gpu/nvgpu/vgpu/vgpu.c                  |   2
17 files changed, 1756 insertions, 11 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index d0e25aa2..94173976 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -54,6 +54,16 @@ config GK20A_CYCLE_STATS
 	help
 	  Say Y here to enable the cycle stats debugging features.
 
+config GK20A_CTXSW_TRACE
+	bool "Support GK20A Context Switch tracing"
+	depends on GK20A
+	default n
+	help
+	  Enable support for the GK20A Context Switch Tracing. In this mode,
+	  FECS collects timestamps for contexts loaded on GR engine. This
+	  allows tracking context switches on GR engine, as well as
+	  identifying processes that submitted work.
+
 config TEGRA_GK20A
 	bool "Enable the GK20A GPU on Tegra"
 	depends on TEGRA_GRHOST || TEGRA_HOST1X
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 932dde1a..df660eb7 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -46,6 +46,8 @@ nvgpu-y := \
 	gk20a/cde_gk20a.o \
 	gk20a/platform_gk20a_generic.o \
 	gk20a/tsg_gk20a.o \
+	gk20a/ctxsw_trace_gk20a.o \
+	gk20a/fecs_trace_gk20a.o \
 	gk20a/mc_gk20a.o \
 	gm20b/hal_gm20b.o \
 	gm20b/ltc_gm20b.o \
@@ -64,7 +66,6 @@ nvgpu-y := \
 	gm20b/debug_gm20b.o \
 	gm20b/cde_gm20b.o \
 	gm20b/therm_gm20b.o
-
 nvgpu-$(CONFIG_TEGRA_GK20A) += gk20a/platform_gk20a_tegra.o
 nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o
 
@@ -78,6 +79,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
 	vgpu/debug_vgpu.o \
 	vgpu/vgpu.o \
 	vgpu/dbg_vgpu.o \
+	vgpu/fecs_trace_vgpu.o \
 	vgpu/gk20a/vgpu_hal_gk20a.o \
 	vgpu/gk20a/vgpu_gr_gk20a.o \
 	vgpu/gm20b/vgpu_hal_gm20b.o \
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 9b1f2987..0dd1fb8b 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -28,6 +28,7 @@
 #include <linux/vmalloc.h>
 
 #include "debug_gk20a.h"
+#include "ctxsw_trace_gk20a.h"
 
 #include "gk20a.h"
 #include "dbg_gpu_gk20a.h"
@@ -920,6 +921,9 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 
 	gk20a_free_error_notifiers(ch);
 
+	if (g->ops.fecs_trace.unbind_channel)
+		g->ops.fecs_trace.unbind_channel(g, ch);
+
 	/* release channel ctx */
 	g->ops.gr.free_channel_ctx(ch);
 
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
new file mode 100644
index 00000000..9e7c04ad
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
@@ -0,0 +1,586 @@
/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/barrier.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/ktime.h>
#include <linux/nvgpu.h>
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <uapi/linux/nvgpu.h>
#include "ctxsw_trace_gk20a.h"
#include "gk20a.h"
#include "gr_gk20a.h"
#include "hw_ctxsw_prog_gk20a.h"
#include "hw_gr_gk20a.h"

#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE	(128*PAGE_SIZE)

/* Userland-facing FIFO (one global + eventually one per VM) */
struct gk20a_ctxsw_dev {
	struct gk20a *g;

	struct nvgpu_ctxsw_ring_header *hdr;
	struct nvgpu_ctxsw_trace_entry *ents;
	struct nvgpu_ctxsw_trace_filter filter;
	bool write_enabled;
	wait_queue_head_t readout_wq;
	size_t size;

	atomic_t vma_ref;

	struct mutex lock;
};

struct gk20a_ctxsw_trace {
	struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS];
};

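/*
 * Ring bookkeeping: one slot is deliberately kept empty, so that
 * write_idx == read_idx unambiguously means "empty" and
 * (write_idx + 1) % num_ents == read_idx means "full". No shared
 * count is needed between the kernel writer and the user reader.
 */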
static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr)
{
	return (hdr->write_idx == hdr->read_idx);
}

static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr)
{
	return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx;
}

static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr)
{
	return (hdr->write_idx - hdr->read_idx) % hdr->num_ents;
}

static inline int ring_space(struct nvgpu_ctxsw_ring_header *hdr)
{
	return (hdr->read_idx - hdr->write_idx - 1) % hdr->num_ents;
}

ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size,
	loff_t *off)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
	struct nvgpu_ctxsw_trace_entry __user *entry =
		(struct nvgpu_ctxsw_trace_entry *) buf;
	size_t copied = 0;
	int err;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
		"filp=%p buf=%p size=%zu", filp, buf, size);

	mutex_lock(&dev->lock);
	while (ring_is_empty(hdr)) {
		mutex_unlock(&dev->lock);
		if (filp->f_flags & O_NONBLOCK)
			return -EAGAIN;
		err = wait_event_interruptible(dev->readout_wq,
			!ring_is_empty(hdr));
		if (err)
			return err;
		mutex_lock(&dev->lock);
	}

	while (size >= sizeof(struct nvgpu_ctxsw_trace_entry)) {
		if (ring_is_empty(hdr))
			break;

		if (copy_to_user(entry, &dev->ents[hdr->read_idx],
			sizeof(*entry))) {
			mutex_unlock(&dev->lock);
			return -EFAULT;
		}

		hdr->read_idx++;
		if (hdr->read_idx >= hdr->num_ents)
			hdr->read_idx = 0;

		entry++;
		copied += sizeof(*entry);
		size -= sizeof(*entry);
	}

	gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied,
		hdr->read_idx);

	*off = hdr->read_idx;
	mutex_unlock(&dev->lock);

	return copied;
}

static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev)
{
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled");
	dev->write_enabled = true;
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev)
{
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled");
	dev->write_enabled = false;
	return 0;
}

static int gk20a_ctxsw_dev_ring_alloc(struct gk20a_ctxsw_dev *dev,
		size_t size)
{
	struct nvgpu_ctxsw_ring_header *hdr;

	if (dev->write_enabled || atomic_read(&dev->vma_ref))
		return -EBUSY;

	size = roundup(size, PAGE_SIZE);
	hdr = vmalloc_user(size);
	if (!hdr)
		return -ENOMEM;

	if (dev->hdr)
		vfree(dev->hdr);

	dev->hdr = hdr;
	dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1);
	dev->size = size;

	hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC;
	hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION;
	hdr->num_ents = (size - sizeof(struct nvgpu_ctxsw_ring_header))
		/ sizeof(struct nvgpu_ctxsw_trace_entry);
	hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry);
	hdr->drop_count = 0;
	hdr->read_idx = 0;
	hdr->write_idx = 0;
	hdr->write_seqno = 0;

	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d",
		dev->size, dev->hdr, dev->ents, hdr->num_ents);
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev,
	struct nvgpu_ctxsw_ring_setup_args *args)
{
	size_t size = args->size;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size);

	if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE)
		return -EINVAL;

	return gk20a_ctxsw_dev_ring_alloc(dev, size);
}

static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev,
	struct nvgpu_ctxsw_trace_filter_args *args)
{
	dev->filter = args->filter;
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev,
	struct nvgpu_ctxsw_trace_filter_args *args)
{
	args->filter = dev->filter;
	return 0;
}

static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev)
{
	struct gk20a *g = dev->g;
	int err;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");

	err = gk20a_busy(g->dev);
	if (err)
		return err;

	if (g->ops.fecs_trace.flush)
		err = g->ops.fecs_trace.flush(g);

	if (likely(!err))
		err = g->ops.fecs_trace.poll(g);

	gk20a_idle(g->dev);
	return err;
}

int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp)
{
	struct gk20a *g;
	struct gk20a_ctxsw_trace *trace;
	struct gk20a_ctxsw_dev *dev;
	int err;
	size_t size;
	u32 n;

	/* only one VM for now */
	const int vmid = 0;

	g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = gk20a_busy(g->dev);
	if (err)
		return err;

	trace = g->ctxsw_trace;
	if (!trace) {
		err = -ENODEV;
		goto idle;
	}

	/* Allow only one user for this device */
	dev = &trace->devs[vmid];
	mutex_lock(&dev->lock);
	if (dev->hdr) {
		err = -EBUSY;
		goto done;
	}

	/* By default, allocate ring buffer big enough to accommodate
	 * FECS records with default event filter */

	/* enable all traces by default */
	NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter);

	/* compute max number of entries generated with this filter */
	n = g->ops.fecs_trace.max_entries(g, &dev->filter);

	size = sizeof(struct nvgpu_ctxsw_ring_header) +
			n * sizeof(struct nvgpu_ctxsw_trace_entry);
	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu",
		size, n, sizeof(struct nvgpu_ctxsw_trace_entry));

	err = gk20a_ctxsw_dev_ring_alloc(dev, size);
	if (err)
		goto done;

	filp->private_data = dev;
	gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu",
		filp, dev, size);

	err = g->ops.fecs_trace.enable(g);

done:
	mutex_unlock(&dev->lock);

idle:
	gk20a_idle(g->dev);

	return err;
}

int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct gk20a *g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev);

	mutex_lock(&dev->lock);
	dev->write_enabled = false;
	if (dev->hdr) {
		vfree(dev->hdr);
		dev->hdr = NULL;
	}

	g->ops.fecs_trace.disable(g);

	mutex_unlock(&dev->lock);

	return 0;
}

long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd,
	unsigned long arg)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct gk20a *g = dev->g;
	u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE];
	int err = 0;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd));

	if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || (_IOC_NR(cmd) == 0)
		|| (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST))
		return -EINVAL;

	BUG_ON(_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE);

	memset(buf, 0, sizeof(buf));
	if (_IOC_DIR(cmd) & _IOC_WRITE) {
		if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd)))
			return -EFAULT;
	}

	mutex_lock(&dev->lock);

	switch (cmd) {
	case NVGPU_CTXSW_IOCTL_TRACE_ENABLE:
		err = gk20a_ctxsw_dev_ioctl_trace_enable(dev);
		break;
	case NVGPU_CTXSW_IOCTL_TRACE_DISABLE:
		err = gk20a_ctxsw_dev_ioctl_trace_disable(dev);
		break;
	case NVGPU_CTXSW_IOCTL_RING_SETUP:
		err = gk20a_ctxsw_dev_ioctl_ring_setup(dev,
			(struct nvgpu_ctxsw_ring_setup_args *) buf);
		break;
	case NVGPU_CTXSW_IOCTL_SET_FILTER:
		err = gk20a_ctxsw_dev_ioctl_set_filter(dev,
			(struct nvgpu_ctxsw_trace_filter_args *) buf);
		break;
	case NVGPU_CTXSW_IOCTL_GET_FILTER:
		err = gk20a_ctxsw_dev_ioctl_get_filter(dev,
			(struct nvgpu_ctxsw_trace_filter_args *) buf);
		break;
	case NVGPU_CTXSW_IOCTL_POLL:
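		/*
		 * Drop dev->lock around the poll: the FECS poll path ends
		 * in gk20a_ctxsw_trace_write(), which takes dev->lock.
		 */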
		mutex_unlock(&dev->lock);
		err = gk20a_ctxsw_dev_ioctl_poll(dev);
		mutex_lock(&dev->lock);
		break;
	default:
		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x",
			cmd);
		err = -ENOTTY;
	}

	mutex_unlock(&dev->lock);

	if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) {
		if (copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd)))
			err = -EFAULT;
	}

	return err;
}

unsigned int gk20a_ctxsw_dev_poll(struct file *filp, poll_table *wait)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
	unsigned int mask = 0;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");

	mutex_lock(&dev->lock);
	poll_wait(filp, &dev->readout_wq, wait);
	if (!ring_is_empty(hdr))
		mask |= POLLIN | POLLRDNORM;
	mutex_unlock(&dev->lock);

	return mask;
}

static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma)
{
	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;

	atomic_inc(&dev->vma_ref);
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
		atomic_read(&dev->vma_ref));
}

static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma)
{
	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;

	atomic_dec(&dev->vma_ref);
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
		atomic_read(&dev->vma_ref));
}

static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = {
	.open = gk20a_ctxsw_dev_vma_open,
	.close = gk20a_ctxsw_dev_vma_close,
};

int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct gk20a_ctxsw_dev *dev = filp->private_data;
	int ret;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx",
		vma->vm_start, vma->vm_end);

	ret = remap_vmalloc_range(vma, dev->hdr, 0);
	if (likely(!ret)) {
		vma->vm_private_data = dev;
		vma->vm_ops = &gk20a_ctxsw_dev_vma_ops;
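		/* mmap() itself never calls vm_ops->open(), so take the
		 * initial vma_ref here; unmap balances it via close() */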
		vma->vm_ops->open(vma);
	}

	return ret;
}

#ifdef CONFIG_GK20A_CTXSW_TRACE
static int gk20a_ctxsw_init_devs(struct gk20a *g)
{
	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
	struct gk20a_ctxsw_dev *dev = trace->devs;
	int i;

	for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) {
		dev->g = g;
		dev->hdr = NULL;
		dev->write_enabled = false;
		init_waitqueue_head(&dev->readout_wq);
		mutex_init(&dev->lock);
		atomic_set(&dev->vma_ref, 0);
		dev++;
	}
	return 0;
}
#endif

int gk20a_ctxsw_trace_init(struct gk20a *g)
{
#ifdef CONFIG_GK20A_CTXSW_TRACE
	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
	int err;

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace);

	if (likely(trace))
		return 0;

	trace = kzalloc(sizeof(*trace), GFP_KERNEL);
	if (unlikely(!trace))
		return -ENOMEM;
	g->ctxsw_trace = trace;

	err = gk20a_ctxsw_init_devs(g);
	if (err)
		goto fail;

	err = g->ops.fecs_trace.init(g);
	if (unlikely(err))
		goto fail;

	return 0;

fail:
	kfree(trace);
	g->ctxsw_trace = NULL;
	return err;
#else
	return 0;
#endif
}

void gk20a_ctxsw_trace_cleanup(struct gk20a *g)
{
#ifdef CONFIG_GK20A_CTXSW_TRACE
	kfree(g->ctxsw_trace);
	g->ctxsw_trace = NULL;

	g->ops.fecs_trace.deinit(g);
#endif
}

int gk20a_ctxsw_trace_write(struct gk20a *g,
		struct nvgpu_ctxsw_trace_entry *entry)
{
	struct nvgpu_ctxsw_ring_header *hdr;
	struct gk20a_ctxsw_dev *dev;
	int ret = 0;
	const char *reason;

	if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS))
		return -ENODEV;

	dev = &g->ctxsw_trace->devs[entry->vmid];
	hdr = dev->hdr;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"dev=%p hdr=%p", dev, hdr);

	mutex_lock(&dev->lock);

	if (unlikely(!hdr)) {
		/* device has been released */
		ret = -ENODEV;
		goto done;
	}

	entry->seqno = hdr->write_seqno++;

	if (!dev->write_enabled) {
		ret = -EBUSY;
		reason = "write disabled";
		goto drop;
	}

	if (unlikely(ring_is_full(hdr))) {
		ret = -ENOSPC;
		reason = "user fifo full";
		goto drop;
	}

	if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) {
		reason = "filtered out";
		goto filter;
	}

	gk20a_dbg(gpu_dbg_ctxsw,
		"seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx",
		entry->seqno, entry->context_id, entry->pid,
		entry->tag, entry->timestamp);

	dev->ents[hdr->write_idx] = *entry;

	/* ensure record is written before updating write index */
	smp_wmb();

	hdr->write_idx++;
	if (unlikely(hdr->write_idx >= hdr->num_ents))
		hdr->write_idx = 0;
	gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d",
		hdr->read_idx, hdr->write_idx, ring_len(hdr));

	mutex_unlock(&dev->lock);
	return ret;

drop:
	hdr->drop_count++;

filter:
	gk20a_dbg(gpu_dbg_ctxsw,
		"dropping seqno=%d context_id=%08x pid=%lld "
		"tag=%x time=%llx (%s)",
		entry->seqno, entry->context_id, entry->pid,
		entry->tag, entry->timestamp, reason);

done:
	mutex_unlock(&dev->lock);
	return ret;
}

void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid)
{
	struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[vmid];

	wake_up_interruptible(&dev->readout_wq);
}
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h
new file mode 100644
index 00000000..c57d95d1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h
@@ -0,0 +1,41 @@
/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#ifndef __CTXSW_TRACE_GK20A_H
#define __CTXSW_TRACE_GK20A_H

#define GK20A_CTXSW_TRACE_NUM_DEVS	1

struct gk20a;
struct nvgpu_ctxsw_trace_entry;
struct channel_gk20a;
struct channel_ctx_gk20a;
struct gk20a_ctxsw_dev;
struct gk20a_fecs_trace;

int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp);
int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp);
long gk20a_ctxsw_dev_ioctl(struct file *filp,
	unsigned int cmd, unsigned long arg);
ssize_t gk20a_ctxsw_dev_read(struct file *, char __user *, size_t, loff_t *);
unsigned int gk20a_ctxsw_dev_poll(struct file *, struct poll_table_struct *);
int gk20a_ctxsw_dev_mmap(struct file *, struct vm_area_struct *);

int gk20a_ctxsw_trace_init(struct gk20a *);
int gk20a_ctxsw_trace_setup(struct gk20a *, void *ctx_ptr);
void gk20a_ctxsw_trace_cleanup(struct gk20a *);
int gk20a_ctxsw_trace_write(struct gk20a *, struct nvgpu_ctxsw_trace_entry *);
void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid);

#endif /* __CTXSW_TRACE_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
new file mode 100644
index 00000000..bac36403
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -0,0 +1,763 @@
/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/barrier.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/ktime.h>
#include <linux/nvgpu.h>
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <uapi/linux/nvgpu.h>
#include "ctxsw_trace_gk20a.h"
#include "fecs_trace_gk20a.h"
#include "gk20a.h"
#include "gr_gk20a.h"
#include "hw_ctxsw_prog_gk20a.h"
#include "hw_gr_gk20a.h"

/*
 * If HW circular buffer is getting too many "buffer full" conditions,
 * increasing this constant should help (it drives Linux' internal buffer
 * size).
 */
#define GK20A_FECS_TRACE_NUM_RECORDS		(1 << 6)
#define GK20A_FECS_TRACE_HASH_BITS		8 /* 2^8 */
#define GK20A_FECS_TRACE_FRAME_PERIOD_NS	(1000000000ULL/60ULL)
#define GK20A_FECS_TRACE_PTIMER_SHIFT		5

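/*
 * Layout of one record in the master ring buffer, as written by the
 * (updated) FECS ucode: a magic pair, the current and new context
 * identifiers, then tagged 64-bit timestamps filling the rest of the
 * fixed-size record. Timestamps are stored with their low bits dropped,
 * which is why readers shift them left by GK20A_FECS_TRACE_PTIMER_SHIFT.
 */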
struct gk20a_fecs_trace_record {
	u32 magic_lo;
	u32 magic_hi;
	u32 context_id;
	u32 context_ptr;
	u32 new_context_id;
	u32 new_context_ptr;
	u64 ts[];
};

struct gk20a_fecs_trace_hash_ent {
	u32 context_ptr;
	pid_t pid;
	struct hlist_node node;
};

struct gk20a_fecs_trace {
	struct mem_desc trace_buf;
	DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
	struct mutex hash_lock;
	struct mutex poll_lock;
	u64 sof;
	u32 sof_mask; /* did we already send a SOF for this VM */

	struct task_struct *poll_task;
};

#ifdef CONFIG_GK20A_CTXSW_TRACE
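/* Each ts[] entry carries an event tag in the high bits of its upper
 * 32-bit word; these helpers split it into (tag, timestamp). */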
static inline u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
{
	return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
}

static inline u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
{
	return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
}

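/*
 * FECS identifies a channel by the physical address of its instance
 * block shifted right 12 bits (4K pages); use the same value as the key
 * of the context_ptr -> pid hash table.
 */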
static u32 gk20a_fecs_trace_fecs_context_ptr(struct channel_gk20a *ch)
{
	return (u32) (sg_phys(ch->inst_block.sgt->sgl) >> 12LL);
}

static inline int gk20a_fecs_trace_num_ts(void)
{
	return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
		- sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
}

struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
	struct gk20a_fecs_trace *trace, int idx)
{
	return (struct gk20a_fecs_trace_record *)
		((u8 *) trace->trace_buf.cpu_va
		+ (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
}

static bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
{
	/*
	 * testing magic_hi should suffice. magic_lo is sometimes used
	 * as a sequence number in experimental ucode.
	 */
	return (r->magic_hi
		== ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
}

static int gk20a_fecs_trace_get_read_index(struct gk20a *g)
{
	return gr_gk20a_elpg_protected_call(g,
			gk20a_readl(g, gr_fecs_mailbox1_r()));
}

static int gk20a_fecs_trace_get_write_index(struct gk20a *g)
{
	return gr_gk20a_elpg_protected_call(g,
			gk20a_readl(g, gr_fecs_mailbox0_r()));
}

static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
{
	gk20a_dbg(gpu_dbg_ctxsw, "set read=%d", index);
	return gr_gk20a_elpg_protected_call(g,
			(gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
}

void gk20a_fecs_trace_hash_dump(struct gk20a *g)
{
	u32 bkt;
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	gk20a_dbg(gpu_dbg_ctxsw, "dumping hash table");

	mutex_lock(&trace->hash_lock);
	hash_for_each(trace->pid_hash_table, bkt, ent, node) {
		gk20a_dbg(gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d",
			ent, bkt, ent->context_ptr, ent->pid);
	}
	mutex_unlock(&trace->hash_lock);
}

static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr,
	pid_t pid)
{
	struct gk20a_fecs_trace_hash_ent *he;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);

	he = kzalloc(sizeof(*he), GFP_KERNEL);
	if (unlikely(!he)) {
		gk20a_warn(dev_from_gk20a(g),
			"can't alloc new hash entry for context_ptr=%x pid=%d",
			context_ptr, pid);
		return -ENOMEM;
	}

	he->context_ptr = context_ptr;
	he->pid = pid;
	mutex_lock(&trace->hash_lock);
	hash_add(trace->pid_hash_table, &he->node, context_ptr);
	mutex_unlock(&trace->hash_lock);
	return 0;
}

static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
{
	struct hlist_node *tmp;
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"freeing hash entry context_ptr=%x", context_ptr);

	mutex_lock(&trace->hash_lock);
	hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
		context_ptr) {
		if (ent->context_ptr == context_ptr) {
			hash_del(&ent->node);
			gk20a_dbg(gpu_dbg_ctxsw,
				"freed hash entry=%p context_ptr=%x", ent,
				ent->context_ptr);
			kfree(ent);
			break;
		}
	}
	mutex_unlock(&trace->hash_lock);
}

static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
{
	u32 bkt;
	struct hlist_node *tmp;
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);

	mutex_lock(&trace->hash_lock);
	hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
		hash_del(&ent->node);
		kfree(ent);
	}
	mutex_unlock(&trace->hash_lock);
}

static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
{
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	pid_t pid = 0;

	mutex_lock(&trace->hash_lock);
	hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
		if (ent->context_ptr == context_ptr) {
			gk20a_dbg(gpu_dbg_ctxsw,
				"found context_ptr=%x -> pid=%d",
				ent->context_ptr, ent->pid);
			pid = ent->pid;
			break;
		}
	}
	mutex_unlock(&trace->hash_lock);

	return pid;
}

/*
 * Converts HW entry format to userspace-facing format and pushes it to the
 * queue.
 */
static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
{
	int i;
	struct nvgpu_ctxsw_trace_entry entry = { };
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	pid_t cur_pid;
	pid_t new_pid;

	/* for now, only one VM */
	const int vmid = 0;

	struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(
		trace, index);

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"consuming record trace=%p read=%d record=%p", trace, index, r);

	if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
		gk20a_warn(dev_from_gk20a(g),
			"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
			trace, index, r, r->magic_lo, r->magic_hi);
		return -EINVAL;
	}

	cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
	new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
		r->context_ptr, cur_pid, r->new_context_ptr, new_pid);

	entry.context_id = r->context_id;
	entry.vmid = vmid;

	/* insert SOF event if needed */
	if (!(trace->sof_mask & BIT(vmid))) {
		entry.tag = NVGPU_CTXSW_TAG_SOF;
		entry.timestamp = trace->sof;
		entry.context_id = 0;
		entry.pid = 0;

		gk20a_dbg(gpu_dbg_ctxsw, "SOF time=%llx", entry.timestamp);
		gk20a_ctxsw_trace_write(g, &entry);
		trace->sof_mask |= BIT(vmid);
	}

	/* break out FECS record into trace events */
	for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {

		entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
		entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
		entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;

		gk20a_dbg(gpu_dbg_ctxsw,
			"tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
			entry.tag, entry.timestamp, r->context_id,
			r->new_context_id);

		switch (entry.tag) {
		case NVGPU_CTXSW_TAG_RESTORE_START:
		case NVGPU_CTXSW_TAG_CONTEXT_START:
			entry.context_id = r->new_context_id;
			entry.pid = new_pid;
			break;

		case NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
		case NVGPU_CTXSW_TAG_FE_ACK:
		case NVGPU_CTXSW_TAG_FE_ACK_WFI:
		case NVGPU_CTXSW_TAG_FE_ACK_GFXP:
		case NVGPU_CTXSW_TAG_FE_ACK_CTAP:
		case NVGPU_CTXSW_TAG_FE_ACK_CILP:
		case NVGPU_CTXSW_TAG_SAVE_END:
			entry.context_id = r->context_id;
			entry.pid = cur_pid;
			break;

		default:
			/* tags are not guaranteed to start at the beginning */
			WARN_ON(entry.tag && (entry.tag != NVGPU_CTXSW_TAG_INVALID_TIMESTAMP));
			continue;
		}

		gk20a_dbg(gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
			entry.tag, entry.context_id, entry.pid);

		if (!entry.context_id)
			continue;

		gk20a_ctxsw_trace_write(g, &entry);
	}

	gk20a_ctxsw_trace_wake_up(g, vmid);
	return 0;
}

static int gk20a_fecs_trace_poll(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	int read = 0;
	int write = 0;
	int cnt;
	int err;

	err = gk20a_busy(g->dev);
	if (unlikely(err))
		return err;

	mutex_lock(&trace->poll_lock);
	write = gk20a_fecs_trace_get_write_index(g);
	if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
		gk20a_err(dev_from_gk20a(g),
			"failed to acquire write index, write=%d", write);
		err = write;
		goto done;
	}

	read = gk20a_fecs_trace_get_read_index(g);

	cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
	if (!cnt)
		goto done;

	gk20a_dbg(gpu_dbg_ctxsw,
		"circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
		read, gk20a_fecs_trace_get_read_index(g), write, cnt);

	/* we did not send any SOF yet */
	trace->sof_mask = 0;

	/* consume all records */
	while (read != write) {
		gk20a_fecs_trace_ring_read(g, read);

		/* Get to next record. */
		read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
		gk20a_fecs_trace_set_read_index(g, read);
	}

done:
	/*
	 * OK, we read out all the entries... a new "frame" starts here.
	 * We remember the Start Of Frame time and insert it on the next
	 * iteration.
	 */
	trace->sof = gk20a_read_ptimer(g);

	mutex_unlock(&trace->poll_lock);
	gk20a_idle(g->dev);
	return err;
}

static int gk20a_fecs_trace_periodic_polling(void *arg)
{
	struct gk20a *g = (struct gk20a *)arg;
	struct timespec ts = ns_to_timespec(GK20A_FECS_TRACE_FRAME_PERIOD_NS);

	pr_info("%s: running\n", __func__);

	while (!kthread_should_stop()) {

		hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);

		gk20a_fecs_trace_poll(g);
	}

	return 0;
}

static int gk20a_fecs_trace_alloc_ring(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	return gk20a_gmmu_alloc(g, GK20A_FECS_TRACE_NUM_RECORDS
			* ctxsw_prog_record_timestamp_record_size_in_bytes_v(),
			&trace->trace_buf);
}

static void gk20a_fecs_trace_free_ring(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	gk20a_gmmu_free(g, &trace->trace_buf);
}

#ifdef CONFIG_DEBUG_FS
/*
 * The sequence iterator functions. We simply use the count of the
 * next line as our internal position.
 */
static void *gk20a_fecs_trace_debugfs_ring_seq_start(
		struct seq_file *s, loff_t *pos)
{
	if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
		return NULL;

	return pos;
}

static void *gk20a_fecs_trace_debugfs_ring_seq_next(
		struct seq_file *s, void *v, loff_t *pos)
{
	++(*pos);
	if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
		return NULL;
	return pos;
}

static void gk20a_fecs_trace_debugfs_ring_seq_stop(
		struct seq_file *s, void *v)
{
}

static int gk20a_fecs_trace_debugfs_ring_seq_show(
		struct seq_file *s, void *v)
{
	loff_t *pos = (loff_t *) v;
	struct gk20a *g = *(struct gk20a **)s->private;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	struct gk20a_fecs_trace_record *r =
		gk20a_fecs_trace_get_record(trace, *pos);
	int i;
	const u32 invalid_tag =
		ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
	u32 tag;
	u64 timestamp;

	seq_printf(s, "record #%lld (%p)\n", *pos, r);
	seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
	seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
	if (gk20a_fecs_trace_is_valid_record(r)) {
		seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
		seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
		seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
		seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
		for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
			tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
			if (tag == invalid_tag)
				continue;
			timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
			timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
			seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag,
				timestamp);
		}
	}
	return 0;
}

/*
 * Tie them all together into a set of seq_operations.
 */
const struct seq_operations gk20a_fecs_trace_debugfs_ring_seq_ops = {
	.start = gk20a_fecs_trace_debugfs_ring_seq_start,
	.next = gk20a_fecs_trace_debugfs_ring_seq_next,
	.stop = gk20a_fecs_trace_debugfs_ring_seq_stop,
	.show = gk20a_fecs_trace_debugfs_ring_seq_show
};

/*
 * Set up the file operations for the debugfs file. All we need is an
 * open function that wires up the sequence ops.
 */

static int gk20a_ctxsw_debugfs_ring_open(struct inode *inode,
	struct file *file)
{
	struct gk20a **p;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	p = __seq_open_private(file, &gk20a_fecs_trace_debugfs_ring_seq_ops,
			sizeof(struct gk20a *));
	if (!p)
		return -ENOMEM;

	*p = (struct gk20a *)inode->i_private;
	return 0;
};

/*
 * The file operations structure contains our open function along with
 * set of the canned seq_ ops.
 */
const struct file_operations gk20a_fecs_trace_debugfs_ring_fops = {
	.owner = THIS_MODULE,
	.open = gk20a_ctxsw_debugfs_ring_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private
};

static int gk20a_fecs_trace_debugfs_read(void *arg, u64 *val)
{
	*val = gk20a_fecs_trace_get_read_index((struct gk20a *)arg);
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_read_fops,
		gk20a_fecs_trace_debugfs_read, NULL, "%llu\n");

static int gk20a_fecs_trace_debugfs_write(void *arg, u64 *val)
{
	*val = gk20a_fecs_trace_get_write_index((struct gk20a *)arg);
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_write_fops,
		gk20a_fecs_trace_debugfs_write, NULL, "%llu\n");

static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
{
	struct gk20a_platform *plat = platform_get_drvdata(g->dev);

	debugfs_create_file("ctxsw_trace_read", 0600, plat->debugfs, g,
		&gk20a_fecs_trace_debugfs_read_fops);
	debugfs_create_file("ctxsw_trace_write", 0600, plat->debugfs, g,
		&gk20a_fecs_trace_debugfs_write_fops);
	debugfs_create_file("ctxsw_trace_ring", 0600, plat->debugfs, g,
		&gk20a_fecs_trace_debugfs_ring_fops);
}

static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
{
	struct gk20a_platform *plat = platform_get_drvdata(g->dev);

	debugfs_remove_recursive(plat->debugfs);
}

#else

static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
{
}

static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
{
}

#endif /* CONFIG_DEBUG_FS */

static int gk20a_fecs_trace_init(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace;
	int err;

	trace = kzalloc(sizeof(struct gk20a_fecs_trace), GFP_KERNEL);
	if (!trace) {
		gk20a_warn(dev_from_gk20a(g), "failed to allocate fecs_trace");
		return -ENOMEM;
	}
	g->fecs_trace = trace;

	BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
	err = gk20a_fecs_trace_alloc_ring(g);
	if (err) {
		gk20a_warn(dev_from_gk20a(g), "failed to allocate FECS ring");
		goto clean;
	}

	mutex_init(&trace->poll_lock);
	mutex_init(&trace->hash_lock);
	hash_init(trace->pid_hash_table);

	gk20a_fecs_trace_debugfs_init(g);
	return 0;

clean:
	kfree(trace);
	g->fecs_trace = NULL;
	return err;
}

static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
		struct channel_gk20a *ch)
{
	/*
	 * map our circ_buf to the context space and store the GPU VA
	 * in the context header.
	 */

	u32 lo;
	u32 hi;
	phys_addr_t pa;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	void *ctx_ptr;
	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
			"hw_chid=%d context_ptr=%x inst_block=%llx",
			ch->hw_chid, context_ptr,
			gk20a_mem_phys(&ch->inst_block));

	if (!trace)
		return -ENOMEM;

	pa = gk20a_mem_phys(&trace->trace_buf);
	if (!pa)
		return -ENOMEM;

	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
			PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
			pgprot_writecombine(PAGE_KERNEL));
	if (!ctx_ptr)
		return -ENOMEM;

	lo = u64_lo32(pa);
	hi = u64_hi32(pa);

	gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
		lo, GK20A_FECS_TRACE_NUM_RECORDS);

	gk20a_mem_wr32(ctx_ptr
		+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
		0, lo);
	gk20a_mem_wr32(ctx_ptr
		+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
		0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
	gk20a_mem_wr32(ctx_ptr
		+ ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
		0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
			GK20A_FECS_TRACE_NUM_RECORDS));

	vunmap(ctx_ptr);
	gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);

	return 0;
}

static int gk20a_fecs_trace_unbind_channel(struct gk20a *g,
		struct channel_gk20a *ch)
{
	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
		"ch=%p context_ptr=%x", ch, context_ptr);

	if (g->ops.fecs_trace.flush)
		g->ops.fecs_trace.flush(g);
	gk20a_fecs_trace_poll(g);
	gk20a_fecs_trace_hash_del(g, context_ptr);
	return 0;
}

static int gk20a_fecs_trace_reset(struct gk20a *g)
{
	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");

	if (g->ops.fecs_trace.flush)
		g->ops.fecs_trace.flush(g);
	gk20a_fecs_trace_poll(g);
	return gk20a_fecs_trace_set_read_index(g, 0);
}

static int gk20a_fecs_trace_deinit(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	gk20a_fecs_trace_debugfs_cleanup(g);
	if (trace->poll_task)
		kthread_stop(trace->poll_task);
	gk20a_fecs_trace_free_ring(g);
	gk20a_fecs_trace_free_hash_table(g);

	kfree(g->fecs_trace);
	g->fecs_trace = NULL;
	return 0;
}

static int gk20a_gr_max_entries(struct gk20a *g,
		struct nvgpu_ctxsw_trace_filter *filter)
{
	int n;
	int tag;

	/* Compute number of entries per record, with given filter */
	for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
		n += (NVGPU_CTXSW_FILTER_ISSET(tag, filter) != 0);

	/* Return max number of entries generated for the whole ring */
	return n * GK20A_FECS_TRACE_NUM_RECORDS;
}

static int gk20a_fecs_trace_enable(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	struct task_struct *task;

	if (!trace->poll_task) {
		task = kthread_run(gk20a_fecs_trace_periodic_polling, g,
			__func__);
		if (unlikely(IS_ERR(task))) {
			gk20a_warn(dev_from_gk20a(g),
				"failed to create FECS polling task");
			return PTR_ERR(task);
		}
		trace->poll_task = task;
	}

	return 0;
}

static int gk20a_fecs_trace_disable(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	if (trace->poll_task) {
		kthread_stop(trace->poll_task);
		trace->poll_task = NULL;
	}

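	/*
	 * FECS-side disable still needs ucode support (listed in the commit
	 * message as pending), so report -EPERM even though the kernel
	 * polling task has been stopped.
	 */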
	return -EPERM;
}

void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
{
	ops->fecs_trace.init = gk20a_fecs_trace_init;
	ops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
	ops->fecs_trace.enable = gk20a_fecs_trace_enable;
	ops->fecs_trace.disable = gk20a_fecs_trace_disable;
	ops->fecs_trace.reset = gk20a_fecs_trace_reset;
	ops->fecs_trace.flush = NULL;
	ops->fecs_trace.poll = gk20a_fecs_trace_poll;
	ops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
	ops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
	ops->fecs_trace.max_entries = gk20a_gr_max_entries;
}
#else
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
{
}
#endif /* CONFIG_GK20A_CTXSW_TRACE */
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h
new file mode 100644
index 00000000..4979d6c6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h
@@ -0,0 +1,20 @@
/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#ifndef __FECS_TRACE_GK20A_H
#define __FECS_TRACE_GK20A_H

struct gpu_ops;
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops);

#endif /* __FECS_TRACE_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 661c2c38..029a713f 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -25,6 +25,7 @@
 
 #include "gk20a.h"
 #include "debug_gk20a.h"
+#include "ctxsw_trace_gk20a.h"
 #include "semaphore_gk20a.h"
 #include "hw_fifo_gk20a.h"
 #include "hw_pbdma_gk20a.h"
@@ -776,13 +777,17 @@ void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
 	if (engine_id == top_device_info_type_enum_graphics_v()) {
 		if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
 			gk20a_pmu_disable_elpg(g);
-		/*HALT_PIPELINE method, halt GR engine*/
-		if (gr_gk20a_halt_pipe(g))
-			gk20a_err(dev_from_gk20a(g),
-				"failed to HALT gr pipe");
-		/* resetting engine using mc_enable_r() is not
-		   enough, we do full init sequence */
-		gk20a_gr_reset(g);
+		/*HALT_PIPELINE method, halt GR engine*/
+		if (gr_gk20a_halt_pipe(g))
+			gk20a_err(dev_from_gk20a(g), "failed to HALT gr pipe");
+		/* resetting engine will alter read/write index.
+		 * need to flush circular buffer before re-enabling FECS.
+		 */
+		if (g->ops.fecs_trace.reset)
+			g->ops.fecs_trace.reset(g);
+		/* resetting engine using mc_enable_r() is not
+		   enough, we do full init sequence */
+		gk20a_gr_reset(g);
 		if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
 			gk20a_pmu_enable_elpg(g);
 	}
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 0cc9564b..735bf90b 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -60,6 +60,7 @@
 #include "hw_gr_gk20a.h"
 #include "hw_fb_gk20a.h"
 #include "gk20a_scale.h"
+#include "ctxsw_trace_gk20a.h"
 #include "dbg_gpu_gk20a.h"
 #include "gk20a_allocator.h"
 #include "hal.h"
@@ -80,7 +81,7 @@
 /* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */
 #define INTERFACE_NAME "nvhost%s-gpu"
 
-#define GK20A_NUM_CDEVS 6
+#define GK20A_NUM_CDEVS 7
 
 #define EMC3D_DEFAULT_RATIO 750
 
@@ -169,6 +170,19 @@ static const struct file_operations gk20a_tsg_ops = {
 	.unlocked_ioctl = gk20a_tsg_dev_ioctl,
 };
 
+static const struct file_operations gk20a_ctxsw_ops = {
+	.owner = THIS_MODULE,
+	.release = gk20a_ctxsw_dev_release,
+	.open = gk20a_ctxsw_dev_open,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = gk20a_ctxsw_dev_ioctl,
+#endif
+	.unlocked_ioctl = gk20a_ctxsw_dev_ioctl,
+	.poll = gk20a_ctxsw_dev_poll,
+	.read = gk20a_ctxsw_dev_read,
+	.mmap = gk20a_ctxsw_dev_mmap,
+};
+
 static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
 {
 	writel(v, g->sim.regs+r);
@@ -881,6 +895,10 @@ static int gk20a_pm_finalize_poweron(struct device *dev)
 		goto done;
 	}
 
+	err = gk20a_ctxsw_trace_init(g);
+	if (err)
+		gk20a_warn(dev, "could not initialize ctxsw tracing");
+
 	/* Restore the debug setting */
 	g->ops.mm.set_debug_mode(g, g->mmu_debug_ctrl);
 
@@ -1009,6 +1027,11 @@ void gk20a_user_deinit(struct platform_device *dev)
 		cdev_del(&g->tsg.cdev);
 	}
 
+	if (g->ctxsw.node) {
+		device_destroy(g->class, g->ctxsw.cdev.dev);
+		cdev_del(&g->ctxsw.cdev);
+	}
+
 	if (g->cdev_region)
 		unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS);
 
@@ -1074,6 +1097,15 @@ int gk20a_user_init(struct platform_device *dev)
 	if (err)
 		goto fail;
 
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+	err = gk20a_create_device(dev, devno++, "-ctxsw",
+				  &g->ctxsw.cdev, &g->ctxsw.node,
+				  &gk20a_ctxsw_ops);
+	if (err)
+		goto fail;
+#endif
+
+
 	return 0;
 fail:
 	gk20a_user_deinit(dev);
@@ -1554,6 +1586,8 @@ static int __exit gk20a_remove(struct platform_device *dev)
 	if (platform->has_cde)
 		gk20a_cde_destroy(g);
 
+	gk20a_ctxsw_trace_cleanup(g);
+
 	if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
 		gk20a_scale_exit(dev);
 
@@ -2091,6 +2125,19 @@ gk20a_request_firmware(struct gk20a *g, const char *fw_name)
 	return fw;
 }
 
+
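+/*
+ * Sample PTIMER with a hi/lo/hi read sequence so a low-word rollover
+ * between reads cannot yield a torn 64-bit value: if the low word's
+ * MSB is set it was sampled before any rollover, so pair it with the
+ * first hi read; otherwise pair it with the second.
+ */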
+u64 gk20a_read_ptimer(struct gk20a *g)
+{
+	u32 time_hi0 = gk20a_readl(g, timer_time_1_r());
+	u32 time_lo = gk20a_readl(g, timer_time_0_r());
+	u32 time_hi1 = gk20a_readl(g, timer_time_1_r());
+	u32 time_hi = (time_lo & (1L << 31)) ? time_hi0 : time_hi1;
+	u64 time = ((u64)time_hi << 32) | time_lo;
+
+	return time;
+}
+
+
 MODULE_LICENSE("GPL v2");
 module_init(gk20a_init);
 module_exit(gk20a_exit);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 8b87c7aa..541e7b50 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -25,6 +25,8 @@ struct channel_gk20a;
 struct gr_gk20a;
 struct sim_gk20a;
 struct gk20a_ctxsw_ucode_segments;
+struct gk20a_fecs_trace;
+struct gk20a_ctxsw_trace;
 struct acr_gm20b;
 
 #include <linux/sched.h>
@@ -373,6 +375,19 @@ struct gpu_ops {
 		bool use_dma_for_fw_bootstrap;
 	} gr_ctx;
 	struct {
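+		/*
+		 * FECS ctxsw tracing hooks; optional entries such as flush
+		 * may be NULL and are NULL-checked by callers.
+		 */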
+		int (*init)(struct gk20a *g);
+		int (*max_entries)(struct gk20a *,
+			struct nvgpu_ctxsw_trace_filter *);
+		int (*flush)(struct gk20a *g);
+		int (*poll)(struct gk20a *g);
+		int (*enable)(struct gk20a *g);
+		int (*disable)(struct gk20a *g);
+		int (*reset)(struct gk20a *g);
+		int (*bind_channel)(struct gk20a *, struct channel_gk20a *);
+		int (*unbind_channel)(struct gk20a *, struct channel_gk20a *);
+		int (*deinit)(struct gk20a *g);
+	} fecs_trace;
+	struct {
 		bool (*support_sparse)(struct gk20a *g);
 		bool (*is_debug_mode_enabled)(struct gk20a *g);
 		void (*set_debug_mode)(struct gk20a *g, bool enable);
@@ -613,6 +628,11 @@ struct gk20a {
 		struct device *node;
 	} tsg;
 
+	struct {
+		struct cdev cdev;
+		struct device *node;
+	} ctxsw;
+
 	struct mutex client_lock;
 	int client_refcount; /* open channels and ctrl nodes */
 
@@ -639,6 +659,9 @@ struct gk20a {
 
 	struct gk20a_scale_profile *scale_profile;
 
+	struct gk20a_ctxsw_trace *ctxsw_trace;
+	struct gk20a_fecs_trace *fecs_trace;
+
 	struct device_dma_parameters dma_parms;
 
 	struct gk20a_cde_app cde_app;
@@ -716,6 +739,7 @@ enum gk20a_dbg_categories {
 	gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
 	gpu_dbg_cde = BIT(10), /* cde info messages */
 	gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
+	gpu_dbg_ctxsw = BIT(12), /* ctxsw tracing */
 	gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
 };
 
@@ -962,4 +986,6 @@ static inline u32 scale_ptimer(u32 timeout , u32 scale10x)
 	else
 		return (timeout * 10) / scale10x;
 }
+
+u64 gk20a_read_ptimer(struct gk20a *g);
 #endif /* GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 60bba0b8..08f1d921 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -56,6 +56,7 @@
 #include "debug_gk20a.h"
 #include "semaphore_gk20a.h"
 #include "platform_gk20a.h"
+#include "ctxsw_trace_gk20a.h"
 
 #define BLK_SIZE (256)
 
@@ -2855,6 +2856,13 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
 				"fail to load golden ctx image");
 			goto out;
 		}
+		if (g->ops.fecs_trace.bind_channel) {
+			err = g->ops.fecs_trace.bind_channel(g, c);
+			if (err) {
+				gk20a_warn(dev_from_gk20a(g),
+					"fail to bind channel for ctxsw trace");
+			}
+		}
 		c->first_init = true;
 	}
 
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
index a9ad970a..9718aad2 100644
--- a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -22,6 +22,7 @@
 #include "gk20a_gating_reglist.h"
 #include "channel_gk20a.h"
 #include "gr_ctx_gk20a.h"
+#include "fecs_trace_gk20a.h"
 #include "mm_gk20a.h"
 #include "mc_gk20a.h"
 #include "pmu_gk20a.h"
@@ -57,6 +58,7 @@ int gk20a_init_hal(struct gk20a *g)
57 gk20a_init_mc(gops); 58 gk20a_init_mc(gops);
58 gk20a_init_ltc(gops); 59 gk20a_init_ltc(gops);
59 gk20a_init_gr_ops(gops); 60 gk20a_init_gr_ops(gops);
61 gk20a_init_fecs_trace_ops(gops);
60 gk20a_init_fb(gops); 62 gk20a_init_fb(gops);
61 gk20a_init_fifo(gops); 63 gk20a_init_fifo(gops);
62 gk20a_init_ce2(gops); 64 gk20a_init_ce2(gops);
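
The gk20a_init_fecs_trace_ops() entry point wired in above is declared in fecs_trace_gk20a.h. A plausible shape, assuming it follows the same pattern as the other gk20a_init_*() hooks; the gk20a_fecs_trace_* implementation names are assumptions for illustration, not taken from this diff:

	void gk20a_init_fecs_trace_ops(struct gpu_ops *gops)
	{
		/* Populate the fecs_trace HAL with the native gk20a
		 * implementations (hypothetical names; the real ones
		 * live in fecs_trace_gk20a.c, not shown in this hunk). */
		gops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
		gops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
		gops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
	}
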
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
index 39cbbb58..da555f7c 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -246,4 +246,192 @@ static inline u32 ctxsw_prog_main_image_context_id_o(void)
 {
 	return 0x000000f0;
 }
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_o(void)
+{
+	return 0x000000ac;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(u32 v)
+{
+	return (v & 0xffff) << 0;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(void)
+{
+	return 0x000000b0;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_v_m(void)
+{
+	return 0xfffffff << 0;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_m(void)
+{
+	return 0x3 << 28;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f(void)
+{
+	return 0x0;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(void)
+{
+	return 0x20000000;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(void)
+{
+	return 0x30000000;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(void)
+{
+	return 0x000000b4;
+}
+static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(u32 v)
+{
+	return (v & 0xffffffff) << 0;
+}
+static inline u32 ctxsw_prog_record_timestamp_record_size_in_bytes_v(void)
+{
+	return 0x00000080;
+}
+static inline u32 ctxsw_prog_record_timestamp_record_size_in_words_v(void)
+{
+	return 0x00000020;
+}
+static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_o(void)
+{
+	return 0x00000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_v_value_v(void)
+{
+	return 0x00000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_o(void)
+{
+	return 0x00000004;
+}
+static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_v_value_v(void)
+{
+	return 0x600dbeef;
+}
+static inline u32 ctxsw_prog_record_timestamp_context_id_o(void)
+{
+	return 0x00000008;
+}
+static inline u32 ctxsw_prog_record_timestamp_context_ptr_o(void)
+{
+	return 0x0000000c;
+}
+static inline u32 ctxsw_prog_record_timestamp_new_context_id_o(void)
+{
+	return 0x00000010;
+}
+static inline u32 ctxsw_prog_record_timestamp_new_context_ptr_o(void)
+{
+	return 0x00000014;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_lo_o(void)
+{
+	return 0x00000018;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_o(void)
+{
+	return 0x0000001c;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_v(u32 r)
+{
+	return (r >> 0) & 0xffffff;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_m(void)
+{
+	return 0xff << 24;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_v(u32 r)
+{
+	return (r >> 24) & 0xff;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_f(void)
+{
+	return 0x1000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_v(void)
+{
+	return 0x00000002;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_f(void)
+{
+	return 0x2000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_v(void)
+{
+	return 0x0000000a;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_f(void)
+{
+	return 0xa000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_v(void)
+{
+	return 0x0000000b;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_f(void)
+{
+	return 0xb000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_v(void)
+{
+	return 0x0000000c;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_f(void)
+{
+	return 0xc000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_v(void)
+{
+	return 0x0000000d;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_f(void)
+{
+	return 0xd000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_v(void)
+{
+	return 0x00000003;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_f(void)
+{
+	return 0x3000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_v(void)
+{
+	return 0x00000004;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_f(void)
+{
+	return 0x4000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_v(void)
+{
+	return 0x00000005;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_f(void)
+{
+	return 0x5000000;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v(void)
+{
+	return 0x000000ff;
+}
+static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_f(void)
+{
+	return 0xff000000;
+}
 #endif
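
The accessors above fully describe the 128-byte FECS timestamp record: a magic pair at 0x00/0x04 (high word 0x600dbeef), current and new context id/ptr at 0x08-0x14, and a 56-bit PTIMER timestamp at 0x18/0x1c whose top byte carries an event tag. A minimal sketch of a record decoder built only from these accessors; the function itself, 'base', and 'idx' are assumed for illustration:

	/* Hypothetical reader: validate and decode one FECS timestamp record.
	 * 'base' is a CPU mapping of the ring buffer, 'idx' a record index. */
	static int decode_fecs_record(void *base, u32 idx, u64 *ts, u32 *tag)
	{
		u32 *rec = (u32 *)((u8 *)base +
			idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v());
		u32 hi = rec[ctxsw_prog_record_timestamp_timestamp_hi_o() / 4];
		u32 lo = rec[ctxsw_prog_record_timestamp_timestamp_lo_o() / 4];

		if (rec[ctxsw_prog_record_timestamp_magic_value_hi_o() / 4] !=
				ctxsw_prog_record_timestamp_magic_value_hi_v_value_v())
			return -EINVAL; /* not a record written by FECS */

		*tag = ctxsw_prog_record_timestamp_timestamp_hi_tag_v(hi);
		if (*tag == ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v())
			return -EINVAL; /* FECS flagged the timestamp as invalid */

		/* 56-bit timestamp: low word plus the 24 payload bits of hi */
		*ts = ((u64)ctxsw_prog_record_timestamp_timestamp_hi_v_v(hi) << 32) | lo;
		return 0;
	}
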
diff --git a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
index dbbc914f..4cb36cbe 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2013-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c
new file mode 100644
index 00000000..cb955811
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/string.h>
+#include "gk20a/gk20a.h"
+#include "fecs_trace_vgpu.h"
+
+void vgpu_init_fecs_trace_ops(struct gpu_ops *ops)
+{
+	memset(&ops->fecs_trace, 0, sizeof(ops->fecs_trace));
+}
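
Zeroing ops->fecs_trace leaves every fecs_trace hook NULL under virtualization, so the NULL-guarded dispatch sites degrade to no-ops on vGPU; e.g. the bind path added to gk20a_alloc_obj_ctx() above simply skips tracing:

	/* On vGPU, bind_channel is NULL after the memset above: */
	if (g->ops.fecs_trace.bind_channel)	/* false on vGPU -> no trace bind */
		err = g->ops.fecs_trace.bind_channel(g, c);
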
diff --git a/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h
new file mode 100644
index 00000000..1aace1fe
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __FECS_TRACE_VGPU_H
+#define __FECS_TRACE_VGPU_H
+
+struct gpu_ops;
+void vgpu_init_fecs_trace_ops(struct gpu_ops *ops);
+
+#endif /* __FECS_TRACE_VGPU_H */
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index 4cc61cb1..57f510ca 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -18,6 +18,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/pm_runtime.h>
 #include "vgpu/vgpu.h"
+#include "vgpu/fecs_trace_vgpu.h"
 #include "gk20a/debug_gk20a.h"
 #include "gk20a/hal_gk20a.h"
 #include "gk20a/hw_mc_gk20a.h"
@@ -259,6 +260,7 @@ void vgpu_init_hal_common(struct gk20a *g)
 	vgpu_init_ltc_ops(gops);
 	vgpu_init_mm_ops(gops);
 	vgpu_init_debug_ops(gops);
+	vgpu_init_fecs_trace_ops(gops);
 }
 
 static int vgpu_init_hal(struct gk20a *g)