author	Leonid Moiseichuk <lmoiseichuk@nvidia.com>	2015-05-08 08:06:42 -0400
committer	Terje Bergstrom <tbergstrom@nvidia.com>	2015-06-06 10:23:24 -0400
commit	837ceffcab417865db8adbdf1a2038ebde8ec1a5 (patch)
tree	8890ac07cd6cca9d5cff054dd899a8f6052b0669
parent	ef8fa4999fcd8c01275ef3790b0abd5e69c55ea5 (diff)
gpu: nvgpu: cyclestats mode E snapshots support
This is the kernel supporting code for cyclestats mode E. Cyclestats mode E
is implemented following the Windows design in user space and requires the
following operations to be implemented:
- attach a client to the shared hardware buffer of a device
- detach a client from the shared hardware buffer
- flush, i.e. copy available data from the hardware buffer to private
  client buffers according to the perfmon IDs assigned to clients
- perfmon ID management for user-space clients
- a NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS_SNAPSHOT capability added

Bug 1573150

Change-Id: I9e09f0fbb2be5a95c47e6d80a2e23fa839b46f9a
Signed-off-by: Leonid Moiseichuk <lmoiseichuk@nvidia.com>
Reviewed-on: http://git-master/r/740653
(cherry picked from commit 79fe89fd4cea39d8ab9dbef0558cd806ddfda87f)
Reviewed-on: http://git-master/r/753274
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
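For orientation, the user-space call flow over the new interface looks roughly like the sketch below. This is an illustrative sketch, not part of the patch: the channel fd and dma-buf fd are assumed to come from the usual nvgpu channel-open and dma-buf export paths, and the perfmon count of 4 is arbitrary.

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/nvgpu.h>

	static int cyclestats_snapshot_demo(int channel_fd, int dmabuf_fd)
	{
		struct nvgpu_cycle_stats_snapshot_args args = { 0 };

		/* attach: "extra" carries the requested perfmon count in
		 * and the first allocated perfmon ID out */
		args.cmd = NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH;
		args.dmabuf_fd = dmabuf_fd;
		args.extra = 4;
		if (ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT, &args))
			return -1;
		printf("first perfmon id: %u\n", args.extra);

		/* flush: copy pending entries from the HW buffer into the
		 * per-client fifo inside the dma-buf */
		args.cmd = NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_FLUSH;
		if (ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT, &args))
			return -1;

		/* detach: release the perfmon IDs and the shared buffer */
		args.cmd = NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH;
		return ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT, &args);
	}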
-rw-r--r--	drivers/gpu/nvgpu/Makefile	3
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.c	105
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.h	3
-rw-r--r--	drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c	759
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gk20a.c	4
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gr_gk20a.c	7
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gr_gk20a.h	32
-rw-r--r--	drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h	88
-rw-r--r--	drivers/gpu/nvgpu/gm20b/hw_perf_gm20b.h	205
-rw-r--r--	include/uapi/linux/nvgpu.h	21
10 files changed, 1225 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index f20e67b2..053cdde3 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -82,6 +82,9 @@ nvgpu-$(CONFIG_TEGRA_CLK_FRAMEWORK) += \
 nvgpu-$(CONFIG_GK20A_DEVFREQ) += \
 	gk20a/gk20a_scale.o
 
+nvgpu-$(CONFIG_GK20A_CYCLE_STATS) += \
+	gk20a/css_gr_gk20a.o
+
 ifeq ($(CONFIG_ARCH_TEGRA_18x_SOC),y)
 ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
 ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include
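Note (editor's aside, not part of the patch): the new object is compiled only when cycle stats support is configured, so a test build needs a config fragment along these lines (option names as used in this Makefile):

	CONFIG_GK20A=y
	CONFIG_GK20A_CYCLE_STATS=y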
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index c83da8b4..643adca5 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -486,6 +486,95 @@ static int gk20a_channel_cycle_stats(struct channel_gk20a *ch,
 		return -EINVAL;
 	}
 }
+
+
+static int gk20a_flush_cycle_stats_snapshot(struct channel_gk20a *ch)
+{
+	int ret;
+
+	mutex_lock(&ch->cs_client_mutex);
+	if (ch->cs_client)
+		ret = gr_gk20a_css_flush(ch->g, ch->cs_client);
+	else
+		ret = -EBADF;
+	mutex_unlock(&ch->cs_client_mutex);
+
+	return ret;
+}
+
+static int gk20a_attach_cycle_stats_snapshot(struct channel_gk20a *ch,
+				u32 dmabuf_fd,
+				u32 perfmon_id_count,
+				u32 *perfmon_id_start)
+{
+	int ret;
+
+	mutex_lock(&ch->cs_client_mutex);
+	if (ch->cs_client) {
+		ret = -EEXIST;
+	} else {
+		ret = gr_gk20a_css_attach(ch->g,
+					dmabuf_fd,
+					perfmon_id_count,
+					perfmon_id_start,
+					&ch->cs_client);
+	}
+	mutex_unlock(&ch->cs_client_mutex);
+
+	return ret;
+}
+
+static int gk20a_free_cycle_stats_snapshot(struct channel_gk20a *ch)
+{
+	int ret;
+
+	mutex_lock(&ch->cs_client_mutex);
+	if (ch->cs_client) {
+		ret = gr_gk20a_css_detach(ch->g, ch->cs_client);
+		ch->cs_client = NULL;
+	} else {
+		ret = 0;
+	}
+	mutex_unlock(&ch->cs_client_mutex);
+
+	return ret;
+}
+
+static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch,
+			struct nvgpu_cycle_stats_snapshot_args *args)
+{
+	int ret;
+
+	if (!args->dmabuf_fd)
+		return -EINVAL;
+
+	/* handle the command (most frequent cases first) */
+	switch (args->cmd) {
+	case NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_FLUSH:
+		ret = gk20a_flush_cycle_stats_snapshot(ch);
+		args->extra = 0;
+		break;
+
+	case NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH:
+		ret = gk20a_attach_cycle_stats_snapshot(ch,
+						args->dmabuf_fd,
+						args->extra,
+						&args->extra);
+		break;
+
+	case NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH:
+		ret = gk20a_free_cycle_stats_snapshot(ch);
+		args->extra = 0;
+		break;
+
+	default:
+		pr_err("cyclestats: unknown command %u\n", args->cmd);
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
 #endif
 
 static int gk20a_init_error_notifier(struct channel_gk20a *ch,
@@ -602,6 +691,7 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
 
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	gk20a_free_cycle_stats_buffer(ch);
+	gk20a_free_cycle_stats_snapshot(ch);
 #endif
 
 	channel_gk20a_free_priv_cmdbuf(ch);
@@ -1639,6 +1729,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	INIT_LIST_HEAD(&c->jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
+	mutex_init(&c->cs_client_mutex);
 #endif
 	INIT_LIST_HEAD(&c->dbg_s_list);
 	mutex_init(&c->dbg_s_lock);
@@ -2335,6 +2426,20 @@ long gk20a_channel_ioctl(struct file *filp,
 		err = gk20a_channel_events_ctrl(ch,
 			  (struct nvgpu_channel_events_ctrl_args *)buf);
 		break;
+#ifdef CONFIG_GK20A_CYCLE_STATS
+	case NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT:
+		err = gk20a_busy(dev);
+		if (err) {
+			dev_err(&dev->dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+		err = gk20a_channel_cycle_stats_snapshot(ch,
+				(struct nvgpu_cycle_stats_snapshot_args *)buf);
+		gk20a_idle(dev);
+		break;
+#endif
 	default:
 		dev_dbg(&dev->dev, "unrecognized ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 5fe03cef..f022fe36 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -133,6 +133,9 @@ struct channel_gk20a {
 		struct dma_buf *cyclestate_buffer_handler;
 		struct mutex cyclestate_buffer_mutex;
 	} cyclestate;
+
+	struct mutex cs_client_mutex;
+	struct gk20a_cs_snapshot_client *cs_client;
 #endif
 	struct mutex dbg_s_lock;
 	struct list_head dbg_s_list;
diff --git a/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c
new file mode 100644
index 00000000..7509acd7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c
@@ -0,0 +1,759 @@
+/*
+ * GK20A Cycle stats snapshots support (subsystem for gr_gk20a).
+ *
+ * Copyright (c) 2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-buf.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+
+#include "gk20a.h"
+#include "hw_perf_gk20a.h"
+#include "hw_mc_gk20a.h"
+
+
+
+/* cycle stats fifo header (must match NvSnapshotBufferFifo) */
+struct gk20a_cs_snapshot_fifo {
+	/* layout description of the buffer */
+	u32	start;
+	u32	end;
+
+	/* snafu bits */
+	u32	hw_overflow_events_occured;
+	u32	sw_overflow_events_occured;
+
+	/* the kernel copies new entries to put and
+	 * increments put. If put == get then
+	 * overflowEventsOccured++
+	 */
+	u32	put;
+	u32	_reserved10;
+	u32	_reserved11;
+	u32	_reserved12;
+
+	/* the driver/client reads from get until
+	 * put == get, then get++ */
+	u32	get;
+	u32	_reserved20;
+	u32	_reserved21;
+	u32	_reserved22;
+
+	/* unused */
+	u32	_reserved30;
+	u32	_reserved31;
+	u32	_reserved32;
+	u32	_reserved33;
+};
+
+/* cycle stats fifo entry (must match NvSnapshotBufferFifoEntry) */
+struct gk20a_cs_snapshot_fifo_entry {
+	/* global 48-bit timestamp */
+	u32	timestamp31_00:32;
+	u32	timestamp39_32:8;
+
+	/* id of perfmon, should correlate with CSS_MAX_PERFMON_IDS */
+	u32	perfmon_id:8;
+
+	/* typically samples_counter is wired to #pmtrigger count */
+	u32	samples_counter:12;
+
+	/* DS=Delay Sample, SZ=Size (0=32B, 1=16B) */
+	u32	ds:1;
+	u32	sz:1;
+	u32	zero0:1;
+	u32	zero1:1;
+
+	/* counter results */
+	u32	event_cnt:32;
+	u32	trigger0_cnt:32;
+	u32	trigger1_cnt:32;
+	u32	sample_cnt:32;
+
+	/* Local PmTrigger results for Maxwell+ or padding otherwise */
+	u16	local_trigger_b_count:16;
+	u16	book_mark_b:16;
+	u16	local_trigger_a_count:16;
+	u16	book_mark_a:16;
+};
+
+
+/* cycle stats snapshot client data (e.g. associated with channel) */
+struct gk20a_cs_snapshot_client {
+	struct list_head	list;
+	u32			dmabuf_fd;
+	struct dma_buf		*dma_handler;
+	struct gk20a_cs_snapshot_fifo	*snapshot;
+	u32			snapshot_size;
+	u32			perfmon_start;
+	u32			perfmon_count;
+};
+
+/* check whether the client owns the given perfmon */
+#define CONTAINS_PERFMON(cl, pm)			\
+		((cl)->perfmon_start <= (pm) &&		\
+		((pm) - (cl)->perfmon_start) < (cl)->perfmon_count)
+
+/* the minimal size of HW buffer - should be enough to avoid HW overflows */
+#define CSS_MIN_HW_SNAPSHOT_SIZE	(8 * 1024 * 1024)
+
+/* the minimal size of client buffer */
+#define CSS_MIN_CLIENT_SNAPSHOT_SIZE				\
+		(sizeof(struct gk20a_cs_snapshot_fifo) +	\
+		sizeof(struct gk20a_cs_snapshot_fifo_entry) * 256)
+
+/* address of fifo entry by offset */
+#define CSS_FIFO_ENTRY(fifo, offs)				\
+	((struct gk20a_cs_snapshot_fifo_entry *)(((char *)(fifo)) + (offs)))
+
+/* calculate area capacity in number of fifo entries */
+#define CSS_FIFO_ENTRY_CAPACITY(s)				\
+	(((s) - sizeof(struct gk20a_cs_snapshot_fifo))		\
+		/ sizeof(struct gk20a_cs_snapshot_fifo_entry))
+
+/* reserved to indicate failures with data */
+#define CSS_FIRST_PERFMON_ID	32
+/* should correlate with size of gk20a_cs_snapshot_fifo_entry::perfmon_id */
+#define CSS_MAX_PERFMON_IDS	256
+
+
+/* this type is used for storing bits in perfmon mask */
+typedef u32 css_perfmon_t;
+
+/* local definitions to avoid hardcoded sizes and shifts */
+#define PM_BITS		(sizeof(css_perfmon_t) * BITS_PER_BYTE)
+#define PM_BITS_MASK	(PM_BITS - 1)
+
+#define PM_BITMAP_SIZE	((CSS_MAX_PERFMON_IDS + PM_BITS - 1) / PM_BITS)
+
+#define PM_SLOT(i)	((i) / PM_BITS)
+#define PM_SHIFT(i)	((i) & PM_BITS_MASK)
+#define PM_BIT(i)	(1u << PM_SHIFT(i))
+
+#define CSS_PERFMON_GET(p, i)	(1 == ((p[PM_SLOT(i)] >> PM_SHIFT(i)) & 1))
+#define CSS_PERFMON_USE(p, i)	(p[PM_SLOT(i)] |= PM_BIT(i))
+#define CSS_PERFMON_REL(p, i)	(p[PM_SLOT(i)] &= ~PM_BIT(i))
+
+
+/* cycle stats snapshot control structure for one HW entry and many clients */
+struct gk20a_cs_snapshot {
+	css_perfmon_t		perfmon_ids[PM_BITMAP_SIZE];
+	struct list_head	clients;
+	struct mem_desc		hw_memdesc;
+	/* pointer to allocated cpu_va memory where the GPU places data */
+	struct gk20a_cs_snapshot_fifo_entry	*hw_snapshot;
+	struct gk20a_cs_snapshot_fifo_entry	*hw_end;
+	struct gk20a_cs_snapshot_fifo_entry	*hw_get;
+};
+
+/* reports whether the hw queue overflowed */
+static inline bool css_hw_get_overflow_status(struct gk20a *g)
+{
+	const u32 st = perf_pmasys_control_membuf_status_overflowed_f();
+	return st == (gk20a_readl(g, perf_pmasys_control_r()) & st);
+}
+
+/* returns the number of pending snapshot entries */
+static inline u32 css_hw_get_pending_snapshots(struct gk20a *g)
+{
+	return gk20a_readl(g, perf_pmasys_mem_bytes_r()) /
+			sizeof(struct gk20a_cs_snapshot_fifo_entry);
+}
+
+/* informs hw how many snapshots have been processed (frees up fifo space) */
+static inline void css_hw_set_handled_snapshots(struct gk20a *g, u32 done)
+{
+	if (done > 0) {
+		gk20a_writel(g, perf_pmasys_mem_bump_r(),
+		     done * sizeof(struct gk20a_cs_snapshot_fifo_entry));
+	}
+}
+
+/* disable streaming to memory */
+static void css_hw_reset_streaming(struct gk20a *g)
+{
+	u32 engine_status;
+	u32 old_pmc = gk20a_readl(g, mc_enable_r());
+
+	/* reset the perfmon */
+	gk20a_writel(g, mc_enable_r(),
+			old_pmc & ~mc_enable_perfmon_enabled_f());
+	gk20a_writel(g, mc_enable_r(), old_pmc);
+
+	/* RBUFEMPTY must be set -- otherwise we'll pick up
+	 * snapshots that have been queued up from earlier */
+	engine_status = gk20a_readl(g, perf_pmasys_enginestatus_r());
+	WARN_ON(0 == (engine_status
+			& perf_pmasys_enginestatus_rbufempty_empty_f()));
+
+	/* turn off writes */
+	gk20a_writel(g, perf_pmasys_control_r(),
+			perf_pmasys_control_membuf_clear_status_doit_f());
+
+	/* mark all pending snapshots as handled */
+	css_hw_set_handled_snapshots(g, css_hw_get_pending_snapshots(g));
+}
+
+/*
+ * WARNING: all css_gr_XXX functions are local and expected to be called
+ * from locked context (protected by cs_lock)
+ */
+
+static int css_gr_create_shared_data(struct gr_gk20a *gr)
+{
+	struct gk20a_cs_snapshot *data;
+
+	if (gr->cs_data)
+		return 0;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&data->clients);
+	gr->cs_data = data;
+
+	return 0;
+}
+
+static int css_hw_enable_snapshot(struct gr_gk20a *gr, u32 snapshot_size)
+{
+	struct gk20a *g = gr->g;
+	struct gk20a_cs_snapshot *data = gr->cs_data;
+	int ret;
+
+	u32 virt_addr_lo;
+	u32 virt_addr_hi;
+	u32 inst_pa_page;
+
+	if (data->hw_snapshot)
+		return 0;
+
+	if (snapshot_size < CSS_MIN_HW_SNAPSHOT_SIZE)
+		snapshot_size = CSS_MIN_HW_SNAPSHOT_SIZE;
+
+	ret = gk20a_gmmu_alloc_map(&g->mm.pmu.vm, snapshot_size,
+							&data->hw_memdesc);
+	if (ret)
+		return ret;
+
+	/* perf output buffer may not cross a 4GB boundary - with a separate
+	 * va smaller than that, it won't but check anyway */
+	if (!data->hw_memdesc.cpu_va ||
+		data->hw_memdesc.size < snapshot_size ||
+		data->hw_memdesc.gpu_va + u64_lo32(snapshot_size) > SZ_4G) {
+		ret = -EFAULT;
+		goto failed_allocation;
+	}
+
+	data->hw_snapshot =
+		(struct gk20a_cs_snapshot_fifo_entry *)data->hw_memdesc.cpu_va;
+	data->hw_end = data->hw_snapshot +
+		snapshot_size / sizeof(struct gk20a_cs_snapshot_fifo_entry);
+	data->hw_get = data->hw_snapshot;
+	memset(data->hw_snapshot, 0xff, snapshot_size);
+
+	/* address and size are aligned to 32 bytes, the lowest bits read back
+	 * as zeros */
+	virt_addr_lo = u64_lo32(data->hw_memdesc.gpu_va);
+	virt_addr_hi = u64_hi32(data->hw_memdesc.gpu_va);
+
+	css_hw_reset_streaming(g);
+
+	gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
+	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
+			perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
+	gk20a_writel(g, perf_pmasys_outsize_r(), snapshot_size);
+
+	/* this field is aligned to 4K */
+	inst_pa_page = gk20a_mem_phys(&g->mm.hwpm.inst_block) >> 12;
+
+	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
+	 * should be written last */
+	gk20a_writel(g, perf_pmasys_mem_block_r(),
+			perf_pmasys_mem_block_base_f(inst_pa_page) |
+			perf_pmasys_mem_block_valid_true_f() |
+			perf_pmasys_mem_block_target_lfb_f());
+
+	gk20a_dbg_info("cyclestats: buffer for hardware snapshots enabled\n");
+
+	return 0;
+
+failed_allocation:
+	if (data->hw_memdesc.size) {
+		gk20a_gmmu_unmap_free(&g->mm.pmu.vm, &data->hw_memdesc);
+		memset(&data->hw_memdesc, 0, sizeof(data->hw_memdesc));
+	}
+	data->hw_snapshot = NULL;
+
+	return ret;
+}
+
+static void css_hw_disable_snapshot(struct gr_gk20a *gr)
+{
+	struct gk20a *g = gr->g;
+	struct gk20a_cs_snapshot *data = gr->cs_data;
+
+	if (!data->hw_snapshot)
+		return;
+
+	css_hw_reset_streaming(g);
+
+	gk20a_writel(g, perf_pmasys_outbase_r(), 0);
+	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
+			perf_pmasys_outbaseupper_ptr_f(0));
+	gk20a_writel(g, perf_pmasys_outsize_r(), 0);
+
+	gk20a_writel(g, perf_pmasys_mem_block_r(),
+			perf_pmasys_mem_block_base_f(0) |
+			perf_pmasys_mem_block_valid_false_f() |
+			perf_pmasys_mem_block_target_f(0));
+
+	gk20a_gmmu_unmap_free(&g->mm.pmu.vm, &data->hw_memdesc);
+	memset(&data->hw_memdesc, 0, sizeof(data->hw_memdesc));
+	data->hw_snapshot = NULL;
+
+	gk20a_dbg_info("cyclestats: buffer for hardware snapshots disabled\n");
+}
+
+static void css_gr_free_shared_data(struct gr_gk20a *gr)
+{
+	if (gr->cs_data) {
+		/* the clients list is expected to be empty */
+		css_hw_disable_snapshot(gr);
+
+		/* release the objects */
+		kfree(gr->cs_data);
+		gr->cs_data = NULL;
+	}
+}
+
+
+static struct gk20a_cs_snapshot_client *
+css_gr_search_client(struct list_head *clients, u32 perfmon)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, clients) {
+		struct gk20a_cs_snapshot_client *client =
+			container_of(pos,
+				struct gk20a_cs_snapshot_client, list);
+		if (CONTAINS_PERFMON(client, perfmon))
+			return client;
+	}
+
+	return NULL;
+}
+
+static int css_gr_flush_snapshots(struct gr_gk20a *gr)
+{
+	struct gk20a *g = gr->g;
+	struct gk20a_cs_snapshot *css = gr->cs_data;
+	struct gk20a_cs_snapshot_client *cur;
+	u32 pending;
+
+	/* variables for iterating over HW entries */
+	u32 sid;
+	struct gk20a_cs_snapshot_fifo_entry *src;
+
+	/* due to data sharing with user space we are allowed to update
+	 * only the overflow counters and the put field in the fifo header */
+	struct gk20a_cs_snapshot_fifo *dst;
+	struct gk20a_cs_snapshot_fifo_entry *dst_get;
+	struct gk20a_cs_snapshot_fifo_entry *dst_put;
+	struct gk20a_cs_snapshot_fifo_entry *dst_head;
+	struct gk20a_cs_snapshot_fifo_entry *dst_tail;
+
+	if (!css)
+		return -EINVAL;
+
+	if (!css->hw_snapshot)
+		return -EINVAL;
+
+	if (list_empty(&css->clients))
+		return -EBADF;
+
+	/* check data available */
+	pending = css_hw_get_pending_snapshots(g);
+	if (!pending)
+		return 0;
+
+	if (css_hw_get_overflow_status(g)) {
+		struct list_head *pos;
+
+		list_for_each(pos, &css->clients) {
+			cur = container_of(pos,
+				struct gk20a_cs_snapshot_client, list);
+			cur->snapshot->hw_overflow_events_occured++;
+		}
+
+		gk20a_warn(dev_from_gk20a(g),
+			   "cyclestats: hardware overflow detected\n");
+	}
+
+	/* process all items in the HW buffer */
+	sid = 0;
+	cur = NULL;
+	dst = NULL;
+	dst_put = NULL;
+	src = css->hw_get;
+
+	/* process all completed records */
+	while (sid < pending && 0 == src->zero0) {
+		/* we may see a new perfmon_id which requires switching
+		 * to a new client -> let's forget the current one */
+		if (cur && !CONTAINS_PERFMON(cur, src->perfmon_id)) {
+			dst->put = (char *)dst_put - (char *)dst;
+			dst = NULL;
+			cur = NULL;
+		}
+
+		/* now we have to select a new current client;
+		 * the client switch rate depends on experiment activity,
+		 * but on Android it usually happens 1-2 times per flush */
+		if (!cur) {
+			cur = css_gr_search_client(&css->clients,
+							src->perfmon_id);
+			if (cur) {
+				/* found - setup all required data */
+				dst = cur->snapshot;
+				dst_get = CSS_FIFO_ENTRY(dst, dst->get);
+				dst_put = CSS_FIFO_ENTRY(dst, dst->put);
+				dst_head = CSS_FIFO_ENTRY(dst, dst->start);
+				dst_tail = CSS_FIFO_ENTRY(dst, dst->end) - 1;
+			} else {
+				/* client not found - skipping this entry */
+				gk20a_warn(dev_from_gk20a(g),
+					   "cyclestats: orphaned perfmon %u\n",
+					   src->perfmon_id);
+				goto next_hw_fifo_entry;
+			}
+		}
+
+		/* check for software overflows */
+		if (dst_put + 1 == dst_get ||
+			(dst_put == dst_tail && dst_get == dst_head)) {
+			/* no data copy, no pointer updates */
+			dst->sw_overflow_events_occured++;
+			gk20a_warn(dev_from_gk20a(g),
+				   "cyclestats: perfmon %u soft overflow\n",
+				   src->perfmon_id);
+		} else {
+			*dst_put = *src;
+			if (dst_put == dst_tail)
+				dst_put = dst_head;
+			else
+				dst_put++;
+		}
+
+next_hw_fifo_entry:
+		sid++;
+		if (++src >= css->hw_end)
+			src = css->hw_snapshot;
+	}
+
+	/* update client put pointer if necessary */
+	if (cur && dst)
+		dst->put = (char *)dst_put - (char *)dst;
+
+	/* re-set HW buffer after processing, taking wrapping into account */
+	if (css->hw_get < src) {
+		memset(css->hw_get, 0xff, (src - css->hw_get) * sizeof(*src));
+	} else {
+		memset(css->hw_snapshot, 0xff,
+				(src - css->hw_snapshot) * sizeof(*src));
+		memset(css->hw_get, 0xff,
+				(css->hw_end - css->hw_get) * sizeof(*src));
+	}
+	gr->cs_data->hw_get = src;
+	css_hw_set_handled_snapshots(g, sid);
+	if (pending != sid) {
+		/* not all entries were processed correctly. Some problems
+		 * are reported as overflows, some as orphaned perfmons,
+		 * but it is better to also notify with a summary */
+		gk20a_warn(dev_from_gk20a(g),
+			   "cyclestats: done %u from %u entries\n",
+			   sid, pending);
+	}
+
+	return 0;
+}
+
+static u32 css_gr_allocate_perfmon_ids(struct gk20a_cs_snapshot *data,
+				       u32 count)
+{
+	u32 *pids = data->perfmon_ids;
+	u32 f;
+	u32 e = CSS_MAX_PERFMON_IDS - count;
+
+	if (!count || count > CSS_MAX_PERFMON_IDS - CSS_FIRST_PERFMON_ID)
+		return 0;
+
+	for (f = CSS_FIRST_PERFMON_ID; f < e; f++) {
+		u32 slots = 0;
+		u32 cur;
+		u32 end = f + count;
+
+		/* look for a contiguous hole [f, f+count) of unused bits */
+		for (cur = f; cur < end; cur++) {
+			if (CSS_PERFMON_GET(pids, cur))
+				break;
+			slots++;
+		}
+
+		if (count == slots) {
+			/* we found a hole of unused bits with the required
+			 * length -> can occupy it for our perfmon IDs */
+			for (cur = f; cur < end; cur++)
+				CSS_PERFMON_USE(pids, cur);
+
+			return f;
+		}
+	}
+
+	return 0;
+}
+
+static u32 css_gr_release_perfmon_ids(struct gk20a_cs_snapshot *data,
+				      u32 start,
+				      u32 count)
+{
+	u32 *pids = data->perfmon_ids;
+	u32 end = start + count;
+	u32 cnt = 0;
+
+	if (start >= CSS_FIRST_PERFMON_ID && end <= CSS_MAX_PERFMON_IDS) {
+		u32 i;
+		for (i = start; i < end; i++) {
+			if (CSS_PERFMON_GET(pids, i)) {
+				CSS_PERFMON_REL(pids, i);
+				cnt++;
+			}
+		}
+	}
+
+	return cnt;
+}
+
+
+static int css_gr_free_client_data(struct gk20a_cs_snapshot *data,
+				   struct gk20a_cs_snapshot_client *client)
+{
+	int ret = 0;
+
+	list_del(&client->list);
+	if (client->perfmon_start && client->perfmon_count) {
+		if (client->perfmon_count != css_gr_release_perfmon_ids(data,
+				client->perfmon_start, client->perfmon_count))
+			ret = -EINVAL;
+	}
+	if (client->dma_handler) {
+		dma_buf_vunmap(client->dma_handler, client->snapshot);
+		dma_buf_put(client->dma_handler);
+	}
+
+	kfree(client);
+
+	return ret;
+}
+
+static int css_gr_create_client_data(struct gk20a_cs_snapshot *data,
+				     u32 dmabuf_fd, u32 perfmon_count,
+				     struct gk20a_cs_snapshot_client **client)
+{
+	struct gk20a_cs_snapshot_client *cur;
+	int ret = 0;
+
+	cur = kzalloc(sizeof(*cur), GFP_KERNEL);
+	if (!cur) {
+		ret = -ENOMEM;
+		goto failed;
+	}
+
+	cur->dmabuf_fd = dmabuf_fd;
+	cur->dma_handler = dma_buf_get(cur->dmabuf_fd);
+	if (IS_ERR(cur->dma_handler)) {
+		ret = PTR_ERR(cur->dma_handler);
+		cur->dma_handler = NULL;
+		goto failed;
+	}
+
+	cur->snapshot = (struct gk20a_cs_snapshot_fifo *)
+				dma_buf_vmap(cur->dma_handler);
+	if (!cur->snapshot) {
+		ret = -ENOMEM;
+		goto failed;
+	}
+
+	cur->snapshot_size = cur->dma_handler->size;
+	if (cur->snapshot_size < CSS_MIN_CLIENT_SNAPSHOT_SIZE) {
+		ret = -ENOMEM;
+		goto failed;
+	}
+
+	memset(cur->snapshot, 0, sizeof(*cur->snapshot));
+	cur->snapshot->start = sizeof(*cur->snapshot);
+	/* ensure that all fifo entries can fit here */
+	cur->snapshot->end =
+		CSS_FIFO_ENTRY_CAPACITY(cur->snapshot_size)
+			* sizeof(struct gk20a_cs_snapshot_fifo_entry)
+			+ sizeof(struct gk20a_cs_snapshot_fifo);
+	cur->snapshot->get = cur->snapshot->start;
+	cur->snapshot->put = cur->snapshot->start;
+
+	cur->perfmon_count = perfmon_count;
+	if (cur->perfmon_count) {
+		cur->perfmon_start = css_gr_allocate_perfmon_ids(data,
+							cur->perfmon_count);
+		if (!cur->perfmon_start) {
+			ret = -ENOENT;
+			goto failed;
+		}
+	}
+
+	list_add_tail(&cur->list, &data->clients);
+	*client = cur;
+
+	return 0;
+
+failed:
+	*client = NULL;
+	if (cur)
+		css_gr_free_client_data(data, cur);
+
+	return ret;
+}
+
+
+int gr_gk20a_css_attach(struct gk20a *g,
+			u32 dmabuf_fd,
+			u32 perfmon_count,
+			u32 *perfmon_start,
+			struct gk20a_cs_snapshot_client **cs_client)
+{
+	int ret = 0;
+	struct gr_gk20a *gr;
+
+	if (!g->allow_all)
+		return -EACCES;
+	/* we must have a placeholder to store the pointer to client data */
+	if (!cs_client)
+		return -EINVAL;
+
+	gr = &g->gr;
+	*cs_client = NULL;
+
+	mutex_lock(&gr->cs_lock);
+
+	ret = css_gr_create_shared_data(gr);
+	if (ret)
+		goto failed;
+
+	ret = css_gr_create_client_data(gr->cs_data,
+				     dmabuf_fd,
+				     perfmon_count,
+				     cs_client);
+	if (ret)
+		goto failed;
+
+	ret = css_hw_enable_snapshot(gr, (*cs_client)->snapshot_size);
+	if (ret)
+		goto failed;
+
+	if (perfmon_start)
+		*perfmon_start = (*cs_client)->perfmon_start;
+
+	mutex_unlock(&gr->cs_lock);
+
+	return 0;
+
+failed:
+	if (gr->cs_data) {
+		if (*cs_client) {
+			css_gr_free_client_data(gr->cs_data, *cs_client);
+			*cs_client = NULL;
+		}
+
+		if (list_empty(&gr->cs_data->clients))
+			css_gr_free_shared_data(gr);
+	}
+	mutex_unlock(&gr->cs_lock);
+
+	if (perfmon_start)
+		*perfmon_start = 0;
+
+	return ret;
+}
+
+int gr_gk20a_css_detach(struct gk20a *g,
+			struct gk20a_cs_snapshot_client *cs_client)
+{
+	int ret = 0;
+	struct gr_gk20a *gr;
+
+	if (!g->allow_all)
+		return -EACCES;
+
+	if (!cs_client)
+		return -EINVAL;
+
+	gr = &g->gr;
+	mutex_lock(&gr->cs_lock);
+	if (gr->cs_data) {
+		struct gk20a_cs_snapshot *data = gr->cs_data;
+
+		ret = css_gr_free_client_data(data, cs_client);
+		if (list_empty(&data->clients))
+			css_gr_free_shared_data(gr);
+	} else {
+		ret = -EBADF;
+	}
+	mutex_unlock(&gr->cs_lock);
+
+	return ret;
+}
+
+int gr_gk20a_css_flush(struct gk20a *g,
+		       struct gk20a_cs_snapshot_client *cs_client)
+{
+	int ret = 0;
+	struct gr_gk20a *gr;
+
+	if (!g->allow_all)
+		return -EACCES;
+
+	if (!cs_client)
+		return -EINVAL;
+
+	gr = &g->gr;
+	mutex_lock(&gr->cs_lock);
+	ret = css_gr_flush_snapshots(gr);
+	mutex_unlock(&gr->cs_lock);
+
+	return ret;
+}
+
+/* helper function with locking to clean up snapshot code in gr_gk20a.c */
+void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g)
+{
+	struct gr_gk20a *gr = &g->gr;
+
+	mutex_lock(&gr->cs_lock);
+	css_gr_free_shared_data(gr);
+	mutex_unlock(&gr->cs_lock);
+}
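Aside (an illustrative sketch, not part of the patch): given the fifo layout that css_gr_flush_snapshots() maintains above, a user-space client could drain its private buffer roughly as follows after a FLUSH ioctl. The mirrored struct names and the consume() callback are assumptions; start/end/put/get are byte offsets into the mmap'ed dma-buf, exactly as in struct gk20a_cs_snapshot_fifo.

	#include <stdint.h>

	/* user-space mirror of struct gk20a_cs_snapshot_fifo (illustrative) */
	struct css_fifo_header {
		uint32_t start, end;
		uint32_t hw_overflow_events_occured;
		uint32_t sw_overflow_events_occured;
		uint32_t put, _r10, _r11, _r12;
		uint32_t get, _r20, _r21, _r22;
		uint32_t _r30, _r31, _r32, _r33;
	};

	/* one 32-byte snapshot record; see the bitfield layout above */
	struct css_fifo_entry {
		uint32_t word[8];
	};

	static void css_drain(void *base,
			      void (*consume)(const struct css_fifo_entry *))
	{
		struct css_fifo_header *hdr = base;

		/* read from get until put == get, as the header comment says */
		while (hdr->get != hdr->put) {
			const struct css_fifo_entry *e = (const void *)
					((const char *)base + hdr->get);

			consume(e);

			/* advance by one entry and wrap past the last one */
			hdr->get += sizeof(*e);
			if (hdr->get >= hdr->end)
				hdr->get = hdr->start;
		}
	}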
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index cc1b221d..9c201f32 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -2011,8 +2011,10 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
 	    gk20a_platform_has_syncpoints(g->dev))
 		gpu->flags |= NVGPU_GPU_FLAGS_HAS_SYNCPOINTS;
 
-	if (IS_ENABLED(CONFIG_GK20A_CYCLE_STATS))
+	if (IS_ENABLED(CONFIG_GK20A_CYCLE_STATS)) {
 		gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS;
+		gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS_SNAPSHOT;
+	}
 
 	gpu->gpc_mask = 1;
 
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 7c007622..85d1a886 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -2847,6 +2847,8 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
 
 	gk20a_dbg_fn("");
 
+	gr_gk20a_free_cyclestats_snapshot_data(g);
+
 	gr_gk20a_free_global_ctx_buffers(g);
 
 	gk20a_gmmu_free(g, &gr->mmu_wr_mem);
@@ -4523,6 +4525,11 @@ int gk20a_init_gr_support(struct gk20a *g)
 
 	gk20a_dbg_fn("");
 
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+	mutex_init(&g->gr.cs_lock);
+	g->gr.cs_data = NULL;
+#endif
+
 	/* this is required before gr_gk20a_init_ctx_state */
 	mutex_init(&g->gr.fecs_mutex);
 
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index b2213739..0909b660 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -170,6 +170,11 @@ struct sm_info {
 	u8 tpc_index;
 };
 
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+struct gk20a_cs_snapshot_client;
+struct gk20a_cs_snapshot;
+#endif
+
 struct gr_gk20a {
 	struct gk20a *g;
 	struct {
@@ -294,6 +299,10 @@ struct gr_gk20a {
 	u32 fbp_en_mask;
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+	struct mutex cs_lock;
+	struct gk20a_cs_snapshot *cs_data;
+#endif
 };
 
 void gk20a_fecs_dump_falcon_stats(struct gk20a *g);
@@ -497,4 +506,27 @@ void gr_gk20a_free_gr_ctx(struct gk20a *g,
 int gr_gk20a_halt_pipe(struct gk20a *g);
 int gr_gk20a_debugfs_init(struct gk20a *g);
 
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+int gr_gk20a_css_attach(struct gk20a *g,	/* in - main hw structure */
+			u32 dmabuf_fd,		/* in - dma mapped memory */
+			u32 perfmon_id_count,	/* in - number of perfmons */
+			u32 *perfmon_id_start,	/* out - index of first pm */
+			/* out - pointer to client data used in later calls */
+			struct gk20a_cs_snapshot_client **css_client);
+
+int gr_gk20a_css_detach(struct gk20a *g,
+			struct gk20a_cs_snapshot_client *css_client);
+int gr_gk20a_css_flush(struct gk20a *g,
+			struct gk20a_cs_snapshot_client *css_client);
+
+void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g);
+
+#else
+/* fake empty cleanup function if no cyclestats snapshots enabled */
+static inline void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g)
+{
+	(void)g;
+}
+#endif
+
 #endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
index 65d91de6..1ca80d29 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
@@ -50,6 +50,38 @@
 #ifndef _hw_perf_gk20a_h_
 #define _hw_perf_gk20a_h_
 
+static inline u32 perf_pmasys_control_r(void)
+{
+	return 0x001b4000;
+}
+static inline u32 perf_pmasys_control_membuf_status_v(u32 r)
+{
+	return (r >> 4) & 0x1;
+}
+static inline u32 perf_pmasys_control_membuf_status_overflowed_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_control_membuf_status_overflowed_f(void)
+{
+	return 0x10;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_f(u32 v)
+{
+	return (v & 0x1) << 5;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_v(u32 r)
+{
+	return (r >> 5) & 0x1;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_doit_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_doit_f(void)
+{
+	return 0x20;
+}
 static inline u32 perf_pmasys_mem_block_r(void)
 {
 	return 0x001b4070;
@@ -74,6 +106,22 @@ static inline u32 perf_pmasys_mem_block_target_lfb_f(void)
 {
 	return 0x0;
 }
+static inline u32 perf_pmasys_mem_block_target_sys_coh_v(void)
+{
+	return 0x00000002;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_coh_f(void)
+{
+	return 0x20000000;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_ncoh_v(void)
+{
+	return 0x00000003;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_ncoh_f(void)
+{
+	return 0x30000000;
+}
 static inline u32 perf_pmasys_mem_block_valid_f(u32 v)
 {
 	return (v & 0x1) << 31;
@@ -102,6 +150,10 @@ static inline u32 perf_pmasys_outbase_r(void)
 {
 	return 0x001b4074;
 }
+static inline u32 perf_pmasys_outbase_ptr_f(u32 v)
+{
+	return (v & 0x7ffffff) << 5;
+}
 static inline u32 perf_pmasys_outbaseupper_r(void)
 {
 	return 0x001b4078;
@@ -114,4 +166,40 @@ static inline u32 perf_pmasys_outsize_r(void)
 {
 	return 0x001b407c;
 }
+static inline u32 perf_pmasys_outsize_numbytes_f(u32 v)
+{
+	return (v & 0x7ffffff) << 5;
+}
+static inline u32 perf_pmasys_mem_bytes_r(void)
+{
+	return 0x001b4084;
+}
+static inline u32 perf_pmasys_mem_bytes_numbytes_f(u32 v)
+{
+	return (v & 0xfffffff) << 4;
+}
+static inline u32 perf_pmasys_mem_bump_r(void)
+{
+	return 0x001b4088;
+}
+static inline u32 perf_pmasys_mem_bump_numbytes_f(u32 v)
+{
+	return (v & 0xfffffff) << 4;
+}
+static inline u32 perf_pmasys_enginestatus_r(void)
+{
+	return 0x001b40a4;
+}
+static inline u32 perf_pmasys_enginestatus_rbufempty_f(u32 v)
+{
+	return (v & 0x1) << 4;
+}
+static inline u32 perf_pmasys_enginestatus_rbufempty_empty_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_enginestatus_rbufempty_empty_f(void)
+{
+	return 0x10;
+}
 #endif
diff --git a/drivers/gpu/nvgpu/gm20b/hw_perf_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_perf_gm20b.h
new file mode 100644
index 00000000..b7d487a1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gm20b/hw_perf_gm20b.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ *     <x>_r(void) : Returns the offset for register <x>.
+ *
+ *     <x>_o(void) : Returns the offset for element <x>.
+ *
+ *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ *         and masked to place it at field <y> of register <x>.  This value
+ *         can be |'d with others to produce a full register value for
+ *         register <x>.
+ *
+ *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+ *         value can be ~'d and then &'d to clear the value of field <y> for
+ *         register <x>.
+ *
+ *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ *         to place it at field <y> of register <x>.  This value can be |'d
+ *         with others to produce a full register value for <x>.
+ *
+ *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ *         <x> value 'r' after being shifted to place its LSB at bit 0.
+ *         This value is suitable for direct comparison with other unshifted
+ *         values appropriate for use in field <y> of register <x>.
+ *
+ *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ *         field <y> of register <x>.  This value is suitable for direct
+ *         comparison with unshifted values appropriate for use in field <y>
+ *         of register <x>.
+ */
+#ifndef _hw_perf_gm20b_h_
+#define _hw_perf_gm20b_h_
+
+static inline u32 perf_pmasys_control_r(void)
+{
+	return 0x001b4000;
+}
+static inline u32 perf_pmasys_control_membuf_status_v(u32 r)
+{
+	return (r >> 4) & 0x1;
+}
+static inline u32 perf_pmasys_control_membuf_status_overflowed_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_control_membuf_status_overflowed_f(void)
+{
+	return 0x10;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_f(u32 v)
+{
+	return (v & 0x1) << 5;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_v(u32 r)
+{
+	return (r >> 5) & 0x1;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_doit_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_control_membuf_clear_status_doit_f(void)
+{
+	return 0x20;
+}
+static inline u32 perf_pmasys_mem_block_r(void)
+{
+	return 0x001b4070;
+}
+static inline u32 perf_pmasys_mem_block_base_f(u32 v)
+{
+	return (v & 0xfffffff) << 0;
+}
+static inline u32 perf_pmasys_mem_block_target_f(u32 v)
+{
+	return (v & 0x3) << 28;
+}
+static inline u32 perf_pmasys_mem_block_target_v(u32 r)
+{
+	return (r >> 28) & 0x3;
+}
+static inline u32 perf_pmasys_mem_block_target_lfb_v(void)
+{
+	return 0x00000000;
+}
+static inline u32 perf_pmasys_mem_block_target_lfb_f(void)
+{
+	return 0x0;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_coh_v(void)
+{
+	return 0x00000002;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_coh_f(void)
+{
+	return 0x20000000;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_ncoh_v(void)
+{
+	return 0x00000003;
+}
+static inline u32 perf_pmasys_mem_block_target_sys_ncoh_f(void)
+{
+	return 0x30000000;
+}
+static inline u32 perf_pmasys_mem_block_valid_f(u32 v)
+{
+	return (v & 0x1) << 31;
+}
+static inline u32 perf_pmasys_mem_block_valid_v(u32 r)
+{
+	return (r >> 31) & 0x1;
+}
+static inline u32 perf_pmasys_mem_block_valid_true_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_mem_block_valid_true_f(void)
+{
+	return 0x80000000;
+}
+static inline u32 perf_pmasys_mem_block_valid_false_v(void)
+{
+	return 0x00000000;
+}
+static inline u32 perf_pmasys_mem_block_valid_false_f(void)
+{
+	return 0x0;
+}
+static inline u32 perf_pmasys_outbase_r(void)
+{
+	return 0x001b4074;
+}
+static inline u32 perf_pmasys_outbase_ptr_f(u32 v)
+{
+	return (v & 0x7ffffff) << 5;
+}
+static inline u32 perf_pmasys_outbaseupper_r(void)
+{
+	return 0x001b4078;
+}
+static inline u32 perf_pmasys_outbaseupper_ptr_f(u32 v)
+{
+	return (v & 0xff) << 0;
+}
+static inline u32 perf_pmasys_outsize_r(void)
+{
+	return 0x001b407c;
+}
+static inline u32 perf_pmasys_outsize_numbytes_f(u32 v)
+{
+	return (v & 0x7ffffff) << 5;
+}
+static inline u32 perf_pmasys_mem_bytes_r(void)
+{
+	return 0x001b4084;
+}
+static inline u32 perf_pmasys_mem_bytes_numbytes_f(u32 v)
+{
+	return (v & 0xfffffff) << 4;
+}
+static inline u32 perf_pmasys_mem_bump_r(void)
+{
+	return 0x001b4088;
+}
+static inline u32 perf_pmasys_mem_bump_numbytes_f(u32 v)
+{
+	return (v & 0xfffffff) << 4;
+}
+static inline u32 perf_pmasys_enginestatus_r(void)
+{
+	return 0x001b40a4;
+}
+static inline u32 perf_pmasys_enginestatus_rbufempty_f(u32 v)
+{
+	return (v & 0x1) << 4;
+}
+static inline u32 perf_pmasys_enginestatus_rbufempty_empty_v(void)
+{
+	return 0x00000001;
+}
+static inline u32 perf_pmasys_enginestatus_rbufempty_empty_f(void)
+{
+	return 0x10;
+}
+#endif
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index e5bb0d07..1367b7be 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -106,6 +106,8 @@ struct nvgpu_gpu_zbc_query_table_args {
 #define NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS	(1 << 4)
 /* MAP_BUFFER_EX with unmapped PTE */
 #define NVGPU_GPU_FLAGS_SUPPORT_UNMAPPED_PTE	(1 << 5)
+/* NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT is available */
+#define NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS_SNAPSHOT	(1 << 6)
 
 struct nvgpu_gpu_characteristics {
 	__u32 arch;
@@ -731,6 +733,21 @@ struct nvgpu_channel_events_ctrl_args {
 #define NVGPU_IOCTL_CHANNEL_EVENTS_CTRL_CMD_ENABLE	1
 #define NVGPU_IOCTL_CHANNEL_EVENTS_CTRL_CMD_CLEAR	2
 
+/* cycle stats snapshot buffer support for mode E */
+struct nvgpu_cycle_stats_snapshot_args {
+	__u32 cmd;		/* in: command to handle */
+	__u32 dmabuf_fd;	/* in: dma buffer handler */
+	__u32 extra;		/* in/out: extra payload, e.g.
+				 * perfmon count/start */
+	__u32 pad0[1];
+};
+
+/* valid commands to control cycle stats shared buffer */
+#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_FLUSH	0
+#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH	1
+#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH	2
+
+
 #define NVGPU_IOCTL_CHANNEL_SET_NVMAP_FD \
 	_IOW(NVGPU_IOCTL_MAGIC, 5, struct nvgpu_set_nvmap_fd_args)
 #define NVGPU_IOCTL_CHANNEL_SET_TIMEOUT \
@@ -769,9 +786,11 @@ struct nvgpu_channel_events_ctrl_args {
 	_IO(NVGPU_IOCTL_MAGIC, 116)
 #define NVGPU_IOCTL_CHANNEL_EVENTS_CTRL \
 	_IOW(NVGPU_IOCTL_MAGIC, 117, struct nvgpu_channel_events_ctrl_args)
+#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT \
+	_IOWR(NVGPU_IOCTL_MAGIC, 118, struct nvgpu_cycle_stats_snapshot_args)
 
 #define NVGPU_IOCTL_CHANNEL_LAST \
-	_IOC_NR(NVGPU_IOCTL_CHANNEL_EVENTS_CTRL)
+	_IOC_NR(NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT)
 #define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_submit_gpfifo_args)
 
 /*
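Usage note (a hedged sketch, not part of the patch): user space is expected to gate the new ioctl on the capability bit above, with the characteristics assumed to be fetched through the existing NVGPU_GPU_IOCTL_GET_CHARACTERISTICS path:

	#include <stdbool.h>
	#include <linux/nvgpu.h>

	/* true when NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT may be used */
	static bool supports_css_snapshot(
			const struct nvgpu_gpu_characteristics *chars)
	{
		return (chars->flags &
			NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS_SNAPSHOT) != 0;
	}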