diff options
author | Anton Vorontsov <avorontsov@nvidia.com> | 2015-08-19 17:27:51 -0400 |
---|---|---|
committer | Terje Bergstrom <tbergstrom@nvidia.com> | 2016-03-23 10:48:47 -0400 |
commit | 1c40d09c4c9c011c1318c328c0b4b6b17d1f537e (patch) | |
tree | 8b93fcd00739f9ada9302f06175278c9cb1d6785 /drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c | |
parent | 82da6ed595a87c8a3038eecd75880ab21dd4c5de (diff) |
gpu: nvgpu: Add support for FECS ctxsw tracing
bug 1648908
This commit adds support for FECS ctxsw tracing. Code is compiled
conditionally under CONFIG_GK20A_CTXSW_TRACE.
This feature requires an updated FECS ucode that writes one record to a ring
buffer on each context switch. On RM/Kernel side, the GPU driver reads records
from the master ring buffer and generates trace entries into a user-facing
VM ring buffer. For each record in the master ring buffer, RM/Kernel has
to retrieve the vmid+pid of the user process that submitted related work.
Features currently implemented:
- master ring buffer allocation
- debugfs to dump master ring buffer
- FECS record per context switch (with both current and new contexts)
- dedicated device for ctxsw tracing (access to VM ring buffer)
- SOF generation (and access to PTIMER)
- VM ring buffer allocation, and reconfiguration
- enable/disable tracing at user level
- event-based trace filtering
- context_ptr to vmid+pid mapping
- read system call for ctxsw dev
- mmap system call for ctxsw dev (direct access to VM ring buffer)
- poll system call for ctxsw dev
- save/restore register on ELPG/CG6
- separate user ring from FECS ring handling
Features requiring ucode changes:
- enable/disable tracing at FECS level
- actual busy time on engine (bug 1642354)
- master ring buffer threshold interrupt (P1)
- API for GPU to CPU timestamp conversion (P1)
- vmid/pid/uid based filtering (P1)
Change-Id: I8e39c648221ee0fa09d5df8524b03dca83fe24f3
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1022737
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c | 763 |
1 files changed, 763 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c new file mode 100644 index 00000000..bac36403 --- /dev/null +++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c | |||
@@ -0,0 +1,763 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | */ | ||
13 | |||
14 | #include <asm/barrier.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/kthread.h> | ||
17 | #include <linux/circ_buf.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/jiffies.h> | ||
20 | #include <linux/wait.h> | ||
21 | #include <linux/ktime.h> | ||
22 | #include <linux/nvgpu.h> | ||
23 | #include <linux/hashtable.h> | ||
24 | #include <linux/debugfs.h> | ||
25 | #include <linux/log2.h> | ||
26 | #include <uapi/linux/nvgpu.h> | ||
27 | #include "ctxsw_trace_gk20a.h" | ||
28 | #include "fecs_trace_gk20a.h" | ||
29 | #include "gk20a.h" | ||
30 | #include "gr_gk20a.h" | ||
31 | #include "hw_ctxsw_prog_gk20a.h" | ||
32 | #include "hw_gr_gk20a.h" | ||
33 | |||
34 | /* | ||
35 | * If HW circular buffer is getting too many "buffer full" conditions, | ||
36 | * increasing this constant should help (it drives Linux' internal buffer size). | ||
37 | */ | ||
/* ring slot count; must remain a power of 2 (index wrap uses NUM_RECORDS - 1) */
#define GK20A_FECS_TRACE_NUM_RECORDS		(1 << 6)
#define GK20A_FECS_TRACE_HASH_BITS		8 /* 2^8 */
/* polling period: one frame at 60 Hz */
#define GK20A_FECS_TRACE_FRAME_PERIOD_NS	(1000000000ULL/60ULL)
/* record timestamps are left-shifted by this when converted (see ring_read) */
#define GK20A_FECS_TRACE_PTIMER_SHIFT		5
42 | |||
/* Layout of one record written by FECS ucode into the master ring buffer. */
struct gk20a_fecs_trace_record {
	u32 magic_lo;		/* magic word; sometimes reused as a sequence
				 * number by experimental ucode */
	u32 magic_hi;		/* magic word; record validity is judged on
				 * this field alone (see is_valid_record) */
	u32 context_id;		/* id of the current context */
	u32 context_ptr;	/* FECS context pointer of the current context */
	u32 new_context_id;	/* id of the new (incoming) context */
	u32 new_context_ptr;	/* FECS context pointer of the new context */
	u64 ts[];		/* tagged timestamps filling the rest of the
				 * HW record (count from num_ts()) */
};
52 | |||
/* Hash-table entry mapping a FECS context_ptr to the owning process pid. */
struct gk20a_fecs_trace_hash_ent {
	u32 context_ptr;	/* hash key */
	pid_t pid;		/* pid of the process that bound the channel */
	struct hlist_node node;
};
58 | |||
/* Per-GPU FECS trace state, allocated by gk20a_fecs_trace_init(). */
struct gk20a_fecs_trace {

	struct mem_desc trace_buf;	/* memory backing the FECS master ring */
	DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);	/* context_ptr -> pid */
	struct mutex hash_lock;		/* protects pid_hash_table */
	struct mutex poll_lock;		/* serializes ring draining (poll) */
	u64 sof;			/* start-of-frame time captured at the
					 * end of the previous poll */
	u32 sof_mask; /* did we already send a SOF for this VM (bit per vmid) */

	struct task_struct *poll_task;	/* polling kthread; NULL until enabled */
};
70 | |||
71 | #ifdef CONFIG_GK20A_CTXSW_TRACE | ||
72 | static inline u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts) | ||
73 | { | ||
74 | return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32)); | ||
75 | } | ||
76 | |||
77 | static inline u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts) | ||
78 | { | ||
79 | return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32); | ||
80 | } | ||
81 | |||
82 | |||
/*
 * Derive the FECS-visible context pointer for a channel: the physical
 * address of its instance block shifted right by 12 (4 KiB granularity).
 */
static u32 gk20a_fecs_trace_fecs_context_ptr(struct channel_gk20a *ch)
{
	return (u32) (sg_phys(ch->inst_block.sgt->sgl) >> 12LL);
}
87 | |||
88 | static inline int gk20a_fecs_trace_num_ts(void) | ||
89 | { | ||
90 | return (ctxsw_prog_record_timestamp_record_size_in_bytes_v() | ||
91 | - sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64); | ||
92 | } | ||
93 | |||
94 | struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record( | ||
95 | struct gk20a_fecs_trace *trace, int idx) | ||
96 | { | ||
97 | return (struct gk20a_fecs_trace_record *) | ||
98 | ((u8 *) trace->trace_buf.cpu_va | ||
99 | + (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v())); | ||
100 | } | ||
101 | |||
102 | static bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r) | ||
103 | { | ||
104 | /* | ||
105 | * testing magic_hi should suffice. magic_lo is sometimes used | ||
106 | * as a sequence number in experimental ucode. | ||
107 | */ | ||
108 | return (r->magic_hi | ||
109 | == ctxsw_prog_record_timestamp_magic_value_hi_v_value_v()); | ||
110 | } | ||
111 | |||
/*
 * Read the ring's read (get) index from the FECS mailbox1 register.
 * The access goes through gr_gk20a_elpg_protected_call(), presumably to
 * keep ELPG from power-gating GR around the register read — confirm.
 */
static int gk20a_fecs_trace_get_read_index(struct gk20a *g)
{
	return gr_gk20a_elpg_protected_call(g,
			gk20a_readl(g, gr_fecs_mailbox1_r()));
}
117 | |||
/*
 * Read the ring's write (put) index from the FECS mailbox0 register,
 * via the same ELPG-protected wrapper as the read index.
 */
static int gk20a_fecs_trace_get_write_index(struct gk20a *g)
{
	return gr_gk20a_elpg_protected_call(g,
			gk20a_readl(g, gr_fecs_mailbox0_r()));
}
123 | |||
/*
 * Publish a new read (get) index to FECS via mailbox1.
 * Always returns 0 on the normal path.
 */
static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
{
	gk20a_dbg(gpu_dbg_ctxsw, "set read=%d", index);
	return gr_gk20a_elpg_protected_call(g,
			/* comma expression: perform the write, yield 0 */
			(gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
}
130 | |||
131 | void gk20a_fecs_trace_hash_dump(struct gk20a *g) | ||
132 | { | ||
133 | u32 bkt; | ||
134 | struct gk20a_fecs_trace_hash_ent *ent; | ||
135 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
136 | |||
137 | gk20a_dbg(gpu_dbg_ctxsw, "dumping hash table"); | ||
138 | |||
139 | mutex_lock(&trace->hash_lock); | ||
140 | hash_for_each(trace->pid_hash_table, bkt, ent, node) | ||
141 | { | ||
142 | gk20a_dbg(gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d", | ||
143 | ent, bkt, ent->context_ptr, ent->pid); | ||
144 | |||
145 | } | ||
146 | mutex_unlock(&trace->hash_lock); | ||
147 | } | ||
148 | |||
149 | static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr, pid_t pid) | ||
150 | { | ||
151 | struct gk20a_fecs_trace_hash_ent *he; | ||
152 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
153 | |||
154 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, | ||
155 | "adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid); | ||
156 | |||
157 | he = kzalloc(sizeof(*he), GFP_KERNEL); | ||
158 | if (unlikely(!he)) { | ||
159 | gk20a_warn(dev_from_gk20a(g), | ||
160 | "can't alloc new hash entry for context_ptr=%x pid=%d", | ||
161 | context_ptr, pid); | ||
162 | return -ENOMEM; | ||
163 | } | ||
164 | |||
165 | he->context_ptr = context_ptr; | ||
166 | he->pid = pid; | ||
167 | mutex_lock(&trace->hash_lock); | ||
168 | hash_add(trace->pid_hash_table, &he->node, context_ptr); | ||
169 | mutex_unlock(&trace->hash_lock); | ||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr) | ||
174 | { | ||
175 | struct hlist_node *tmp; | ||
176 | struct gk20a_fecs_trace_hash_ent *ent; | ||
177 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
178 | |||
179 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, | ||
180 | "freeing hash entry context_ptr=%x", context_ptr); | ||
181 | |||
182 | mutex_lock(&trace->hash_lock); | ||
183 | hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node, | ||
184 | context_ptr) { | ||
185 | if (ent->context_ptr == context_ptr) { | ||
186 | hash_del(&ent->node); | ||
187 | gk20a_dbg(gpu_dbg_ctxsw, | ||
188 | "freed hash entry=%p context_ptr=%x", ent, | ||
189 | ent->context_ptr); | ||
190 | kfree(ent); | ||
191 | break; | ||
192 | } | ||
193 | } | ||
194 | mutex_unlock(&trace->hash_lock); | ||
195 | } | ||
196 | |||
197 | static void gk20a_fecs_trace_free_hash_table(struct gk20a *g) | ||
198 | { | ||
199 | u32 bkt; | ||
200 | struct hlist_node *tmp; | ||
201 | struct gk20a_fecs_trace_hash_ent *ent; | ||
202 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
203 | |||
204 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace); | ||
205 | |||
206 | mutex_lock(&trace->hash_lock); | ||
207 | hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) { | ||
208 | hash_del(&ent->node); | ||
209 | kfree(ent); | ||
210 | } | ||
211 | mutex_unlock(&trace->hash_lock); | ||
212 | |||
213 | } | ||
214 | |||
215 | static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr) | ||
216 | { | ||
217 | struct gk20a_fecs_trace_hash_ent *ent; | ||
218 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
219 | pid_t pid = 0; | ||
220 | |||
221 | mutex_lock(&trace->hash_lock); | ||
222 | hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) { | ||
223 | if (ent->context_ptr == context_ptr) { | ||
224 | gk20a_dbg(gpu_dbg_ctxsw, | ||
225 | "found context_ptr=%x -> pid=%d", | ||
226 | ent->context_ptr, ent->pid); | ||
227 | pid = ent->pid; | ||
228 | break; | ||
229 | } | ||
230 | } | ||
231 | mutex_unlock(&trace->hash_lock); | ||
232 | |||
233 | return pid; | ||
234 | } | ||
235 | |||
/*
 * Converts one HW record (at 'index' in the master ring) into
 * userspace-facing trace entries and pushes them to the per-VM queue.
 * Returns 0, or -EINVAL when the record fails the magic check.
 *
 * Must run with trace->poll_lock held (sof/sof_mask are updated by the
 * poll path) — called only from gk20a_fecs_trace_poll().
 */
static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
{
	int i;
	struct nvgpu_ctxsw_trace_entry entry = { };
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	pid_t cur_pid;
	pid_t new_pid;

	/* for now, only one VM */
	const int vmid = 0;

	struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(
		trace, index);

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"consuming record trace=%p read=%d record=%p", trace, index, r);

	if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
		gk20a_warn(dev_from_gk20a(g),
			"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
			trace, index, r, r->magic_lo, r->magic_hi);
		return -EINVAL;
	}

	/* resolve both context pointers to pids (0 if unknown) */
	cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
	new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
		"context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
		r->context_ptr, cur_pid, r->new_context_ptr, new_pid);

	entry.context_id = r->context_id;
	entry.vmid = vmid;

	/* insert SOF event if needed (first record consumed this poll) */
	if (!(trace->sof_mask & BIT(vmid))) {
		entry.tag = NVGPU_CTXSW_TAG_SOF;
		entry.timestamp = trace->sof;
		entry.context_id = 0;
		entry.pid = 0;

		gk20a_dbg(gpu_dbg_ctxsw, "SOF time=%llx", entry.timestamp);
		gk20a_ctxsw_trace_write(g, &entry);
		trace->sof_mask |= BIT(vmid);
	}

	/* break out FECS record into trace events, one per timestamp slot */
	for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {

		entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
		entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
		/* scale raw HW time back into the ptimer domain */
		entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;

		gk20a_dbg(gpu_dbg_ctxsw,
			"tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
			entry.tag, entry.timestamp, r->context_id,
			r->new_context_id);

		/* attribute the event to the incoming or outgoing context */
		switch (entry.tag) {
		case NVGPU_CTXSW_TAG_RESTORE_START:
		case NVGPU_CTXSW_TAG_CONTEXT_START:
			entry.context_id = r->new_context_id;
			entry.pid = new_pid;
			break;

		case NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
		case NVGPU_CTXSW_TAG_FE_ACK:
		case NVGPU_CTXSW_TAG_FE_ACK_WFI:
		case NVGPU_CTXSW_TAG_FE_ACK_GFXP:
		case NVGPU_CTXSW_TAG_FE_ACK_CTAP:
		case NVGPU_CTXSW_TAG_FE_ACK_CILP:
		case NVGPU_CTXSW_TAG_SAVE_END:
			entry.context_id = r->context_id;
			entry.pid = cur_pid;
			break;

		default:
			/* tags are not guaranteed to start at the beginning */
			WARN_ON(entry.tag && (entry.tag != NVGPU_CTXSW_TAG_INVALID_TIMESTAMP));
			continue;
		}

		gk20a_dbg(gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
			entry.tag, entry.context_id, entry.pid);

		/* skip events with no context attached */
		if (!entry.context_id)
			continue;

		gk20a_ctxsw_trace_write(g, &entry);
	}

	gk20a_ctxsw_trace_wake_up(g, vmid);
	return 0;
}
334 | |||
335 | static int gk20a_fecs_trace_poll(struct gk20a *g) | ||
336 | { | ||
337 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
338 | |||
339 | int read = 0; | ||
340 | int write = 0; | ||
341 | int cnt; | ||
342 | int err; | ||
343 | |||
344 | err = gk20a_busy(g->dev); | ||
345 | if (unlikely(err)) | ||
346 | return err; | ||
347 | |||
348 | mutex_lock(&trace->poll_lock); | ||
349 | write = gk20a_fecs_trace_get_write_index(g); | ||
350 | if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) { | ||
351 | gk20a_err(dev_from_gk20a(g), | ||
352 | "failed to acquire write index, write=%d", write); | ||
353 | err = write; | ||
354 | goto done; | ||
355 | } | ||
356 | |||
357 | read = gk20a_fecs_trace_get_read_index(g); | ||
358 | |||
359 | cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS); | ||
360 | if (!cnt) | ||
361 | goto done; | ||
362 | |||
363 | gk20a_dbg(gpu_dbg_ctxsw, | ||
364 | "circular buffer: read=%d (mailbox=%d) write=%d cnt=%d", | ||
365 | read, gk20a_fecs_trace_get_read_index(g), write, cnt); | ||
366 | |||
367 | /* we did not send any SOF yet */ | ||
368 | trace->sof_mask = 0; | ||
369 | |||
370 | /* consume all records */ | ||
371 | while (read != write) { | ||
372 | gk20a_fecs_trace_ring_read(g, read); | ||
373 | |||
374 | /* Get to next record. */ | ||
375 | read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1); | ||
376 | gk20a_fecs_trace_set_read_index(g, read); | ||
377 | } | ||
378 | |||
379 | done: | ||
380 | /* | ||
381 | * OK, we read out all the entries... a new "frame" starts here. | ||
382 | * We remember the Start Of Frame time and insert it on the next | ||
383 | * iteration. | ||
384 | */ | ||
385 | trace->sof = gk20a_read_ptimer(g); | ||
386 | |||
387 | mutex_unlock(&trace->poll_lock); | ||
388 | gk20a_idle(g->dev); | ||
389 | return err; | ||
390 | } | ||
391 | |||
/*
 * Kthread body: drain the FECS ring once per 60 Hz frame
 * (GK20A_FECS_TRACE_FRAME_PERIOD_NS) until kthread_stop() is called.
 */
static int gk20a_fecs_trace_periodic_polling(void *arg)
{
	struct gk20a *g = (struct gk20a *)arg;
	struct timespec ts = ns_to_timespec(GK20A_FECS_TRACE_FRAME_PERIOD_NS);

	pr_info("%s: running\n", __func__);

	while (!kthread_should_stop()) {

		/* sleep first so a freshly started thread waits one period */
		hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);

		gk20a_fecs_trace_poll(g);
	}

	return 0;
}
408 | |||
409 | static int gk20a_fecs_trace_alloc_ring(struct gk20a *g) | ||
410 | { | ||
411 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
412 | |||
413 | return gk20a_gmmu_alloc(g, GK20A_FECS_TRACE_NUM_RECORDS | ||
414 | * ctxsw_prog_record_timestamp_record_size_in_bytes_v(), | ||
415 | &trace->trace_buf); | ||
416 | } | ||
417 | |||
418 | static void gk20a_fecs_trace_free_ring(struct gk20a *g) | ||
419 | { | ||
420 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
421 | |||
422 | gk20a_gmmu_free(g, &trace->trace_buf); | ||
423 | } | ||
424 | |||
425 | #ifdef CONFIG_DEBUG_FS | ||
426 | /* | ||
427 | * The sequence iterator functions. We simply use the count of the | ||
428 | * next line as our internal position. | ||
429 | */ | ||
430 | static void *gk20a_fecs_trace_debugfs_ring_seq_start( | ||
431 | struct seq_file *s, loff_t *pos) | ||
432 | { | ||
433 | if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS) | ||
434 | return NULL; | ||
435 | |||
436 | return pos; | ||
437 | } | ||
438 | |||
439 | static void *gk20a_fecs_trace_debugfs_ring_seq_next( | ||
440 | struct seq_file *s, void *v, loff_t *pos) | ||
441 | { | ||
442 | ++(*pos); | ||
443 | if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS) | ||
444 | return NULL; | ||
445 | return pos; | ||
446 | } | ||
447 | |||
/* seq_file iterator: nothing to release at end of iteration. */
static void gk20a_fecs_trace_debugfs_ring_seq_stop(
	struct seq_file *s, void *v)
{
}
452 | |||
/*
 * seq_file show: dump one ring record — the magic words always, and the
 * context ids plus all valid tagged timestamps when the record's magic
 * checks out.
 */
static int gk20a_fecs_trace_debugfs_ring_seq_show(
	struct seq_file *s, void *v)
{
	loff_t *pos = (loff_t *) v;
	/* s->private holds a struct gk20a ** (set up in ring_open) */
	struct gk20a *g = *(struct gk20a **)s->private;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(trace, *pos);
	int i;
	const u32 invalid_tag =
		ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
	u32 tag;
	u64 timestamp;

	seq_printf(s, "record #%lld (%p)\n", *pos, r);
	seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
	seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
	if (gk20a_fecs_trace_is_valid_record(r)) {
		seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
		seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
		seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
		seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
		for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
			tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
			/* skip slots FECS did not fill */
			if (tag == invalid_tag)
				continue;
			timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
			/* same ptimer scaling as the trace path */
			timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
			seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag, timestamp);
		}
	}
	return 0;
}
485 | |||
486 | /* | ||
487 | * Tie them all together into a set of seq_operations. | ||
488 | */ | ||
/* seq_file operations for the debugfs ring dump. */
const struct seq_operations gk20a_fecs_trace_debugfs_ring_seq_ops = {
	.start = gk20a_fecs_trace_debugfs_ring_seq_start,
	.next = gk20a_fecs_trace_debugfs_ring_seq_next,
	.stop = gk20a_fecs_trace_debugfs_ring_seq_stop,
	.show = gk20a_fecs_trace_debugfs_ring_seq_show
};
495 | |||
496 | /* | ||
497 | * Time to set up the file operations for our /proc file. In this case, | ||
498 | * all we need is an open function which sets up the sequence ops. | ||
499 | */ | ||
500 | |||
501 | static int gk20a_ctxsw_debugfs_ring_open(struct inode *inode, | ||
502 | struct file *file) | ||
503 | { | ||
504 | struct gk20a **p; | ||
505 | |||
506 | if (!capable(CAP_SYS_ADMIN)) | ||
507 | return -EPERM; | ||
508 | |||
509 | p = __seq_open_private(file, &gk20a_fecs_trace_debugfs_ring_seq_ops, | ||
510 | sizeof(struct gk20a *)); | ||
511 | if (!p) | ||
512 | return -ENOMEM; | ||
513 | |||
514 | *p = (struct gk20a *)inode->i_private; | ||
515 | return 0; | ||
516 | }; | ||
517 | |||
518 | /* | ||
519 | * The file operations structure contains our open function along with | ||
520 | * set of the canned seq_ ops. | ||
521 | */ | ||
/* File operations for the debugfs ring dump (seq_file based). */
const struct file_operations gk20a_fecs_trace_debugfs_ring_fops = {
	.owner = THIS_MODULE,
	.open = gk20a_ctxsw_debugfs_ring_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private
};
529 | |||
/* debugfs getter: current FECS ring read index (read-only attribute). */
static int gk20a_fecs_trace_debugfs_read(void *arg, u64 *val)
{
	*val = gk20a_fecs_trace_get_read_index((struct gk20a *)arg);
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_read_fops,
		gk20a_fecs_trace_debugfs_read, NULL, "%llu\n");
537 | |||
/*
 * debugfs getter: current FECS ring write index. Despite the name this
 * only reads the index; the attribute has no set callback.
 */
static int gk20a_fecs_trace_debugfs_write(void *arg, u64 *val)
{
	*val = gk20a_fecs_trace_get_write_index((struct gk20a *)arg);
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_write_fops,
		gk20a_fecs_trace_debugfs_write, NULL, "%llu\n");
545 | |||
/*
 * Create the ctxsw_trace_{read,write,ring} debugfs nodes under the
 * platform's debugfs directory. Creation failures are ignored, as is
 * conventional for debugfs.
 */
static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
{
	struct gk20a_platform *plat = platform_get_drvdata(g->dev);

	debugfs_create_file("ctxsw_trace_read", 0600, plat->debugfs, g,
		&gk20a_fecs_trace_debugfs_read_fops);
	debugfs_create_file("ctxsw_trace_write", 0600, plat->debugfs, g,
		&gk20a_fecs_trace_debugfs_write_fops);
	debugfs_create_file("ctxsw_trace_ring", 0600, plat->debugfs, g,
		&gk20a_fecs_trace_debugfs_ring_fops);
}
557 | |||
static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
{
	struct gk20a_platform *plat = platform_get_drvdata(g->dev);

	/*
	 * NOTE(review): this removes the entire platform debugfs directory,
	 * not just the three ctxsw_trace_* nodes created above — any sibling
	 * entries under plat->debugfs are torn down too. Confirm this is
	 * intended (e.g. only called at full driver teardown).
	 */
	debugfs_remove_recursive(plat->debugfs);
}
564 | |||
565 | #else | ||
566 | |||
/* Stubs used when CONFIG_DEBUG_FS is disabled. */
static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
{
}

static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
{
}
574 | |||
575 | #endif /* CONFIG_DEBUG_FS */ | ||
576 | |||
/*
 * Allocate and initialize the FECS trace state (master ring buffer,
 * locks, context_ptr -> pid hash table, debugfs nodes), storing it in
 * g->fecs_trace. Returns 0 or a negative errno; on failure
 * g->fecs_trace is left NULL.
 */
static int gk20a_fecs_trace_init(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace;
	int err;

	trace = kzalloc(sizeof(struct gk20a_fecs_trace), GFP_KERNEL);
	if (!trace) {
		gk20a_warn(dev_from_gk20a(g), "failed to allocate fecs_trace");
		return -ENOMEM;
	}
	g->fecs_trace = trace;

	/* ring index wrap-around in poll() relies on a power-of-2 size */
	BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
	err = gk20a_fecs_trace_alloc_ring(g);
	if (err) {
		gk20a_warn(dev_from_gk20a(g), "failed to allocate FECS ring");
		goto clean;
	}

	mutex_init(&trace->poll_lock);
	mutex_init(&trace->hash_lock);
	hash_init(trace->pid_hash_table);

	gk20a_fecs_trace_debugfs_init(g);
	return 0;

clean:
	kfree(trace);
	g->fecs_trace = NULL;
	return err;
}
608 | |||
/*
 * Bind a channel to the trace facility: write the master ring's address
 * and record count into the channel's context image header (so FECS can
 * log records for it), and register the context_ptr -> pid mapping.
 * Returns 0 or -ENOMEM.
 */
static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
		struct channel_gk20a *ch)
{
	/*
	 * map our circ_buf to the context space and store the GPU VA
	 * in the context header.
	 */

	u32 lo;
	u32 hi;
	phys_addr_t pa;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	void *ctx_ptr;
	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);

	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
			"hw_chid=%d context_ptr=%x inst_block=%llx",
			ch->hw_chid, context_ptr, gk20a_mem_phys(&ch->inst_block));

	if (!trace)
		return -ENOMEM;

	pa = gk20a_mem_phys(&trace->trace_buf);
	if (!pa)
		return -ENOMEM;

	/* CPU-map the graphics context image so its header can be patched */
	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
			PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
			pgprot_writecombine(PAGE_KERNEL));
	if (!ctx_ptr)
		return -ENOMEM;

	lo = u64_lo32(pa);
	hi = u64_hi32(pa);

	gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
		lo, GK20A_FECS_TRACE_NUM_RECORDS);

	/* timestamp buffer pointer (lo/hi) and record count, per ctxsw_prog */
	gk20a_mem_wr32(ctx_ptr
		+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
		0, lo);
	gk20a_mem_wr32(ctx_ptr
		+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
		0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
	gk20a_mem_wr32(ctx_ptr
		+ ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
		0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
			GK20A_FECS_TRACE_NUM_RECORDS));

	vunmap(ctx_ptr);
	gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);

	return 0;
}
664 | |||
665 | static int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch) | ||
666 | { | ||
667 | u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch); | ||
668 | |||
669 | gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, | ||
670 | "ch=%p context_ptr=%x", ch, context_ptr); | ||
671 | |||
672 | if (g->ops.fecs_trace.flush) | ||
673 | g->ops.fecs_trace.flush(g); | ||
674 | gk20a_fecs_trace_poll(g); | ||
675 | gk20a_fecs_trace_hash_del(g, context_ptr); | ||
676 | return 0; | ||
677 | } | ||
678 | |||
679 | static int gk20a_fecs_trace_reset(struct gk20a *g) | ||
680 | { | ||
681 | gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, ""); | ||
682 | |||
683 | if (g->ops.fecs_trace.flush) | ||
684 | g->ops.fecs_trace.flush(g); | ||
685 | gk20a_fecs_trace_poll(g); | ||
686 | return gk20a_fecs_trace_set_read_index(g, 0); | ||
687 | } | ||
688 | |||
689 | static int gk20a_fecs_trace_deinit(struct gk20a *g) | ||
690 | { | ||
691 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
692 | |||
693 | gk20a_fecs_trace_debugfs_cleanup(g); | ||
694 | kthread_stop(trace->poll_task); | ||
695 | gk20a_fecs_trace_free_ring(g); | ||
696 | gk20a_fecs_trace_free_hash_table(g); | ||
697 | |||
698 | kfree(g->fecs_trace); | ||
699 | g->fecs_trace = NULL; | ||
700 | return 0; | ||
701 | } | ||
702 | |||
/*
 * Upper bound on the number of user-facing trace entries the whole ring
 * can generate under the given filter (used to size the VM ring).
 *
 * NOTE(review): the loop bound is gk20a_fecs_trace_num_ts() — the number
 * of timestamp slots per record — while 'tag' indexes filter bits. It
 * looks like a tag-count bound may be intended here; confirm against the
 * filter definition in uapi.
 */
static int gk20a_gr_max_entries(struct gk20a *g,
		struct nvgpu_ctxsw_trace_filter *filter)
{
	int n;
	int tag;

	/* Compute number of entries per record, with given filter */
	for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
		n += (NVGPU_CTXSW_FILTER_ISSET(tag, filter) != 0);

	/* Return max number of entries generated for the whole ring */
	return n * GK20A_FECS_TRACE_NUM_RECORDS;
}
716 | |||
717 | static int gk20a_fecs_trace_enable(struct gk20a *g) | ||
718 | { | ||
719 | struct gk20a_fecs_trace *trace = g->fecs_trace; | ||
720 | struct task_struct *task; | ||
721 | |||
722 | if (!trace->poll_task) { | ||
723 | task = kthread_run(gk20a_fecs_trace_periodic_polling, g, __func__); | ||
724 | if (unlikely(IS_ERR(task))) { | ||
725 | gk20a_warn(dev_from_gk20a(g), "failed to create FECS polling task"); | ||
726 | return PTR_ERR(task); | ||
727 | } | ||
728 | trace->poll_task = task; | ||
729 | } | ||
730 | |||
731 | return 0; | ||
732 | } | ||
733 | |||
/*
 * Stop the kernel-side polling thread.
 *
 * NOTE(review): unconditionally returns -EPERM even after stopping the
 * thread. This appears deliberate — per the commit message, disabling
 * tracing at the FECS/ucode level is not yet supported, so only the
 * kernel polling half is stopped — but confirm callers expect an error
 * return here rather than 0.
 */
static int gk20a_fecs_trace_disable(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	if (trace->poll_task) {
		kthread_stop(trace->poll_task);
		trace->poll_task = NULL;
	}

	return -EPERM;
}
745 | |||
/*
 * Populate the fecs_trace HAL ops for gk20a. 'flush' stays NULL: this
 * generation's ucode has no flush operation (callers NULL-check it).
 */
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
{
	ops->fecs_trace.init = gk20a_fecs_trace_init;
	ops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
	ops->fecs_trace.enable = gk20a_fecs_trace_enable;
	ops->fecs_trace.disable = gk20a_fecs_trace_disable;
	ops->fecs_trace.reset = gk20a_fecs_trace_reset;
	ops->fecs_trace.flush = NULL;
	ops->fecs_trace.poll = gk20a_fecs_trace_poll;
	ops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
	ops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
	ops->fecs_trace.max_entries = gk20a_gr_max_entries;
}
759 | #else | ||
/* Stub when CONFIG_GK20A_CTXSW_TRACE is disabled: leave all ops unset. */
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
{
}
763 | #endif /* CONFIG_GK20A_CTXSW_TRACE */ | ||