From 1c40d09c4c9c011c1318c328c0b4b6b17d1f537e Mon Sep 17 00:00:00 2001
From: Anton Vorontsov
Date: Wed, 19 Aug 2015 14:27:51 -0700
Subject: gpu: nvgpu: Add support for FECS ctxsw tracing

bug 1648908

This commit adds support for FECS ctxsw tracing. Code is compiled
conditionally under CONFIG_GK20A_CTXSW_TRACE.

This feature requires an updated FECS ucode that writes one record to a
ring buffer on each context switch. On the RM/Kernel side, the GPU driver
reads records from the master ring buffer and generates trace entries
into a user-facing VM ring buffer. For each record in the master ring
buffer, RM/Kernel has to retrieve the vmid+pid of the user process that
submitted the related work.

Features currently implemented:
- master ring buffer allocation
- debugfs to dump master ring buffer
- FECS record per context switch (with both current and new contexts)
- dedicated device for ctxsw tracing (access to VM ring buffer)
- SOF generation (and access to PTIMER)
- VM ring buffer allocation and reconfiguration
- enable/disable tracing at user level
- event-based trace filtering
- context_ptr to vmid+pid mapping
- read system call for ctxsw dev
- mmap system call for ctxsw dev (direct access to VM ring buffer)
- poll system call for ctxsw dev
- save/restore register on ELPG/CG6
- separate user ring from FECS ring handling

Features requiring ucode changes:
- enable/disable tracing at FECS level
- actual busy time on engine (bug 1642354)
- master ring buffer threshold interrupt (P1)
- API for GPU to CPU timestamp conversion (P1)
- vmid/pid/uid based filtering (P1)

Change-Id: I8e39c648221ee0fa09d5df8524b03dca83fe24f3
Signed-off-by: Thomas Fleury
Reviewed-on: http://git-master/r/1022737
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
---
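Note on the ring arithmetic: gk20a_fecs_trace_poll() below derives the
number of pending records from the FECS read/write mailbox indices with
CIRC_CNT() and advances the read index under a power-of-two mask. The
following standalone sketch (illustrative userspace C, not part of the
patch; NUM_RECORDS stands in for GK20A_FECS_TRACE_NUM_RECORDS and
CIRC_CNT mirrors the macro from <linux/circ_buf.h>) models that logic:

/*
 * Standalone model of the poll loop's ring arithmetic; illustrative only.
 */
#include <stdio.h>

#define NUM_RECORDS	(1 << 6)
#define CIRC_CNT(head, tail, size) (((head) - (tail)) & ((size) - 1))

static int consume(int read, int write)
{
	int cnt = CIRC_CNT(write, read, NUM_RECORDS);

	printf("pending records: %d\n", cnt);
	while (read != write) {
		/* gk20a_fecs_trace_ring_read(g, read) would consume here */
		read = (read + 1) & (NUM_RECORDS - 1);
	}
	return read;	/* would be written back to the FECS read mailbox */
}

int main(void)
{
	/* writer has wrapped past the end of the 64-record ring */
	printf("new read index: %d\n", consume(60, 4));
	return 0;
}

For example, with read=60 and write=4, CIRC_CNT reports 8 pending
records (60..63 then 0..3) and the loop leaves the read index at 4.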
 drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c | 763 +++++++++++++++++++++++++++++
 1 file changed, 763 insertions(+)
 create mode 100644 drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c

diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
new file mode 100644
index 00000000..bac36403
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/circ_buf.h>
+#include <linux/hashtable.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
+#include <linux/hrtimer.h>
+#include <linux/time.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/nvgpu.h>
+
+#include "ctxsw_trace_gk20a.h"
+#include "fecs_trace_gk20a.h"
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "hw_ctxsw_prog_gk20a.h"
+#include "hw_gr_gk20a.h"
+
+/*
+ * If HW circular buffer is getting too many "buffer full" conditions,
+ * increasing this constant should help (it drives Linux' internal buffer
+ * size).
+ */
+#define GK20A_FECS_TRACE_NUM_RECORDS		(1 << 6)
+#define GK20A_FECS_TRACE_HASH_BITS		8 /* 2^8 */
+#define GK20A_FECS_TRACE_FRAME_PERIOD_NS	(1000000000ULL/60ULL)
+#define GK20A_FECS_TRACE_PTIMER_SHIFT		5
+
+struct gk20a_fecs_trace_record {
+	u32 magic_lo;
+	u32 magic_hi;
+	u32 context_id;
+	u32 context_ptr;
+	u32 new_context_id;
+	u32 new_context_ptr;
+	u64 ts[];
+};
+
+struct gk20a_fecs_trace_hash_ent {
+	u32 context_ptr;
+	pid_t pid;
+	struct hlist_node node;
+};
+
+struct gk20a_fecs_trace {
+	struct mem_desc trace_buf;
+	DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
+	struct mutex hash_lock;
+	struct mutex poll_lock;
+	u64 sof;
+	u32 sof_mask; /* did we already send a SOF for this VM */
+
+	struct task_struct *poll_task;
+};
+
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+static inline u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
+{
+	return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
+}
+
+static inline u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
+{
+	return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
+}
+
+static u32 gk20a_fecs_trace_fecs_context_ptr(struct channel_gk20a *ch)
+{
+	return (u32) (sg_phys(ch->inst_block.sgt->sgl) >> 12LL);
+}
+
+static inline int gk20a_fecs_trace_num_ts(void)
+{
+	return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
+		- sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
+}
+
+struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
+	struct gk20a_fecs_trace *trace, int idx)
+{
+	return (struct gk20a_fecs_trace_record *)
+		((u8 *) trace->trace_buf.cpu_va
+		+ (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
+}
+
+static bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
+{
+	/*
+	 * testing magic_hi should suffice. magic_lo is sometimes used
+	 * as a sequence number in experimental ucode.
+	 */
+	return (r->magic_hi
+		== ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
+}
+
+static int gk20a_fecs_trace_get_read_index(struct gk20a *g)
+{
+	return gr_gk20a_elpg_protected_call(g,
+			gk20a_readl(g, gr_fecs_mailbox1_r()));
+}
+
+static int gk20a_fecs_trace_get_write_index(struct gk20a *g)
+{
+	return gr_gk20a_elpg_protected_call(g,
+			gk20a_readl(g, gr_fecs_mailbox0_r()));
+}
+
+static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
+{
+	gk20a_dbg(gpu_dbg_ctxsw, "set read=%d", index);
+	return gr_gk20a_elpg_protected_call(g,
+			(gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
+}
+
+void gk20a_fecs_trace_hash_dump(struct gk20a *g)
+{
+	u32 bkt;
+	struct gk20a_fecs_trace_hash_ent *ent;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	gk20a_dbg(gpu_dbg_ctxsw, "dumping hash table");
+
+	mutex_lock(&trace->hash_lock);
+	hash_for_each(trace->pid_hash_table, bkt, ent, node) {
+		gk20a_dbg(gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d",
+			ent, bkt, ent->context_ptr, ent->pid);
+	}
+	mutex_unlock(&trace->hash_lock);
+}
+
+static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr, pid_t pid)
+{
+	struct gk20a_fecs_trace_hash_ent *he;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
+		"adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);
+
+	he = kzalloc(sizeof(*he), GFP_KERNEL);
+	if (unlikely(!he)) {
+		gk20a_warn(dev_from_gk20a(g),
+			"can't alloc new hash entry for context_ptr=%x pid=%d",
+			context_ptr, pid);
+		return -ENOMEM;
+	}
+
+	he->context_ptr = context_ptr;
+	he->pid = pid;
+	mutex_lock(&trace->hash_lock);
+	hash_add(trace->pid_hash_table, &he->node, context_ptr);
+	mutex_unlock(&trace->hash_lock);
+	return 0;
+}
+
+static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
+{
+	struct hlist_node *tmp;
+	struct gk20a_fecs_trace_hash_ent *ent;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
+		"freeing hash entry context_ptr=%x", context_ptr);
+
+	mutex_lock(&trace->hash_lock);
+	hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
+		context_ptr) {
+		if (ent->context_ptr == context_ptr) {
+			hash_del(&ent->node);
+			gk20a_dbg(gpu_dbg_ctxsw,
+				"freed hash entry=%p context_ptr=%x", ent,
+				ent->context_ptr);
+			kfree(ent);
+			break;
+		}
+	}
+	mutex_unlock(&trace->hash_lock);
+}
+
+static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
+{
+	u32 bkt;
+	struct hlist_node *tmp;
+	struct gk20a_fecs_trace_hash_ent *ent;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);
+
+	mutex_lock(&trace->hash_lock);
+	hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
+		hash_del(&ent->node);
+		kfree(ent);
+	}
+	mutex_unlock(&trace->hash_lock);
+}
+
+static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
+{
+	struct gk20a_fecs_trace_hash_ent *ent;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+	pid_t pid = 0;
+
+	mutex_lock(&trace->hash_lock);
+	hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
+		if (ent->context_ptr == context_ptr) {
+			gk20a_dbg(gpu_dbg_ctxsw,
+				"found context_ptr=%x -> pid=%d",
+				ent->context_ptr, ent->pid);
+			pid = ent->pid;
+			break;
+		}
+	}
+	mutex_unlock(&trace->hash_lock);
+
+	return pid;
+}
+
+/*
+ * Converts HW entry format to userspace-facing format and pushes it to the
+ * queue.
+ */
+static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
+{
+	int i;
+	struct nvgpu_ctxsw_trace_entry entry = { };
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+	pid_t cur_pid;
+	pid_t new_pid;
+
+	/* for now, only one VM */
+	const int vmid = 0;
+
+	struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(
+		trace, index);
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
+		"consuming record trace=%p read=%d record=%p", trace, index, r);
+
+	if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
+		gk20a_warn(dev_from_gk20a(g),
+			"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
+			trace, index, r, r->magic_lo, r->magic_hi);
+		return -EINVAL;
+	}
+
+	cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
+	new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
+		"context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
+		r->context_ptr, cur_pid, r->new_context_ptr, new_pid);
+
+	entry.context_id = r->context_id;
+	entry.vmid = vmid;
+
+	/* insert SOF event if needed */
+	if (!(trace->sof_mask & BIT(vmid))) {
+		entry.tag = NVGPU_CTXSW_TAG_SOF;
+		entry.timestamp = trace->sof;
+		entry.context_id = 0;
+		entry.pid = 0;
+
+		gk20a_dbg(gpu_dbg_ctxsw, "SOF time=%llx", entry.timestamp);
+		gk20a_ctxsw_trace_write(g, &entry);
+		trace->sof_mask |= BIT(vmid);
+	}
+
+	/* break out FECS record into trace events */
+	for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
+		entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
+		entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
+		entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
+
+		gk20a_dbg(gpu_dbg_ctxsw,
+			"tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
+			entry.tag, entry.timestamp, r->context_id,
+			r->new_context_id);
+
+		switch (entry.tag) {
+		case NVGPU_CTXSW_TAG_RESTORE_START:
+		case NVGPU_CTXSW_TAG_CONTEXT_START:
+			entry.context_id = r->new_context_id;
+			entry.pid = new_pid;
+			break;
+
+		case NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
+		case NVGPU_CTXSW_TAG_FE_ACK:
+		case NVGPU_CTXSW_TAG_FE_ACK_WFI:
+		case NVGPU_CTXSW_TAG_FE_ACK_GFXP:
+		case NVGPU_CTXSW_TAG_FE_ACK_CTAP:
+		case NVGPU_CTXSW_TAG_FE_ACK_CILP:
+		case NVGPU_CTXSW_TAG_SAVE_END:
+			entry.context_id = r->context_id;
+			entry.pid = cur_pid;
+			break;
+
+		default:
+			/* tags are not guaranteed to start at the beginning */
+			WARN_ON(entry.tag && (entry.tag != NVGPU_CTXSW_TAG_INVALID_TIMESTAMP));
+			continue;
+		}
+
+		gk20a_dbg(gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
+			entry.tag, entry.context_id, entry.pid);
+
+		if (!entry.context_id)
+			continue;
+
+		gk20a_ctxsw_trace_write(g, &entry);
+	}
+
+	gk20a_ctxsw_trace_wake_up(g, vmid);
+	return 0;
+}
+
+static int gk20a_fecs_trace_poll(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	int read = 0;
+	int write = 0;
+	int cnt;
+	int err;
+
+	err = gk20a_busy(g->dev);
+	if (unlikely(err))
+		return err;
+
+	mutex_lock(&trace->poll_lock);
+	write = gk20a_fecs_trace_get_write_index(g);
+	if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
+		gk20a_err(dev_from_gk20a(g),
+			"failed to acquire write index, write=%d", write);
+		err = write;
+		goto done;
+	}
+
+	read = gk20a_fecs_trace_get_read_index(g);
+
+	cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
+	if (!cnt)
+		goto done;
+
+	gk20a_dbg(gpu_dbg_ctxsw,
+		"circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
+		read, gk20a_fecs_trace_get_read_index(g), write, cnt);
+
+	/* we did not send any SOF yet */
+	trace->sof_mask = 0;
+
+	/* consume all records */
+	while (read != write) {
+		gk20a_fecs_trace_ring_read(g, read);
+
+		/* Get to next record. */
+		read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
+		gk20a_fecs_trace_set_read_index(g, read);
+	}
+
+done:
+	/*
+	 * OK, we read out all the entries... a new "frame" starts here.
+	 * We remember the Start Of Frame time and insert it on the next
+	 * iteration.
+	 */
+	trace->sof = gk20a_read_ptimer(g);
+
+	mutex_unlock(&trace->poll_lock);
+	gk20a_idle(g->dev);
+	return err;
+}
+
+static int gk20a_fecs_trace_periodic_polling(void *arg)
+{
+	struct gk20a *g = (struct gk20a *)arg;
+	struct timespec ts = ns_to_timespec(GK20A_FECS_TRACE_FRAME_PERIOD_NS);
+
+	pr_info("%s: running\n", __func__);
+
+	while (!kthread_should_stop()) {
+		hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+		gk20a_fecs_trace_poll(g);
+	}
+
+	return 0;
+}
+
+static int gk20a_fecs_trace_alloc_ring(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	return gk20a_gmmu_alloc(g, GK20A_FECS_TRACE_NUM_RECORDS
+			* ctxsw_prog_record_timestamp_record_size_in_bytes_v(),
+			&trace->trace_buf);
+}
+
+static void gk20a_fecs_trace_free_ring(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	gk20a_gmmu_free(g, &trace->trace_buf);
+}
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * The sequence iterator functions. We simply use the count of the
+ * next line as our internal position.
+ */
+static void *gk20a_fecs_trace_debugfs_ring_seq_start(
+		struct seq_file *s, loff_t *pos)
+{
+	if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
+		return NULL;
+
+	return pos;
+}
+
+static void *gk20a_fecs_trace_debugfs_ring_seq_next(
+		struct seq_file *s, void *v, loff_t *pos)
+{
+	++(*pos);
+	if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
+		return NULL;
+	return pos;
+}
+
+static void gk20a_fecs_trace_debugfs_ring_seq_stop(
+		struct seq_file *s, void *v)
+{
+}
+
+static int gk20a_fecs_trace_debugfs_ring_seq_show(
+		struct seq_file *s, void *v)
+{
+	loff_t *pos = (loff_t *) v;
+	struct gk20a *g = *(struct gk20a **)s->private;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+	struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(trace, *pos);
+	int i;
+	const u32 invalid_tag =
+		ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
+	u32 tag;
+	u64 timestamp;
+
+	seq_printf(s, "record #%lld (%p)\n", *pos, r);
+	seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
+	seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
+	if (gk20a_fecs_trace_is_valid_record(r)) {
+		seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
+		seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
+		seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
+		seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
+		for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
+			tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
+			if (tag == invalid_tag)
+				continue;
+			timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
+			timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
+			seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag, timestamp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Tie them all together into a set of seq_operations.
+ */
+const struct seq_operations gk20a_fecs_trace_debugfs_ring_seq_ops = {
+	.start = gk20a_fecs_trace_debugfs_ring_seq_start,
+	.next = gk20a_fecs_trace_debugfs_ring_seq_next,
+	.stop = gk20a_fecs_trace_debugfs_ring_seq_stop,
+	.show = gk20a_fecs_trace_debugfs_ring_seq_show
+};
+
+/*
+ * Time to set up the file operations for our debugfs file. In this case,
+ * all we need is an open function which sets up the sequence ops.
+ */
+static int gk20a_ctxsw_debugfs_ring_open(struct inode *inode,
+		struct file *file)
+{
+	struct gk20a **p;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	p = __seq_open_private(file, &gk20a_fecs_trace_debugfs_ring_seq_ops,
+		sizeof(struct gk20a *));
+	if (!p)
+		return -ENOMEM;
+
+	*p = (struct gk20a *)inode->i_private;
+	return 0;
+}
+
+/*
+ * The file operations structure contains our open function along with
+ * the set of canned seq_ ops.
+ */
+const struct file_operations gk20a_fecs_trace_debugfs_ring_fops = {
+	.owner = THIS_MODULE,
+	.open = gk20a_ctxsw_debugfs_ring_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_private
+};
+
+static int gk20a_fecs_trace_debugfs_read(void *arg, u64 *val)
+{
+	*val = gk20a_fecs_trace_get_read_index((struct gk20a *)arg);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_read_fops,
+		gk20a_fecs_trace_debugfs_read, NULL, "%llu\n");
+
+static int gk20a_fecs_trace_debugfs_write(void *arg, u64 *val)
+{
+	*val = gk20a_fecs_trace_get_write_index((struct gk20a *)arg);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_write_fops,
+		gk20a_fecs_trace_debugfs_write, NULL, "%llu\n");
+
+static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *plat = platform_get_drvdata(g->dev);
+
+	debugfs_create_file("ctxsw_trace_read", 0600, plat->debugfs, g,
+		&gk20a_fecs_trace_debugfs_read_fops);
+	debugfs_create_file("ctxsw_trace_write", 0600, plat->debugfs, g,
+		&gk20a_fecs_trace_debugfs_write_fops);
+	debugfs_create_file("ctxsw_trace_ring", 0600, plat->debugfs, g,
+		&gk20a_fecs_trace_debugfs_ring_fops);
+}
+
+static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
+{
+	struct gk20a_platform *plat = platform_get_drvdata(g->dev);
+
+	debugfs_remove_recursive(plat->debugfs);
+}
+
+#else
+
+static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
+{
+}
+
+static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+static int gk20a_fecs_trace_init(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace;
+	int err;
+
+	trace = kzalloc(sizeof(struct gk20a_fecs_trace), GFP_KERNEL);
+	if (!trace) {
+		gk20a_warn(dev_from_gk20a(g), "failed to allocate fecs_trace");
+		return -ENOMEM;
+	}
+	g->fecs_trace = trace;
+
+	BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
+	err = gk20a_fecs_trace_alloc_ring(g);
+	if (err) {
+		gk20a_warn(dev_from_gk20a(g), "failed to allocate FECS ring");
+		goto clean;
+	}
+
+	mutex_init(&trace->poll_lock);
+	mutex_init(&trace->hash_lock);
+	hash_init(trace->pid_hash_table);
+
+	gk20a_fecs_trace_debugfs_init(g);
+	return 0;
+
+clean:
+	kfree(trace);
+	g->fecs_trace = NULL;
+	return err;
+}
+
+static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
+		struct channel_gk20a *ch)
+{
+	/*
+	 * map our circ_buf to the context space and store the GPU VA
+	 * in the context header.
+	 */
+
+	u32 lo;
+	u32 hi;
+	phys_addr_t pa;
+	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+	void *ctx_ptr;
+	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
+		"hw_chid=%d context_ptr=%x inst_block=%llx",
+		ch->hw_chid, context_ptr, gk20a_mem_phys(&ch->inst_block));
+
+	if (!trace)
+		return -ENOMEM;
+
+	pa = gk20a_mem_phys(&trace->trace_buf);
+	if (!pa)
+		return -ENOMEM;
+
+	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
+		PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
+		pgprot_writecombine(PAGE_KERNEL));
+	if (!ctx_ptr)
+		return -ENOMEM;
+
+	lo = u64_lo32(pa);
+	hi = u64_hi32(pa);
+
+	gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
+		lo, GK20A_FECS_TRACE_NUM_RECORDS);
+
+	gk20a_mem_wr32(ctx_ptr
+		+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
+		0, lo);
+	gk20a_mem_wr32(ctx_ptr
+		+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
+		0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
+	gk20a_mem_wr32(ctx_ptr
+		+ ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
+		0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
+			GK20A_FECS_TRACE_NUM_RECORDS));
+
+	vunmap(ctx_ptr);
+	gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);
+
+	return 0;
+}
+
+static int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
+{
+	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
+		"ch=%p context_ptr=%x", ch, context_ptr);
+
+	if (g->ops.fecs_trace.flush)
+		g->ops.fecs_trace.flush(g);
+	gk20a_fecs_trace_poll(g);
+	gk20a_fecs_trace_hash_del(g, context_ptr);
+	return 0;
+}
+
+static int gk20a_fecs_trace_reset(struct gk20a *g)
+{
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
+
+	if (g->ops.fecs_trace.flush)
+		g->ops.fecs_trace.flush(g);
+	gk20a_fecs_trace_poll(g);
+	return gk20a_fecs_trace_set_read_index(g, 0);
+}
+
+static int gk20a_fecs_trace_deinit(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	gk20a_fecs_trace_debugfs_cleanup(g);
+	kthread_stop(trace->poll_task);
+	gk20a_fecs_trace_free_ring(g);
+	gk20a_fecs_trace_free_hash_table(g);
+
+	kfree(g->fecs_trace);
+	g->fecs_trace = NULL;
+	return 0;
+}
+
+static int gk20a_gr_max_entries(struct gk20a *g,
+		struct nvgpu_ctxsw_trace_filter *filter)
+{
+	int n;
+	int tag;
+
+	/* Compute number of entries per record, with given filter */
+	for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
+		n += (NVGPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
+
+	/* Return max number of entries generated for the whole ring */
+	return n * GK20A_FECS_TRACE_NUM_RECORDS;
+}
+
+static int gk20a_fecs_trace_enable(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+	struct task_struct *task;
+
+	if (!trace->poll_task) {
+		task = kthread_run(gk20a_fecs_trace_periodic_polling, g, __func__);
+		if (unlikely(IS_ERR(task))) {
+			gk20a_warn(dev_from_gk20a(g), "failed to create FECS polling task");
+			return PTR_ERR(task);
+		}
+		trace->poll_task = task;
+	}
+
+	return 0;
+}
+
+static int gk20a_fecs_trace_disable(struct gk20a *g)
+{
+	struct gk20a_fecs_trace *trace = g->fecs_trace;
+
+	if (trace->poll_task) {
+		kthread_stop(trace->poll_task);
+		trace->poll_task = NULL;
+	}
+
+	return -EPERM;
+}
+
+void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
+{
+	ops->fecs_trace.init = gk20a_fecs_trace_init;
+	ops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
+	ops->fecs_trace.enable = gk20a_fecs_trace_enable;
+	ops->fecs_trace.disable = gk20a_fecs_trace_disable;
+	ops->fecs_trace.reset = gk20a_fecs_trace_reset;
+	ops->fecs_trace.flush = NULL;
+	ops->fecs_trace.poll = gk20a_fecs_trace_poll;
+	ops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
+	ops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
+	ops->fecs_trace.max_entries = gk20a_gr_max_entries;
+}
+#else
+void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
+{
+}
+#endif /* CONFIG_GK20A_CTXSW_TRACE */
--