summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
diff options
context:
space:
mode:
authorVaibhav Kachore <vkachore@nvidia.com>2018-02-22 06:15:30 -0500
committerTejal Kudav <tkudav@nvidia.com>2018-06-14 09:44:08 -0400
commitca3215c6b23c7d855ced899d8090aaa8ce9a9fa3 (patch)
tree710114451d4838f82a9e9998db52b81cf76d68c9 /drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
parent97d697a8481ca0c348102f04165903e3205302ed (diff)
gpu: nvgpu: add support for FECS VA
- On t186, ucode expects a physical address to be programmed for the FECS trace buffer. - On t194, ucode expects a GPU VA to be programmed for the FECS trace buffer. This patch adds extra support to handle this change for Linux native. - Increase the size of the FECS trace buffer (a few entries were getting dropped due to overflow of the FECS trace buffer.) - This moves FECS trace buffer handling into the global context buffer. - This adds an extra check for the update of the mailbox1 register. (Bug 200417403) EVLR-2077 Change-Id: I7c3324ce9341976a1375e0afe6c53c424a053723 Signed-off-by: Vaibhav Kachore <vkachore@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1536028 Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Nirav Patel <nipatel@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c')
-rw-r--r--drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c102
1 files changed, 54 insertions, 48 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
index c9d7ea06..117920da 100644
--- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -28,6 +28,7 @@
28 28
29#include <nvgpu/kmem.h> 29#include <nvgpu/kmem.h>
30#include <nvgpu/dma.h> 30#include <nvgpu/dma.h>
31#include <nvgpu/enabled.h>
31#include <nvgpu/bug.h> 32#include <nvgpu/bug.h>
32#include <nvgpu/hashtable.h> 33#include <nvgpu/hashtable.h>
33#include <nvgpu/circ_buf.h> 34#include <nvgpu/circ_buf.h>
@@ -51,7 +52,7 @@
51 * If HW circular buffer is getting too many "buffer full" conditions, 52 * If HW circular buffer is getting too many "buffer full" conditions,
52 * increasing this constant should help (it drives Linux' internal buffer size). 53 * increasing this constant should help (it drives Linux' internal buffer size).
53 */ 54 */
54#define GK20A_FECS_TRACE_NUM_RECORDS (1 << 6) 55#define GK20A_FECS_TRACE_NUM_RECORDS (1 << 10)
55#define GK20A_FECS_TRACE_HASH_BITS 8 /* 2^8 */ 56#define GK20A_FECS_TRACE_HASH_BITS 8 /* 2^8 */
56#define GK20A_FECS_TRACE_FRAME_PERIOD_US (1000000ULL/60ULL) 57#define GK20A_FECS_TRACE_FRAME_PERIOD_US (1000000ULL/60ULL)
57#define GK20A_FECS_TRACE_PTIMER_SHIFT 5 58#define GK20A_FECS_TRACE_PTIMER_SHIFT 5
@@ -74,7 +75,6 @@ struct gk20a_fecs_trace_hash_ent {
74 75
75struct gk20a_fecs_trace { 76struct gk20a_fecs_trace {
76 77
77 struct nvgpu_mem trace_buf;
78 DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS); 78 DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
79 struct nvgpu_mutex hash_lock; 79 struct nvgpu_mutex hash_lock;
80 struct nvgpu_mutex poll_lock; 80 struct nvgpu_mutex poll_lock;
@@ -106,10 +106,12 @@ static inline int gk20a_fecs_trace_num_ts(void)
106} 106}
107 107
108static struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record( 108static struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
109 struct gk20a_fecs_trace *trace, int idx) 109 struct gk20a *g, int idx)
110{ 110{
111 struct nvgpu_mem *mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;
112
111 return (struct gk20a_fecs_trace_record *) 113 return (struct gk20a_fecs_trace_record *)
112 ((u8 *) trace->trace_buf.cpu_va 114 ((u8 *) mem->cpu_va
113 + (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v())); 115 + (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
114} 116}
115 117
@@ -258,12 +260,13 @@ static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
258 struct gk20a_fecs_trace *trace = g->fecs_trace; 260 struct gk20a_fecs_trace *trace = g->fecs_trace;
259 pid_t cur_pid; 261 pid_t cur_pid;
260 pid_t new_pid; 262 pid_t new_pid;
263 int count = 0;
261 264
262 /* for now, only one VM */ 265 /* for now, only one VM */
263 const int vmid = 0; 266 const int vmid = 0;
264 267
265 struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record( 268 struct gk20a_fecs_trace_record *r =
266 trace, index); 269 gk20a_fecs_trace_get_record(g, index);
267 270
268 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw, 271 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
269 "consuming record trace=%p read=%d record=%p", trace, index, r); 272 "consuming record trace=%p read=%d record=%p", trace, index, r);
@@ -334,10 +337,11 @@ static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
334 continue; 337 continue;
335 338
336 gk20a_ctxsw_trace_write(g, &entry); 339 gk20a_ctxsw_trace_write(g, &entry);
340 count++;
337 } 341 }
338 342
339 gk20a_ctxsw_trace_wake_up(g, vmid); 343 gk20a_ctxsw_trace_wake_up(g, vmid);
340 return 0; 344 return count;
341} 345}
342 346
343int gk20a_fecs_trace_poll(struct gk20a *g) 347int gk20a_fecs_trace_poll(struct gk20a *g)
@@ -376,15 +380,16 @@ int gk20a_fecs_trace_poll(struct gk20a *g)
376 g->ops.mm.fb_flush(g); 380 g->ops.mm.fb_flush(g);
377 381
378 while (read != write) { 382 while (read != write) {
379 /* Ignore error code, as we want to consume all records */ 383 cnt = gk20a_fecs_trace_ring_read(g, read);
380 (void)gk20a_fecs_trace_ring_read(g, read); 384 if (cnt <= 0)
385 break;
381 386
382 /* Get to next record. */ 387 /* Get to next record. */
383 read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1); 388 read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
384 } 389 }
385 390
386 /* ensure FECS records has been updated before incrementing read index */ 391 /* ensure FECS records has been updated before incrementing read index */
387 nvgpu_smp_wmb(); 392 nvgpu_wmb();
388 gk20a_fecs_trace_set_read_index(g, read); 393 gk20a_fecs_trace_set_read_index(g, read);
389 394
390done: 395done:
@@ -411,20 +416,10 @@ static int gk20a_fecs_trace_periodic_polling(void *arg)
411 return 0; 416 return 0;
412} 417}
413 418
414static int gk20a_fecs_trace_alloc_ring(struct gk20a *g) 419size_t gk20a_fecs_trace_buffer_size(struct gk20a *g)
415{ 420{
416 struct gk20a_fecs_trace *trace = g->fecs_trace; 421 return GK20A_FECS_TRACE_NUM_RECORDS
417 422 * ctxsw_prog_record_timestamp_record_size_in_bytes_v();
418 return nvgpu_dma_alloc_sys(g, GK20A_FECS_TRACE_NUM_RECORDS
419 * ctxsw_prog_record_timestamp_record_size_in_bytes_v(),
420 &trace->trace_buf);
421}
422
423static void gk20a_fecs_trace_free_ring(struct gk20a *g)
424{
425 struct gk20a_fecs_trace *trace = g->fecs_trace;
426
427 nvgpu_dma_free(g, &trace->trace_buf);
428} 423}
429 424
430#ifdef CONFIG_DEBUG_FS 425#ifdef CONFIG_DEBUG_FS
@@ -460,8 +455,8 @@ static int gk20a_fecs_trace_debugfs_ring_seq_show(
460{ 455{
461 loff_t *pos = (loff_t *) v; 456 loff_t *pos = (loff_t *) v;
462 struct gk20a *g = *(struct gk20a **)s->private; 457 struct gk20a *g = *(struct gk20a **)s->private;
463 struct gk20a_fecs_trace *trace = g->fecs_trace; 458 struct gk20a_fecs_trace_record *r =
464 struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(trace, *pos); 459 gk20a_fecs_trace_get_record(g, *pos);
465 int i; 460 int i;
466 const u32 invalid_tag = 461 const u32 invalid_tag =
467 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v(); 462 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
@@ -588,12 +583,6 @@ int gk20a_fecs_trace_init(struct gk20a *g)
588 goto clean_poll_lock; 583 goto clean_poll_lock;
589 584
590 BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS)); 585 BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
591 err = gk20a_fecs_trace_alloc_ring(g);
592 if (err) {
593 nvgpu_warn(g, "failed to allocate FECS ring");
594 goto clean_hash_lock;
595 }
596
597 hash_init(trace->pid_hash_table); 586 hash_init(trace->pid_hash_table);
598 587
599 __nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true); 588 __nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true);
@@ -604,8 +593,6 @@ int gk20a_fecs_trace_init(struct gk20a *g)
604 593
605 return 0; 594 return 0;
606 595
607clean_hash_lock:
608 nvgpu_mutex_destroy(&trace->hash_lock);
609clean_poll_lock: 596clean_poll_lock:
610 nvgpu_mutex_destroy(&trace->poll_lock); 597 nvgpu_mutex_destroy(&trace->poll_lock);
611clean: 598clean:
@@ -624,14 +611,14 @@ int gk20a_fecs_trace_bind_channel(struct gk20a *g,
624 611
625 u32 lo; 612 u32 lo;
626 u32 hi; 613 u32 hi;
627 u64 pa; 614 u64 addr;
628 struct tsg_gk20a *tsg; 615 struct tsg_gk20a *tsg;
629 struct nvgpu_gr_ctx *ch_ctx; 616 struct nvgpu_gr_ctx *ch_ctx;
630 struct gk20a_fecs_trace *trace = g->fecs_trace; 617 struct gk20a_fecs_trace *trace = g->fecs_trace;
631 struct nvgpu_mem *mem; 618 struct nvgpu_mem *mem;
632 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch); 619 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);
633 pid_t pid; 620 pid_t pid;
634 u32 aperture; 621 u32 aperture_mask;
635 622
636 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, 623 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
637 "chid=%d context_ptr=%x inst_block=%llx", 624 "chid=%d context_ptr=%x inst_block=%llx",
@@ -648,34 +635,54 @@ int gk20a_fecs_trace_bind_channel(struct gk20a *g,
648 if (!trace) 635 if (!trace)
649 return -ENOMEM; 636 return -ENOMEM;
650 637
651 pa = nvgpu_inst_block_addr(g, &trace->trace_buf); 638 mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;
652 if (!pa) 639
653 return -ENOMEM; 640 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
654 aperture = nvgpu_aperture_mask(g, &trace->trace_buf, 641 addr = ch_ctx->global_ctx_buffer_va[FECS_TRACE_BUFFER_VA];
642 nvgpu_log(g, gpu_dbg_ctxsw, "gpu_va=%llx", addr);
643 aperture_mask = 0;
644 } else {
645 addr = nvgpu_inst_block_addr(g, mem);
646 nvgpu_log(g, gpu_dbg_ctxsw, "pa=%llx", addr);
647 aperture_mask = nvgpu_aperture_mask(g, mem,
655 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(), 648 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(),
656 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(), 649 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(),
657 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f()); 650 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f());
651 }
652 if (!addr)
653 return -ENOMEM;
654
655 lo = u64_lo32(addr);
656 hi = u64_hi32(addr);
657
658 mem = &ch_ctx->mem;
658 659
659 if (nvgpu_mem_begin(g, mem)) 660 if (nvgpu_mem_begin(g, mem))
660 return -ENOMEM; 661 return -ENOMEM;
661 662
662 lo = u64_lo32(pa);
663 hi = u64_hi32(pa);
664
665 nvgpu_log(g, gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi, 663 nvgpu_log(g, gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
666 lo, GK20A_FECS_TRACE_NUM_RECORDS); 664 lo, GK20A_FECS_TRACE_NUM_RECORDS);
667 665
668 nvgpu_mem_wr(g, mem, 666 nvgpu_mem_wr(g, mem,
667 ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
668 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
669 GK20A_FECS_TRACE_NUM_RECORDS));
670
671 nvgpu_mem_end(g, mem);
672
673 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA))
674 mem = &ch->ctx_header.mem;
675
676 if (nvgpu_mem_begin(g, mem))
677 return -ENOMEM;
678
679 nvgpu_mem_wr(g, mem,
669 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), 680 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
670 lo); 681 lo);
671 nvgpu_mem_wr(g, mem, 682 nvgpu_mem_wr(g, mem,
672 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), 683 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
673 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi) | 684 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi) |
674 aperture); 685 aperture_mask);
675 nvgpu_mem_wr(g, mem,
676 ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
677 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
678 GK20A_FECS_TRACE_NUM_RECORDS));
679 686
680 nvgpu_mem_end(g, mem); 687 nvgpu_mem_end(g, mem);
681 688
@@ -728,7 +735,6 @@ int gk20a_fecs_trace_deinit(struct gk20a *g)
728 return 0; 735 return 0;
729 736
730 nvgpu_thread_stop(&trace->poll_task); 737 nvgpu_thread_stop(&trace->poll_task);
731 gk20a_fecs_trace_free_ring(g);
732 gk20a_fecs_trace_free_hash_table(g); 738 gk20a_fecs_trace_free_hash_table(g);
733 739
734 nvgpu_mutex_destroy(&g->fecs_trace->hash_lock); 740 nvgpu_mutex_destroy(&g->fecs_trace->hash_lock);