/*
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/kmem.h>
#include <nvgpu/dma.h>
#include <nvgpu/enabled.h>
#include <nvgpu/bug.h>
#include <nvgpu/hashtable.h>
#include <nvgpu/circ_buf.h>
#include <nvgpu/thread.h>
#include <nvgpu/barrier.h>
#include <nvgpu/mm.h>
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/utils.h>
#include <nvgpu/timers.h>
#include <nvgpu/channel.h>

#include "fecs_trace_gk20a.h"
#include "gk20a.h"
#include "gr_gk20a.h"

#include <nvgpu/ctxsw_trace.h>

#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>

struct gk20a_fecs_trace_hash_ent {
	u32 context_ptr;
	pid_t pid;
	struct hlist_node node;
};

struct gk20a_fecs_trace {
	DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
	struct nvgpu_mutex hash_lock;
	struct nvgpu_mutex poll_lock;
	struct nvgpu_thread poll_task;
	bool init;
	struct nvgpu_mutex enable_lock;
	u32 enable_count;
};

#ifdef CONFIG_GK20A_CTXSW_TRACE
u32 gk20a_fecs_trace_record_ts_tag_invalid_ts_v(void)
{
	return ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
}

u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
{
	return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
}

u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
{
	return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
}

static u32 gk20a_fecs_trace_fecs_context_ptr(struct gk20a *g,
		struct channel_gk20a *ch)
{
	return (u32) (nvgpu_inst_block_addr(g, &ch->inst_block) >> 12LL);
}

int gk20a_fecs_trace_num_ts(void)
{
	return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
		- sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
}

struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
	struct gk20a *g, int idx)
{
	struct nvgpu_mem *mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;

	return (struct gk20a_fecs_trace_record *)
		((u8 *) mem->cpu_va
		+ (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
}

bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
{
	/*
	 * Testing magic_hi should suffice. magic_lo is sometimes used
	 * as a sequence number in experimental ucode.
	 */
	return (r->magic_hi
		== ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
}
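
/*
 * The FECS ucode exports the trace ring-buffer pointers through two mailbox
 * registers: MAILBOX0 carries the write index (only ever read by the driver
 * below) and MAILBOX1 carries the read index, which the driver advances after
 * draining records. Accesses go through gr_gk20a_elpg_protected_call() so
 * that engine-level power gating does not interfere with the register access.
 */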

int gk20a_fecs_trace_get_read_index(struct gk20a *g)
{
	return gr_gk20a_elpg_protected_call(g,
			gk20a_readl(g, gr_fecs_mailbox1_r()));
}

int gk20a_fecs_trace_get_write_index(struct gk20a *g)
{
	return gr_gk20a_elpg_protected_call(g,
			gk20a_readl(g, gr_fecs_mailbox0_r()));
}

static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
{
	nvgpu_log(g, gpu_dbg_ctxsw, "set read=%d", index);
	return gr_gk20a_elpg_protected_call(g,
			(gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
}

void gk20a_fecs_trace_hash_dump(struct gk20a *g)
{
	u32 bkt;
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	nvgpu_log(g, gpu_dbg_ctxsw, "dumping hash table");

	nvgpu_mutex_acquire(&trace->hash_lock);
	hash_for_each(trace->pid_hash_table, bkt, ent, node) {
		nvgpu_log(g, gpu_dbg_ctxsw,
			" ent=%p bkt=%x context_ptr=%x pid=%d",
			ent, bkt, ent->context_ptr, ent->pid);
	}
	nvgpu_mutex_release(&trace->hash_lock);
}

static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr,
		pid_t pid)
{
	struct gk20a_fecs_trace_hash_ent *he;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
		"adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);

	he = nvgpu_kzalloc(g, sizeof(*he));
	if (unlikely(!he)) {
		nvgpu_warn(g,
			"can't alloc new hash entry for context_ptr=%x pid=%d",
			context_ptr, pid);
		return -ENOMEM;
	}

	he->context_ptr = context_ptr;
	he->pid = pid;
	nvgpu_mutex_acquire(&trace->hash_lock);
	hash_add(trace->pid_hash_table, &he->node, context_ptr);
	nvgpu_mutex_release(&trace->hash_lock);
	return 0;
}

static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
{
	struct hlist_node *tmp;
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
		"freeing hash entry context_ptr=%x", context_ptr);

	nvgpu_mutex_acquire(&trace->hash_lock);
	hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
			context_ptr) {
		if (ent->context_ptr == context_ptr) {
			hash_del(&ent->node);
			nvgpu_log(g, gpu_dbg_ctxsw,
				"freed hash entry=%p context_ptr=%x",
				ent, ent->context_ptr);
			nvgpu_kfree(g, ent);
			break;
		}
	}
	nvgpu_mutex_release(&trace->hash_lock);
}

static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
{
	u32 bkt;
	struct hlist_node *tmp;
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);

	nvgpu_mutex_acquire(&trace->hash_lock);
	hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
		hash_del(&ent->node);
		nvgpu_kfree(g, ent);
	}
	nvgpu_mutex_release(&trace->hash_lock);
}

static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
{
	struct gk20a_fecs_trace_hash_ent *ent;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	pid_t pid = 0;

	nvgpu_mutex_acquire(&trace->hash_lock);
	hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
		if (ent->context_ptr == context_ptr) {
			nvgpu_log(g, gpu_dbg_ctxsw,
				"found context_ptr=%x -> pid=%d",
				ent->context_ptr, ent->pid);
			pid = ent->pid;
			break;
		}
	}
	nvgpu_mutex_release(&trace->hash_lock);

	return pid;
}
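
/*
 * FECS records identify contexts only by their instance block pointer
 * (inst_block address >> 12). The hash table above maps that value back to
 * the owning process: entries are added with the TSG's tgid when a channel
 * is bound (gk20a_fecs_trace_bind_channel) and removed on unbind; a lookup
 * miss yields pid 0.
 */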

/*
 * Converts the HW entry format to the userspace-facing format and pushes it
 * to the queue.
 */
static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
{
	int i;
	struct nvgpu_gpu_ctxsw_trace_entry entry = { };
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	pid_t cur_pid;
	pid_t new_pid;
	int count = 0;

	/* for now, only one VM */
	const int vmid = 0;

	struct gk20a_fecs_trace_record *r =
		gk20a_fecs_trace_get_record(g, index);

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
		"consuming record trace=%p read=%d record=%p", trace, index, r);

	if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
		nvgpu_warn(g,
			"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
			trace, index, r, r->magic_lo, r->magic_hi);
		return -EINVAL;
	}

	/*
	 * Clear magic_hi to detect cases where the CPU could read the write
	 * index before the FECS record is actually written to DRAM. This
	 * should not happen, as we force FECS writes to SYSMEM by reading
	 * through PRAMIN.
	 */
	r->magic_hi = 0;

	cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
	new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
		"context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
		r->context_ptr, cur_pid, r->new_context_ptr, new_pid);

	entry.context_id = r->context_id;
	entry.vmid = vmid;

	/* break out FECS record into trace events */
	for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {

		entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
		entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
		entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;

		nvgpu_log(g, gpu_dbg_ctxsw,
			"tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
			entry.tag, entry.timestamp, r->context_id,
			r->new_context_id);

		switch (nvgpu_gpu_ctxsw_tags_to_common_tags(entry.tag)) {
		case NVGPU_GPU_CTXSW_TAG_RESTORE_START:
		case NVGPU_GPU_CTXSW_TAG_CONTEXT_START:
			entry.context_id = r->new_context_id;
			entry.pid = new_pid;
			break;

		case NVGPU_GPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
		case NVGPU_GPU_CTXSW_TAG_FE_ACK:
		case NVGPU_GPU_CTXSW_TAG_FE_ACK_WFI:
		case NVGPU_GPU_CTXSW_TAG_FE_ACK_GFXP:
		case NVGPU_GPU_CTXSW_TAG_FE_ACK_CTAP:
		case NVGPU_GPU_CTXSW_TAG_FE_ACK_CILP:
		case NVGPU_GPU_CTXSW_TAG_SAVE_END:
			entry.context_id = r->context_id;
			entry.pid = cur_pid;
			break;

		default:
			/* tags are not guaranteed to start at the beginning */
			WARN_ON(entry.tag && (entry.tag !=
				NVGPU_GPU_CTXSW_TAG_INVALID_TIMESTAMP));
			continue;
		}

		nvgpu_log(g, gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
			entry.tag, entry.context_id, entry.pid);

		if (!entry.context_id)
			continue;

		gk20a_ctxsw_trace_write(g, &entry);
		count++;
	}

	gk20a_ctxsw_trace_wake_up(g, vmid);
	return count;
}

int gk20a_fecs_trace_poll(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	int read = 0;
	int write = 0;
	int cnt;
	int err;

	err = gk20a_busy(g);
	if (unlikely(err))
		return err;

	nvgpu_mutex_acquire(&trace->poll_lock);
	write = gk20a_fecs_trace_get_write_index(g);
	if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
		nvgpu_err(g, "failed to acquire write index, write=%d", write);
		err = write;
		goto done;
	}

	read = gk20a_fecs_trace_get_read_index(g);

	cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
	if (!cnt)
		goto done;

	nvgpu_log(g, gpu_dbg_ctxsw,
		"circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
		read, gk20a_fecs_trace_get_read_index(g), write, cnt);

	/* Ensure all FECS writes have made it to SYSMEM */
	g->ops.mm.fb_flush(g);

	if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
		/* Bits 30:0 of MAILBOX1 represent the actual read pointer value */
		read = read & (~(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT)));
	}
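
	/*
	 * Drain every completed record between the read and write pointers.
	 * Each HW record can expand into several trace entries (one per
	 * valid timestamp), so the count logged below may exceed the number
	 * of records consumed.
	 */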

	while (read != write) {
		cnt = gk20a_fecs_trace_ring_read(g, read);
		if (cnt > 0) {
			nvgpu_log(g, gpu_dbg_ctxsw,
				"number of trace entries added: %d", cnt);
		}

		/* Get to next record. */
		read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
	}

	if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
		/*
		 * In the next step, the read pointer is going to be updated.
		 * So, the MSB of the read pointer should be set back to 1.
		 * This will keep FECS trace enabled.
		 */
		read = read | (BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT));
	}

	/* Ensure FECS records have been updated before incrementing read index */
	nvgpu_wmb();
	gk20a_fecs_trace_set_read_index(g, read);

done:
	nvgpu_mutex_release(&trace->poll_lock);
	gk20a_idle(g);
	return err;
}

static int gk20a_fecs_trace_periodic_polling(void *arg)
{
	struct gk20a *g = (struct gk20a *)arg;
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	pr_info("%s: running\n", __func__);

	while (!nvgpu_thread_should_stop(&trace->poll_task)) {

		nvgpu_usleep_range(GK20A_FECS_TRACE_FRAME_PERIOD_US,
				   GK20A_FECS_TRACE_FRAME_PERIOD_US * 2);

		gk20a_fecs_trace_poll(g);
	}

	return 0;
}

size_t gk20a_fecs_trace_buffer_size(struct gk20a *g)
{
	return GK20A_FECS_TRACE_NUM_RECORDS
			* ctxsw_prog_record_timestamp_record_size_in_bytes_v();
}

int gk20a_fecs_trace_init(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace;
	int err;

	trace = nvgpu_kzalloc(g, sizeof(struct gk20a_fecs_trace));
	if (!trace) {
		nvgpu_warn(g, "failed to allocate fecs_trace");
		return -ENOMEM;
	}
	g->fecs_trace = trace;

	err = nvgpu_mutex_init(&trace->poll_lock);
	if (err)
		goto clean;
	err = nvgpu_mutex_init(&trace->hash_lock);
	if (err)
		goto clean_poll_lock;
	err = nvgpu_mutex_init(&trace->enable_lock);
	if (err)
		goto clean_hash_lock;

	BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
	hash_init(trace->pid_hash_table);

	__nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true);

	trace->enable_count = 0;
	trace->init = true;

	return 0;

clean_hash_lock:
	nvgpu_mutex_destroy(&trace->hash_lock);
clean_poll_lock:
	nvgpu_mutex_destroy(&trace->poll_lock);
clean:
	nvgpu_kfree(g, trace);
	g->fecs_trace = NULL;
	return err;
}
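
/*
 * The trace buffer itself is the global FECS_TRACE_BUFFER context buffer,
 * shared by all channels. Binding a channel patches the buffer address,
 * aperture and record count into the channel's context image (the pointer
 * goes to the ctx header instead when NVGPU_FECS_TRACE_VA is set), and
 * records the owning tgid so trace records can later be attributed to a
 * process.
 */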

int gk20a_fecs_trace_bind_channel(struct gk20a *g, struct channel_gk20a *ch)
{
	/*
	 * Map our circ_buf to the context space and store the GPU VA
	 * in the context header.
	 */
	u32 lo;
	u32 hi;
	u64 addr;
	struct tsg_gk20a *tsg;
	struct nvgpu_gr_ctx *ch_ctx;
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	struct nvgpu_mem *mem;
	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);
	u32 aperture_mask;

	tsg = tsg_gk20a_from_ch(ch);
	if (tsg == NULL) {
		nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
		return -EINVAL;
	}

	nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
			"chid=%d context_ptr=%x inst_block=%llx",
			ch->chid, context_ptr,
			nvgpu_inst_block_addr(g, &ch->inst_block));

	ch_ctx = &tsg->gr_ctx;

	if (!trace)
		return -ENOMEM;

	mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;

	if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
		addr = ch_ctx->global_ctx_buffer_va[FECS_TRACE_BUFFER_VA];
		nvgpu_log(g, gpu_dbg_ctxsw, "gpu_va=%llx", addr);
		aperture_mask = 0;
	} else {
		addr = nvgpu_inst_block_addr(g, mem);
		nvgpu_log(g, gpu_dbg_ctxsw, "pa=%llx", addr);
		aperture_mask = nvgpu_aperture_mask(g, mem,
			ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(),
			ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(),
			ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f());
	}
	if (!addr)
		return -ENOMEM;

	lo = u64_lo32(addr);
	hi = u64_hi32(addr);

	mem = &ch_ctx->mem;

	nvgpu_log(g, gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
		lo, GK20A_FECS_TRACE_NUM_RECORDS);

	nvgpu_mem_wr(g, mem,
		ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
		ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
			GK20A_FECS_TRACE_NUM_RECORDS));

	if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA))
		mem = &ch->ctx_header;

	nvgpu_mem_wr(g, mem,
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
		lo);
	nvgpu_mem_wr(g, mem,
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi) |
		aperture_mask);

	/*
	 * pid (process identifier) in user space corresponds to tgid (thread
	 * group id) in kernel space.
	 */
	gk20a_fecs_trace_hash_add(g, context_ptr, tsg->tgid);

	return 0;
}

int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
{
	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);

	if (g->fecs_trace) {
		nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
			"ch=%p context_ptr=%x", ch, context_ptr);

		if (g->ops.fecs_trace.is_enabled(g)) {
			if (g->ops.fecs_trace.flush)
				g->ops.fecs_trace.flush(g);
			gk20a_fecs_trace_poll(g);
		}
		gk20a_fecs_trace_hash_del(g, context_ptr);
	}
	return 0;
}

int gk20a_fecs_trace_reset(struct gk20a *g)
{
	nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, " ");

	if (!g->ops.fecs_trace.is_enabled(g))
		return 0;

	gk20a_fecs_trace_poll(g);
	return gk20a_fecs_trace_set_read_index(g, 0);
}
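
/*
 * Driver-teardown counterpart of gk20a_fecs_trace_init(). The poll thread is
 * created on the first enable, so it only needs to be stopped here while
 * enable_count is still non-zero.
 */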

int gk20a_fecs_trace_deinit(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	if (!trace->init)
		return 0;

	/*
	 * Check if the tracer was enabled before attempting to stop the
	 * tracer thread.
	 */
	if (trace->enable_count > 0) {
		nvgpu_thread_stop(&trace->poll_task);
	}
	gk20a_fecs_trace_free_hash_table(g);

	nvgpu_mutex_destroy(&g->fecs_trace->hash_lock);
	nvgpu_mutex_destroy(&g->fecs_trace->poll_lock);
	nvgpu_mutex_destroy(&g->fecs_trace->enable_lock);

	nvgpu_kfree(g, g->fecs_trace);
	g->fecs_trace = NULL;
	return 0;
}

int gk20a_gr_max_entries(struct gk20a *g,
		struct nvgpu_gpu_ctxsw_trace_filter *filter)
{
	int n;
	int tag;

	/* Compute number of entries per record, with given filter */
	for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
		n += (NVGPU_GPU_CTXSW_FILTER_ISSET(tag, filter) != 0);

	/* Return max number of entries generated for the whole ring */
	return n * GK20A_FECS_TRACE_NUM_RECORDS;
}

int gk20a_fecs_trace_enable(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	int write;
	int err = 0;

	if (!trace)
		return -EINVAL;

	nvgpu_mutex_acquire(&trace->enable_lock);
	trace->enable_count++;

	if (trace->enable_count == 1U) {
		/* drop data in hw buffer */
		if (g->ops.fecs_trace.flush)
			g->ops.fecs_trace.flush(g);

		write = gk20a_fecs_trace_get_write_index(g);

		if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
			/*
			 * For enabling FECS trace support, MAILBOX1's MSB
			 * (Bit 31:31) should be set to 1. Bits 30:0 represent
			 * the actual pointer value.
			 */
			write = write |
				(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT));
		}
		gk20a_fecs_trace_set_read_index(g, write);

		/*
		 * FECS ucode does a priv holdoff around the assertion of
		 * context reset. So, pri transactions (e.g. mailbox1 register
		 * write) might fail due to this. Hence, do write with ack
		 * i.e. write and read it back to make sure write happened for
		 * mailbox1.
		 */
		while (gk20a_fecs_trace_get_read_index(g) != write) {
			nvgpu_log(g, gpu_dbg_ctxsw, "mailbox1 update failed");
			gk20a_fecs_trace_set_read_index(g, write);
		}

		err = nvgpu_thread_create(&trace->poll_task, g,
				gk20a_fecs_trace_periodic_polling, __func__);
		if (err) {
			nvgpu_warn(g, "failed to create FECS polling task");
			goto done;
		}
	}

done:
	nvgpu_mutex_release(&trace->enable_lock);
	return err;
}

int gk20a_fecs_trace_disable(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;
	int read = 0;

	if (trace == NULL) {
		return -EINVAL;
	}

	nvgpu_mutex_acquire(&trace->enable_lock);
	if (trace->enable_count <= 0U) {
		nvgpu_mutex_release(&trace->enable_lock);
		return 0;
	}
	trace->enable_count--;

	if (trace->enable_count == 0U) {
		if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
			/*
			 * For disabling FECS trace support, MAILBOX1's MSB
			 * (Bit 31:31) should be set to 0.
			 */
			read = gk20a_fecs_trace_get_read_index(g) &
				(~(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT)));

			gk20a_fecs_trace_set_read_index(g, read);

			/*
			 * FECS ucode does a priv holdoff around the assertion
			 * of context reset. So, pri transactions (e.g.
			 * mailbox1 register write) might fail due to this.
			 * Hence, do write with ack i.e. write and read it back
			 * to make sure write happened for mailbox1.
			 */
			while (gk20a_fecs_trace_get_read_index(g) != read) {
				nvgpu_log(g, gpu_dbg_ctxsw,
					"mailbox1 update failed");
				gk20a_fecs_trace_set_read_index(g, read);
			}
		}

		nvgpu_thread_stop(&trace->poll_task);
	}
	nvgpu_mutex_release(&trace->enable_lock);

	return -EPERM;
}

bool gk20a_fecs_trace_is_enabled(struct gk20a *g)
{
	struct gk20a_fecs_trace *trace = g->fecs_trace;

	return (trace && nvgpu_thread_is_running(&trace->poll_task));
}

void gk20a_fecs_trace_reset_buffer(struct gk20a *g)
{
	nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, " ");

	gk20a_fecs_trace_set_read_index(g,
		gk20a_fecs_trace_get_write_index(g));
}
#endif /* CONFIG_GK20A_CTXSW_TRACE */