/*
 * Virtualized GPU Fifo
 *
 * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/dma-mapping.h>

#include "vgpu/vgpu.h"
#include "gk20a/hw_fifo_gk20a.h"
#include "gk20a/hw_ram_gk20a.h"

static void vgpu_channel_bind(struct channel_gk20a *ch)
{
	struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_channel_config_params *p =
			&msg.params.channel_config;
	int err;

	gk20a_dbg_info("bind channel %d", ch->hw_chid);

	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_BIND;
	msg.handle = platform->virt_handle;
	p->handle = ch->virt_ctx;
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
	WARN_ON(err || msg.ret);

	ch->bound = true;
}

static void vgpu_channel_unbind(struct channel_gk20a *ch)
{
	struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);

	gk20a_dbg_fn("");

	if (ch->bound) {
		struct tegra_vgpu_cmd_msg msg;
		struct tegra_vgpu_channel_config_params *p =
				&msg.params.channel_config;
		int err;

		msg.cmd = TEGRA_VGPU_CMD_CHANNEL_UNBIND;
		msg.handle = platform->virt_handle;
		p->handle = ch->virt_ctx;
		err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
		WARN_ON(err || msg.ret);
	}

	ch->bound = false;

	/*
	 * If we are aggressive, we can destroy the syncpt resource at this
	 * point; if not, it will be destroyed at channel_free().
	 */
	if (ch->sync && ch->sync->aggressive_destroy) {
		ch->sync->destroy(ch->sync);
		ch->sync = NULL;
	}
}

static int vgpu_channel_alloc_inst(struct gk20a *g, struct channel_gk20a *ch)
{
	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_channel_hwctx_params *p = &msg.params.channel_hwctx;
	int err;

	gk20a_dbg_fn("");

	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_HWCTX;
	msg.handle = platform->virt_handle;
	p->id = ch->hw_chid;
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
	if (err || msg.ret) {
		gk20a_err(dev_from_gk20a(g), "failed to alloc hwctx");
		return -ENOMEM;
	}

	ch->virt_ctx = p->handle;
	gk20a_dbg_fn("done");
	return 0;
}

static void vgpu_channel_free_inst(struct gk20a *g, struct channel_gk20a *ch)
{
	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_channel_hwctx_params *p = &msg.params.channel_hwctx;
	int err;

	gk20a_dbg_fn("");

	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_HWCTX;
	msg.handle = platform->virt_handle;
	p->handle = ch->virt_ctx;
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
	WARN_ON(err || msg.ret);
}

static void vgpu_channel_disable(struct channel_gk20a *ch)
{
	struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_channel_config_params *p =
			&msg.params.channel_config;
	int err;

	gk20a_dbg_fn("");

	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_DISABLE;
	msg.handle = platform->virt_handle;
	p->handle = ch->virt_ctx;
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
	WARN_ON(err || msg.ret);
}
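/*
 * RAMFC setup is also delegated to the server: the guest forwards only the
 * gpfifo GPU VA, the number of gpfifo entries and the userd IOVA, plus a
 * flag telling the server whether the addresses go through an IOMMU.
 */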
static int vgpu_channel_setup_ramfc(struct channel_gk20a *ch, u64 gpfifo_base,
				u32 gpfifo_entries, u32 flags)
{
	struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
	struct device __maybe_unused *d = dev_from_gk20a(ch->g);
	struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_ramfc_params *p = &msg.params.ramfc;
	int err;

	gk20a_dbg_fn("");

	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SETUP_RAMFC;
	msg.handle = platform->virt_handle;
	p->handle = ch->virt_ctx;
	p->gpfifo_va = gpfifo_base;
	p->num_entries = gpfifo_entries;
	p->userd_addr = ch->userd_iova;
	p->iova = mapping ? 1 : 0;
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));

	return (err || msg.ret) ? -ENOMEM : 0;
}

static int init_engine_info(struct fifo_gk20a *f)
{
	struct fifo_engine_info_gk20a *gr_info;
	const u32 gr_sw_id = ENGINE_GR_GK20A;

	gk20a_dbg_fn("");

	/*
	 * All we really care about finding is the graphics entry;
	 * especially early on in sim it probably thinks it has more.
	 */
	f->num_engines = 1;

	gr_info = f->engine_info + gr_sw_id;

	/* FIXME: retrieve this from server */
	gr_info->runlist_id = 0;

	return 0;
}
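/*
 * Per-runlist bookkeeping: active channels are tracked in a bitmap, and the
 * runlist itself is built in one of MAX_RUNLIST_BUFFERS GPU-accessible
 * buffers of u16 channel ids, sized for one slot per channel.
 */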
static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
{
	struct fifo_engine_info_gk20a *engine_info;
	struct fifo_runlist_info_gk20a *runlist;
	struct device *d = dev_from_gk20a(g);
	u32 runlist_id;
	u32 i;
	u64 runlist_size;

	gk20a_dbg_fn("");

	f->max_runlists = fifo_eng_runlist_base__size_1_v();
	f->runlist_info = kzalloc(sizeof(struct fifo_runlist_info_gk20a) *
				  f->max_runlists, GFP_KERNEL);
	if (!f->runlist_info)
		goto clean_up;

	engine_info = f->engine_info + ENGINE_GR_GK20A;
	runlist_id = engine_info->runlist_id;
	runlist = &f->runlist_info[runlist_id];

	runlist->active_channels =
		kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
			GFP_KERNEL);
	if (!runlist->active_channels)
		goto clean_up_runlist_info;

	runlist_size = sizeof(u16) * f->num_channels;
	for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
		int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
		if (err) {
			dev_err(d, "memory allocation failed\n");
			goto clean_up_runlist;
		}
	}
	mutex_init(&runlist->mutex);

	/*
	 * No buffer is pinned as long as this value doesn't change.
	 * Otherwise, one of them (cur_buffer) must have been pinned.
	 */
	runlist->cur_buffer = MAX_RUNLIST_BUFFERS;

	gk20a_dbg_fn("done");
	return 0;

clean_up_runlist:
	for (i = 0; i < MAX_RUNLIST_BUFFERS; i++)
		gk20a_gmmu_free(g, &runlist->mem[i]);

	kfree(runlist->active_channels);
	runlist->active_channels = NULL;

clean_up_runlist_info:
	kfree(f->runlist_info);
	f->runlist_info = NULL;

clean_up:
	gk20a_dbg_fn("fail");
	return -ENOMEM;
}

static int vgpu_init_fifo_setup_sw(struct gk20a *g)
{
	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
	struct fifo_gk20a *f = &g->fifo;
	struct device *d = dev_from_gk20a(g);
	int chid, err = 0;

	gk20a_dbg_fn("");

	if (f->sw_ready) {
		gk20a_dbg_fn("skip init");
		return 0;
	}

	f->g = g;

	err = vgpu_get_attribute(platform->virt_handle,
				 TEGRA_VGPU_ATTRIB_NUM_CHANNELS,
				 &f->num_channels);
	if (err)
		return -ENXIO;

	f->max_engines = ENGINE_INVAL_GK20A;

	f->userd_entry_size = 1 << ram_userd_base_shift_v();

	err = gk20a_gmmu_alloc(g, f->userd_entry_size * f->num_channels,
			       &f->userd);
	if (err) {
		dev_err(d, "memory allocation failed\n");
		goto clean_up;
	}

	/* bar1 va */
	f->userd.gpu_va = vgpu_bar1_map(g, &f->userd.sgt, f->userd.size);
	if (!f->userd.gpu_va) {
		dev_err(d, "gmmu mapping failed\n");
		err = -ENOMEM;
		goto clean_up;
	}

	gk20a_dbg(gpu_dbg_map, "userd bar1 va = 0x%llx", f->userd.gpu_va);

	f->channel = kzalloc(f->num_channels * sizeof(*f->channel),
			     GFP_KERNEL);
	f->engine_info = kzalloc(f->max_engines * sizeof(*f->engine_info),
				 GFP_KERNEL);
	if (!(f->channel && f->engine_info)) {
		err = -ENOMEM;
		goto clean_up;
	}

	err = init_engine_info(f);
	if (err)
		goto clean_up;

	err = init_runlist(g, f);
	if (err)
		goto clean_up;

	INIT_LIST_HEAD(&f->free_chs);
	mutex_init(&f->free_chs_mutex);

	for (chid = 0; chid < f->num_channels; chid++) {
		f->channel[chid].userd_cpu_va =
			f->userd.cpu_va + chid * f->userd_entry_size;
		f->channel[chid].userd_iova =
			g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) +
			chid * f->userd_entry_size;
		f->channel[chid].userd_gpu_va =
			f->userd.gpu_va + chid * f->userd_entry_size;

		gk20a_init_channel_support(g, chid);
	}

	f->deferred_reset_pending = false;
	mutex_init(&f->deferred_reset_mutex);

	f->sw_ready = true;

	gk20a_dbg_fn("done");
	return 0;

clean_up:
	gk20a_dbg_fn("fail");
	/* FIXME: unmap from bar1 */
	gk20a_gmmu_free(g, &f->userd);
	memset(&f->userd, 0, sizeof(f->userd));

	kfree(f->channel);
	f->channel = NULL;
	kfree(f->engine_info);
	f->engine_info = NULL;

	return err;
}
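/*
 * Hardware setup reduces to a sanity check of the userd region: write
 * through the CPU mapping and read back through bar1, then write through
 * bar1 and verify both bar1 and the CPU mapping observe the new value.
 */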
static int vgpu_init_fifo_setup_hw(struct gk20a *g)
{
	gk20a_dbg_fn("");

	/* test write, read through bar1 @ userd region before
	 * turning on the snooping */
	{
		struct fifo_gk20a *f = &g->fifo;
		u32 v, v1 = 0x33, v2 = 0x55;
		u32 bar1_vaddr = f->userd.gpu_va;
		volatile u32 *cpu_vaddr = f->userd.cpu_va;

		gk20a_dbg_info("test bar1 @ vaddr 0x%x", bar1_vaddr);

		v = gk20a_bar1_readl(g, bar1_vaddr);

		*cpu_vaddr = v1;
		smp_mb();

		if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
			gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
			return -EINVAL;
		}

		gk20a_bar1_writel(g, bar1_vaddr, v2);

		if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
			gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
			return -EINVAL;
		}

		/* is it visible to the cpu? */
		if (*cpu_vaddr != v2) {
			gk20a_err(dev_from_gk20a(g),
				"cpu didn't see bar1 write @ %p!",
				cpu_vaddr);
		}

		/* put it back */
		gk20a_bar1_writel(g, bar1_vaddr, v);
	}

	gk20a_dbg_fn("done");
	return 0;
}

int vgpu_init_fifo_support(struct gk20a *g)
{
	int err;

	gk20a_dbg_fn("");

	err = vgpu_init_fifo_setup_sw(g);
	if (err)
		return err;

	err = vgpu_init_fifo_setup_hw(g);
	return err;
}

static int vgpu_fifo_preempt_channel(struct gk20a *g, u32 hw_chid)
{
	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
	struct fifo_gk20a *f = &g->fifo;
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_channel_config_params *p =
			&msg.params.channel_config;
	int err;

	gk20a_dbg_fn("");

	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_PREEMPT;
	msg.handle = platform->virt_handle;
	p->handle = f->channel[hw_chid].virt_ctx;
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));

	if (err || msg.ret) {
		gk20a_err(dev_from_gk20a(g),
			"preempt channel %d failed\n", hw_chid);
		err = -ENOMEM;
	}

	return err;
}

static int vgpu_submit_runlist(u64 handle, u8 runlist_id, u16 *runlist,
			       u32 num_entries)
{
	struct tegra_vgpu_cmd_msg *msg;
	struct tegra_vgpu_runlist_params *p;
	size_t size = sizeof(*msg) + sizeof(*runlist) * num_entries;
	char *ptr;
	int err;

	msg = kmalloc(size, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	msg->cmd = TEGRA_VGPU_CMD_SUBMIT_RUNLIST;
	msg->handle = handle;
	p = &msg->params.runlist;
	p->runlist_id = runlist_id;
	p->num_entries = num_entries;

	/* the variable-length runlist payload follows the message header */
	ptr = (char *)msg + sizeof(*msg);
	memcpy(ptr, runlist, sizeof(*runlist) * num_entries);
	err = vgpu_comm_sendrecv(msg, size, sizeof(*msg));

	err = (err || msg->ret) ? -1 : 0;
	kfree(msg);
	return err;
}

static int vgpu_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
					u32 hw_chid, bool add,
					bool wait_for_finish)
{
	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
	struct fifo_gk20a *f = &g->fifo;
	struct fifo_runlist_info_gk20a *runlist;
	u16 *runlist_entry = NULL;
	u32 count = 0;

	gk20a_dbg_fn("");

	runlist = &f->runlist_info[runlist_id];

	/*
	 * For a valid channel, add/remove it from the active list.
	 * Otherwise, keep the active list untouched for suspend/resume.
	 */
	if (hw_chid != ~0) {
		if (add) {
			if (test_and_set_bit(hw_chid,
				runlist->active_channels) == 1)
				return 0;
		} else {
			if (test_and_clear_bit(hw_chid,
				runlist->active_channels) == 0)
				return 0;
		}
	}

	if (hw_chid != ~0 || /* add/remove a valid channel */
	    add /* resume to add all channels back */) {
		u32 chid;

		runlist_entry = runlist->mem[0].cpu_va;
		for_each_set_bit(chid,
				 runlist->active_channels, f->num_channels) {
			gk20a_dbg_info("add channel %d to runlist", chid);
			runlist_entry[0] = chid;
			runlist_entry++;
			count++;
		}
	} else { /* suspend to remove all channels */
		count = 0;
	}

	return vgpu_submit_runlist(platform->virt_handle, runlist_id,
				   runlist->mem[0].cpu_va, count);
}
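/*
 * vgpu_fifo_update_runlist() below is the entry point used by the common
 * fifo code; it only takes runlist->mutex and defers to the _locked variant
 * above, so rebuilds of a given runlist are serialized.
 */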
/*
 * Add/remove a channel from the runlist. In the special cases below,
 * runlist->active_channels will NOT be changed:
 * (hw_chid == ~0 && !add) means remove all active channels from the runlist.
 * (hw_chid == ~0 && add) means restore all active channels on the runlist.
 */
static int vgpu_fifo_update_runlist(struct gk20a *g, u32 runlist_id,
				u32 hw_chid, bool add, bool wait_for_finish)
{
	struct fifo_runlist_info_gk20a *runlist = NULL;
	struct fifo_gk20a *f = &g->fifo;
	int ret = 0;

	gk20a_dbg_fn("");

	runlist = &f->runlist_info[runlist_id];

	mutex_lock(&runlist->mutex);
	ret = vgpu_fifo_update_runlist_locked(g, runlist_id, hw_chid, add,
					      wait_for_finish);
	mutex_unlock(&runlist->mutex);
	return ret;
}

static int vgpu_fifo_wait_engine_idle(struct gk20a *g)
{
	gk20a_dbg_fn("");

	return 0;
}

static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g,
		struct channel_gk20a *ch)
{
	if (ch->error_notifier) {
		if (ch->error_notifier->status == 0xffff) {
			/*
			 * If the error code is already set, this mmu fault
			 * was triggered as part of recovery from another
			 * error condition. Don't overwrite the error flag.
			 */
		} else {
			gk20a_set_error_notifier(ch,
				NVGPU_CHANNEL_FIFO_ERROR_MMU_ERR_FLT);
		}
	}
	/* mark channel as faulted */
	ch->has_timedout = true;
	wmb();
	/* unblock pending waits */
	wake_up(&ch->semaphore_wq);
	wake_up(&ch->notifier_wq);
	wake_up(&ch->submit_wq);
}

int vgpu_fifo_isr(struct gk20a *g, struct tegra_vgpu_fifo_intr_info *info)
{
	struct fifo_gk20a *f = &g->fifo;
	struct channel_gk20a *ch = &f->channel[info->chid];

	gk20a_err(dev_from_gk20a(g), "fifo intr (%d) on ch %u",
		info->type, info->chid);

	switch (info->type) {
	case TEGRA_VGPU_FIFO_INTR_PBDMA:
		gk20a_set_error_notifier(ch, NVGPU_CHANNEL_PBDMA_ERROR);
		break;
	case TEGRA_VGPU_FIFO_INTR_CTXSW_TIMEOUT:
		gk20a_set_error_notifier(ch,
					NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
		break;
	case TEGRA_VGPU_FIFO_INTR_MMU_FAULT:
		gk20a_channel_abort(ch);
		vgpu_fifo_set_ctx_mmu_error(g, ch);
		break;
	default:
		WARN_ON(1);
		break;
	}

	return 0;
}

void vgpu_init_fifo_ops(struct gpu_ops *gops)
{
	gops->fifo.bind_channel = vgpu_channel_bind;
	gops->fifo.unbind_channel = vgpu_channel_unbind;
	gops->fifo.disable_channel = vgpu_channel_disable;
	gops->fifo.alloc_inst = vgpu_channel_alloc_inst;
	gops->fifo.free_inst = vgpu_channel_free_inst;
	gops->fifo.setup_ramfc = vgpu_channel_setup_ramfc;
	gops->fifo.preempt_channel = vgpu_fifo_preempt_channel;
	gops->fifo.update_runlist = vgpu_fifo_update_runlist;
	gops->fifo.wait_engine_idle = vgpu_fifo_wait_engine_idle;
}
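/*
 * Usage sketch (illustrative only; the caller below is hypothetical and not
 * part of this file): a virtualized platform installs these hooks while
 * building its HAL, after which the common fifo code issues RPCs to the
 * server instead of touching hardware directly:
 *
 *	static int vgpu_init_hal(struct gk20a *g)
 *	{
 *		vgpu_init_fifo_ops(&g->ops);
 *		return vgpu_init_fifo_support(g);
 *	}
 */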