From 1fd722f592c2e0523c5e399a2406a4e387057188 Mon Sep 17 00:00:00 2001 From: Aingara Paramakuru Date: Mon, 5 May 2014 21:14:22 -0400 Subject: gpu: nvgpu: support gk20a virtualization The nvgpu driver now supports using the Tegra graphics virtualization interfaces to support gk20a in a virtualized environment. Bug 1509608 Change-Id: I6ede15ee7bf0b0ad8a13e8eb5f557c3516ead676 Signed-off-by: Aingara Paramakuru Reviewed-on: http://git-master/r/440122 Reviewed-by: Terje Bergstrom Tested-by: Terje Bergstrom --- drivers/gpu/nvgpu/vgpu/Makefile | 10 + drivers/gpu/nvgpu/vgpu/fifo_vgpu.c | 569 ++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/vgpu/gr_vgpu.c | 687 +++++++++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/vgpu/ltc_vgpu.c | 55 +++ drivers/gpu/nvgpu/vgpu/mm_vgpu.c | 425 +++++++++++++++++++++++ drivers/gpu/nvgpu/vgpu/vgpu.c | 416 ++++++++++++++++++++++ drivers/gpu/nvgpu/vgpu/vgpu.h | 41 +++ 7 files changed, 2203 insertions(+) create mode 100644 drivers/gpu/nvgpu/vgpu/Makefile create mode 100644 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c create mode 100644 drivers/gpu/nvgpu/vgpu/gr_vgpu.c create mode 100644 drivers/gpu/nvgpu/vgpu/ltc_vgpu.c create mode 100644 drivers/gpu/nvgpu/vgpu/mm_vgpu.c create mode 100644 drivers/gpu/nvgpu/vgpu/vgpu.c create mode 100644 drivers/gpu/nvgpu/vgpu/vgpu.h (limited to 'drivers/gpu/nvgpu/vgpu') diff --git a/drivers/gpu/nvgpu/vgpu/Makefile b/drivers/gpu/nvgpu/vgpu/Makefile new file mode 100644 index 00000000..edad7171 --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/Makefile @@ -0,0 +1,10 @@ +GCOV_PROFILE := y +ccflags-y += -Idrivers/gpu/nvgpu +ccflags-y += -Wno-multichar + +obj-$(CONFIG_TEGRA_GR_VIRTUALIZATION) = \ + ltc_vgpu.o \ + gr_vgpu.o \ + fifo_vgpu.o \ + mm_vgpu.o \ + vgpu.o diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c new file mode 100644 index 00000000..23dec1f3 --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c @@ -0,0 +1,569 @@ +/* + * Virtualized GPU Fifo + * + * Copyright (c) 2014 NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include "vgpu/vgpu.h" +#include "gk20a/hw_fifo_gk20a.h" +#include "gk20a/hw_ram_gk20a.h" + +static void vgpu_channel_bind(struct channel_gk20a *ch) +{ + struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_channel_config_params *p = + &msg.params.channel_config; + int err; + + gk20a_dbg_info("bind channel %d", ch->hw_chid); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_BIND; + msg.handle = platform->virt_handle; + p->handle = ch->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + + ch->bound = true; +} + +static void vgpu_channel_unbind(struct channel_gk20a *ch) +{ + struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev); + + gk20a_dbg_fn(""); + + if (ch->bound) { + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_channel_config_params *p = + &msg.params.channel_config; + int err; + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_UNBIND; + msg.handle = platform->virt_handle; + p->handle = ch->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + } + + ch->bound = false; + + /* + * if we are agrressive then we can destroy the syncpt + * resource at this point + * if not, then it will be destroyed at channel_free() + */ + if (ch->sync && ch->sync->aggressive_destroy) { + ch->sync->destroy(ch->sync); + ch->sync = NULL; + } +} + +static int vgpu_channel_alloc_inst(struct gk20a *g, struct channel_gk20a *ch) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_channel_hwctx_params *p = &msg.params.channel_hwctx; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_HWCTX; + msg.handle = platform->virt_handle; + p->id = ch->hw_chid; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) { + gk20a_err(dev_from_gk20a(g), "fail"); + return -ENOMEM; + } + + ch->virt_ctx = p->handle; + gk20a_dbg_fn("done"); + return 0; +} + +static void vgpu_channel_free_inst(struct gk20a *g, struct channel_gk20a *ch) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_channel_hwctx_params *p = &msg.params.channel_hwctx; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_HWCTX; + msg.handle = platform->virt_handle; + p->handle = ch->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); +} + +static void vgpu_channel_disable(struct channel_gk20a *ch) +{ + struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_channel_config_params *p = + &msg.params.channel_config; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_DISABLE; + msg.handle = platform->virt_handle; + p->handle = ch->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); +} + +static int vgpu_channel_setup_ramfc(struct channel_gk20a *ch, u64 gpfifo_base, + u32 gpfifo_entries) +{ + struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev); + struct device __maybe_unused *d = dev_from_gk20a(ch->g); + struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_ramfc_params *p = &msg.params.ramfc; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SETUP_RAMFC; + msg.handle = platform->virt_handle; + p->handle = ch->virt_ctx; + p->gpfifo_va = 
gpfifo_base; + p->num_entries = gpfifo_entries; + p->userd_addr = ch->userd_iova; + p->iova = mapping ? 1 : 0; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + return (err || msg.ret) ? -ENOMEM : 0; +} + +static int init_engine_info(struct fifo_gk20a *f) +{ + struct fifo_engine_info_gk20a *gr_info; + const u32 gr_sw_id = ENGINE_GR_GK20A; + + gk20a_dbg_fn(""); + + /* all we really care about finding is the graphics entry */ + /* especially early on in sim it probably thinks it has more */ + f->num_engines = 1; + + gr_info = f->engine_info + gr_sw_id; + + gr_info->sw_id = gr_sw_id; + gr_info->name = "gr"; + /* FIXME: retrieve this from server */ + gr_info->runlist_id = 0; + return 0; +} + +static int init_runlist(struct gk20a *g, struct fifo_gk20a *f) +{ + struct fifo_engine_info_gk20a *engine_info; + struct fifo_runlist_info_gk20a *runlist; + struct device *d = dev_from_gk20a(g); + u32 runlist_id; + u32 i; + u64 runlist_size; + + gk20a_dbg_fn(""); + + f->max_runlists = fifo_eng_runlist_base__size_1_v(); + f->runlist_info = kzalloc(sizeof(struct fifo_runlist_info_gk20a) * + f->max_runlists, GFP_KERNEL); + if (!f->runlist_info) + goto clean_up; + + engine_info = f->engine_info + ENGINE_GR_GK20A; + runlist_id = engine_info->runlist_id; + runlist = &f->runlist_info[runlist_id]; + + runlist->active_channels = + kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE), + GFP_KERNEL); + if (!runlist->active_channels) + goto clean_up_runlist_info; + + runlist_size = sizeof(u16) * f->num_channels; + for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { + dma_addr_t iova; + + runlist->mem[i].cpuva = + dma_alloc_coherent(d, + runlist_size, + &iova, + GFP_KERNEL); + if (!runlist->mem[i].cpuva) { + dev_err(d, "memory allocation failed\n"); + goto clean_up_runlist; + } + runlist->mem[i].iova = iova; + runlist->mem[i].size = runlist_size; + } + mutex_init(&runlist->mutex); + init_waitqueue_head(&runlist->runlist_wq); + + /* None of buffers is pinned if this value doesn't change. + Otherwise, one of them (cur_buffer) must have been pinned. 
*/ + runlist->cur_buffer = MAX_RUNLIST_BUFFERS; + + gk20a_dbg_fn("done"); + return 0; + +clean_up_runlist: + for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { + if (runlist->mem[i].cpuva) + dma_free_coherent(d, + runlist->mem[i].size, + runlist->mem[i].cpuva, + runlist->mem[i].iova); + runlist->mem[i].cpuva = NULL; + runlist->mem[i].iova = 0; + } + + kfree(runlist->active_channels); + runlist->active_channels = NULL; + +clean_up_runlist_info: + kfree(f->runlist_info); + f->runlist_info = NULL; + +clean_up: + gk20a_dbg_fn("fail"); + return -ENOMEM; +} + +static int vgpu_init_fifo_setup_sw(struct gk20a *g) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct fifo_gk20a *f = &g->fifo; + struct device *d = dev_from_gk20a(g); + int chid, err = 0; + dma_addr_t iova; + + gk20a_dbg_fn(""); + + if (f->sw_ready) { + gk20a_dbg_fn("skip init"); + return 0; + } + + f->g = g; + + err = vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_NUM_CHANNELS, + &f->num_channels); + if (err) + return -ENXIO; + + f->max_engines = ENGINE_INVAL_GK20A; + + f->userd_entry_size = 1 << ram_userd_base_shift_v(); + f->userd_total_size = f->userd_entry_size * f->num_channels; + + f->userd.cpuva = dma_alloc_coherent(d, + f->userd_total_size, + &iova, + GFP_KERNEL); + if (!f->userd.cpuva) { + dev_err(d, "memory allocation failed\n"); + goto clean_up; + } + + f->userd.iova = iova; + err = gk20a_get_sgtable(d, &f->userd.sgt, + f->userd.cpuva, f->userd.iova, + f->userd_total_size); + if (err) { + dev_err(d, "failed to create sg table\n"); + goto clean_up; + } + + /* bar1 va */ + f->userd.gpu_va = vgpu_bar1_map(g, &f->userd.sgt, f->userd_total_size); + if (!f->userd.gpu_va) { + dev_err(d, "gmmu mapping failed\n"); + goto clean_up; + } + + gk20a_dbg(gpu_dbg_map, "userd bar1 va = 0x%llx", f->userd.gpu_va); + + f->userd.size = f->userd_total_size; + + f->channel = kzalloc(f->num_channels * sizeof(*f->channel), + GFP_KERNEL); + f->engine_info = kzalloc(f->max_engines * sizeof(*f->engine_info), + GFP_KERNEL); + + if (!(f->channel && f->engine_info)) { + err = -ENOMEM; + goto clean_up; + } + + init_engine_info(f); + + init_runlist(g, f); + + for (chid = 0; chid < f->num_channels; chid++) { + f->channel[chid].userd_cpu_va = + f->userd.cpuva + chid * f->userd_entry_size; + f->channel[chid].userd_iova = + NV_MC_SMMU_VADDR_TRANSLATE(f->userd.iova) + + chid * f->userd_entry_size; + f->channel[chid].userd_gpu_va = + f->userd.gpu_va + chid * f->userd_entry_size; + + gk20a_init_channel_support(g, chid); + } + mutex_init(&f->ch_inuse_mutex); + + f->deferred_reset_pending = false; + mutex_init(&f->deferred_reset_mutex); + + f->sw_ready = true; + + gk20a_dbg_fn("done"); + return 0; + +clean_up: + gk20a_dbg_fn("fail"); + /* FIXME: unmap from bar1 */ + if (f->userd.sgt) + gk20a_free_sgtable(&f->userd.sgt); + if (f->userd.cpuva) + dma_free_coherent(d, + f->userd_total_size, + f->userd.cpuva, + f->userd.iova); + f->userd.cpuva = NULL; + f->userd.iova = 0; + + memset(&f->userd, 0, sizeof(struct userd_desc)); + + kfree(f->channel); + f->channel = NULL; + kfree(f->engine_info); + f->engine_info = NULL; + + return err; +} + +static int vgpu_init_fifo_setup_hw(struct gk20a *g) +{ + gk20a_dbg_fn(""); + + /* test write, read through bar1 @ userd region before + * turning on the snooping */ + { + struct fifo_gk20a *f = &g->fifo; + u32 v, v1 = 0x33, v2 = 0x55; + + u32 bar1_vaddr = f->userd.gpu_va; + volatile u32 *cpu_vaddr = f->userd.cpuva; + + gk20a_dbg_info("test bar1 @ vaddr 0x%x", + bar1_vaddr); + + v = gk20a_bar1_readl(g, 
bar1_vaddr); + + *cpu_vaddr = v1; + smp_mb(); + + if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) { + gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!"); + return -EINVAL; + } + + gk20a_bar1_writel(g, bar1_vaddr, v2); + + if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) { + gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!"); + return -EINVAL; + } + + /* is it visible to the cpu? */ + if (*cpu_vaddr != v2) { + gk20a_err(dev_from_gk20a(g), + "cpu didn't see bar1 write @ %p!", + cpu_vaddr); + } + + /* put it back */ + gk20a_bar1_writel(g, bar1_vaddr, v); + } + + gk20a_dbg_fn("done"); + + return 0; +} + +int vgpu_init_fifo_support(struct gk20a *g) +{ + u32 err; + + gk20a_dbg_fn(""); + + err = vgpu_init_fifo_setup_sw(g); + if (err) + return err; + + err = vgpu_init_fifo_setup_hw(g); + return err; +} + +static int vgpu_fifo_preempt_channel(struct gk20a *g, u32 hw_chid) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct fifo_gk20a *f = &g->fifo; + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_channel_config_params *p = + &msg.params.channel_config; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_PREEMPT; + msg.handle = platform->virt_handle; + p->handle = f->channel[hw_chid].virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + if (err || msg.ret) { + gk20a_err(dev_from_gk20a(g), + "preempt channel %d failed\n", hw_chid); + err = -ENOMEM; + } + + return err; +} + +static int vgpu_submit_runlist(u64 handle, u8 runlist_id, u16 *runlist, + u32 num_entries) +{ + struct tegra_vgpu_cmd_msg *msg; + struct tegra_vgpu_runlist_params *p; + size_t size = sizeof(*msg) + sizeof(*runlist) * num_entries; + char *ptr; + int err; + + msg = kmalloc(size, GFP_KERNEL); + if (!msg) + return -1; + + msg->cmd = TEGRA_VGPU_CMD_SUBMIT_RUNLIST; + msg->handle = handle; + p = &msg->params.runlist; + p->runlist_id = runlist_id; + p->num_entries = num_entries; + + ptr = (char *)msg + sizeof(*msg); + memcpy(ptr, runlist, sizeof(*runlist) * num_entries); + err = vgpu_comm_sendrecv(msg, size, sizeof(*msg)); + + err = (err || msg->ret) ? -1 : 0; + kfree(msg); + return err; +} + +static int vgpu_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, + u32 hw_chid, bool add, + bool wait_for_finish) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct fifo_gk20a *f = &g->fifo; + struct fifo_runlist_info_gk20a *runlist; + u16 *runlist_entry = NULL; + u32 count = 0; + + gk20a_dbg_fn(""); + + runlist = &f->runlist_info[runlist_id]; + + /* valid channel, add/remove it from active list. + Otherwise, keep active list untouched for suspend/resume. */ + if (hw_chid != ~0) { + if (add) { + if (test_and_set_bit(hw_chid, + runlist->active_channels) == 1) + return 0; + } else { + if (test_and_clear_bit(hw_chid, + runlist->active_channels) == 0) + return 0; + } + } + + if (hw_chid != ~0 || /* add/remove a valid channel */ + add /* resume to add all channels back */) { + u32 chid; + + runlist_entry = runlist->mem[0].cpuva; + for_each_set_bit(chid, + runlist->active_channels, f->num_channels) { + gk20a_dbg_info("add channel %d to runlist", chid); + runlist_entry[0] = chid; + runlist_entry++; + count++; + } + } else /* suspend to remove all channels */ + count = 0; + + return vgpu_submit_runlist(platform->virt_handle, runlist_id, + runlist->mem[0].cpuva, count); +} + +/* add/remove a channel from runlist + special cases below: runlist->active_channels will NOT be changed. + (hw_chid == ~0 && !add) means remove all active channels from runlist. 
+ (hw_chid == ~0 && add) means restore all active channels on runlist. */ +static int vgpu_fifo_update_runlist(struct gk20a *g, u32 runlist_id, + u32 hw_chid, bool add, bool wait_for_finish) +{ + struct fifo_runlist_info_gk20a *runlist = NULL; + struct fifo_gk20a *f = &g->fifo; + u32 ret = 0; + + gk20a_dbg_fn(""); + + runlist = &f->runlist_info[runlist_id]; + + mutex_lock(&runlist->mutex); + + ret = vgpu_fifo_update_runlist_locked(g, runlist_id, hw_chid, add, + wait_for_finish); + + mutex_unlock(&runlist->mutex); + return ret; +} + +static int vgpu_fifo_wait_engine_idle(struct gk20a *g) +{ + gk20a_dbg_fn(""); + + return 0; +} + +void vgpu_init_fifo_ops(struct gpu_ops *gops) +{ + gops->fifo.bind_channel = vgpu_channel_bind; + gops->fifo.unbind_channel = vgpu_channel_unbind; + gops->fifo.disable_channel = vgpu_channel_disable; + gops->fifo.alloc_inst = vgpu_channel_alloc_inst; + gops->fifo.free_inst = vgpu_channel_free_inst; + gops->fifo.setup_ramfc = vgpu_channel_setup_ramfc; + gops->fifo.preempt_channel = vgpu_fifo_preempt_channel; + gops->fifo.update_runlist = vgpu_fifo_update_runlist; + gops->fifo.wait_engine_idle = vgpu_fifo_wait_engine_idle; +} + diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c new file mode 100644 index 00000000..a7e966da --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c @@ -0,0 +1,687 @@ +/* + * Virtualized GPU Graphics + * + * Copyright (c) 2014 NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "vgpu/vgpu.h" +#include "gk20a/hw_gr_gk20a.h" + +static int vgpu_gr_commit_inst(struct channel_gk20a *c, u64 gpu_va) +{ + struct gk20a_platform *platform = gk20a_get_platform(c->g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_COMMIT_GR_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + return (err || msg.ret) ? -1 : 0; +} + +static int vgpu_gr_commit_global_ctx_buffers(struct gk20a *g, + struct channel_gk20a *c, bool patch) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_COMMIT_GR_GLOBAL_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + return (err || msg.ret) ? -1 : 0; +} + +/* load saved fresh copy of gloden image into channel gr_ctx */ +static int vgpu_gr_load_golden_ctx_image(struct gk20a *g, + struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_LOAD_GR_GOLDEN_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + return (err || msg.ret) ? 
-1 : 0; +} + +static int vgpu_gr_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + + gk20a_dbg_fn(""); + + vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_GOLDEN_CTX_SIZE, + &g->gr.ctx_vars.golden_image_size); + vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_ZCULL_CTX_SIZE, + &g->gr.ctx_vars.zcull_ctxsw_image_size); + if (!g->gr.ctx_vars.golden_image_size || + !g->gr.ctx_vars.zcull_ctxsw_image_size) + return -ENXIO; + + gr->ctx_vars.buffer_size = g->gr.ctx_vars.golden_image_size; + g->gr.ctx_vars.priv_access_map_size = 512 * 1024; + return 0; +} + +static int vgpu_gr_alloc_global_ctx_buffers(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + int attr_buffer_size; + + u32 cb_buffer_size = gr->bundle_cb_default_size * + gr_scc_bundle_cb_size_div_256b_byte_granularity_v(); + + u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() * + gr_scc_pagepool_total_pages_byte_granularity_v(); + + gk20a_dbg_fn(""); + + attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g); + + gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size); + gr->global_ctx_buffer[CIRCULAR].size = cb_buffer_size; + + gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size); + gr->global_ctx_buffer[PAGEPOOL].size = pagepool_buffer_size; + + gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size); + gr->global_ctx_buffer[ATTRIBUTE].size = attr_buffer_size; + + gk20a_dbg_info("priv access map size : %d", + gr->ctx_vars.priv_access_map_size); + gr->global_ctx_buffer[PRIV_ACCESS_MAP].size = + gr->ctx_vars.priv_access_map_size; + + return 0; +} + +static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g, + struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + struct vm_gk20a *ch_vm = c->vm; + u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va; + u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size; + struct gr_gk20a *gr = &g->gr; + u64 gpu_va; + u32 i; + int err; + + gk20a_dbg_fn(""); + + /* FIXME: add VPR support */ + + /* Circular Buffer */ + gpu_va = gk20a_vm_alloc_va(ch_vm, + gr->global_ctx_buffer[CIRCULAR].size, 0); + + if (!gpu_va) + goto clean_up; + g_bfr_va[CIRCULAR_VA] = gpu_va; + g_bfr_size[CIRCULAR_VA] = gr->global_ctx_buffer[CIRCULAR].size; + + /* Attribute Buffer */ + gpu_va = gk20a_vm_alloc_va(ch_vm, + gr->global_ctx_buffer[ATTRIBUTE].size, 0); + + if (!gpu_va) + goto clean_up; + g_bfr_va[ATTRIBUTE_VA] = gpu_va; + g_bfr_size[ATTRIBUTE_VA] = gr->global_ctx_buffer[ATTRIBUTE].size; + + /* Page Pool */ + gpu_va = gk20a_vm_alloc_va(ch_vm, + gr->global_ctx_buffer[PAGEPOOL].size, 0); + if (!gpu_va) + goto clean_up; + g_bfr_va[PAGEPOOL_VA] = gpu_va; + g_bfr_size[PAGEPOOL_VA] = gr->global_ctx_buffer[PAGEPOOL].size; + + /* Priv register Access Map */ + gpu_va = gk20a_vm_alloc_va(ch_vm, + gr->global_ctx_buffer[PRIV_ACCESS_MAP].size, 0); + if (!gpu_va) + goto clean_up; + g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va; + g_bfr_size[PRIV_ACCESS_MAP_VA] = + gr->global_ctx_buffer[PRIV_ACCESS_MAP].size; + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_MAP_GR_GLOBAL_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + p->cb_va = g_bfr_va[CIRCULAR_VA]; + p->attr_va = g_bfr_va[ATTRIBUTE_VA]; + p->page_pool_va = g_bfr_va[PAGEPOOL_VA]; + p->priv_access_map_va = g_bfr_va[PRIV_ACCESS_MAP_VA]; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) + goto 
clean_up; + + c->ch_ctx.global_ctx_buffer_mapped = true; + return 0; + + clean_up: + for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) { + if (g_bfr_va[i]) { + gk20a_vm_free_va(ch_vm, g_bfr_va[i], + g_bfr_size[i], 0); + g_bfr_va[i] = 0; + } + } + return -ENOMEM; +} + +static void vgpu_gr_unmap_global_ctx_buffers(struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(c->g->dev); + struct vm_gk20a *ch_vm = c->vm; + u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va; + u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size; + u32 i; + + gk20a_dbg_fn(""); + + if (c->ch_ctx.global_ctx_buffer_mapped) { + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_UNMAP_GR_GLOBAL_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + } + + for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) { + if (g_bfr_va[i]) { + gk20a_vm_free_va(ch_vm, g_bfr_va[i], g_bfr_size[i], 0); + g_bfr_va[i] = 0; + g_bfr_size[i] = 0; + } + } + c->ch_ctx.global_ctx_buffer_mapped = false; +} + +static int vgpu_gr_alloc_channel_gr_ctx(struct gk20a *g, + struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + struct gr_gk20a *gr = &g->gr; + struct gr_ctx_desc *gr_ctx; + struct vm_gk20a *ch_vm = c->vm; + int err; + + gk20a_dbg_fn(""); + + if (gr->ctx_vars.buffer_size == 0) + return 0; + + /* alloc channel gr ctx buffer */ + gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size; + gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size; + + gr_ctx = kzalloc(sizeof(*gr_ctx), GFP_KERNEL); + if (!gr_ctx) + return -ENOMEM; + + gr_ctx->size = gr->ctx_vars.buffer_total_size; + gr_ctx->gpu_va = gk20a_vm_alloc_va(ch_vm, gr_ctx->size, 0); + + if (!gr_ctx->gpu_va) { + kfree(gr_ctx); + return -ENOMEM; + } + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_GR_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + p->gr_ctx_va = gr_ctx->gpu_va; + p->class_num = c->obj_class; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + if (err || msg.ret) { + gk20a_vm_free_va(ch_vm, gr_ctx->gpu_va, gr_ctx->size, 0); + err = -ENOMEM; + } else + c->ch_ctx.gr_ctx = gr_ctx; + + return err; +} + +static void vgpu_gr_free_channel_gr_ctx(struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(c->g->dev); + struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct vm_gk20a *ch_vm = c->vm; + + gk20a_dbg_fn(""); + + if (ch_ctx->gr_ctx && ch_ctx->gr_ctx->gpu_va) { + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_GR_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + + gk20a_vm_free_va(ch_vm, ch_ctx->gr_ctx->gpu_va, + ch_ctx->gr_ctx->size, 0); + ch_ctx->gr_ctx->gpu_va = 0; + kfree(ch_ctx->gr_ctx); + } +} + +static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g, + struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx; + struct vm_gk20a *ch_vm = c->vm; + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + gk20a_dbg_fn(""); + + patch_ctx->size = 128 * 
sizeof(u32); + patch_ctx->gpu_va = gk20a_vm_alloc_va(ch_vm, patch_ctx->size, 0); + if (!patch_ctx->gpu_va) + return -ENOMEM; + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_GR_PATCH_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + p->patch_ctx_va = patch_ctx->gpu_va; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) { + gk20a_vm_free_va(ch_vm, patch_ctx->gpu_va, patch_ctx->size, 0); + err = -ENOMEM; + } + + return err; +} + +static void vgpu_gr_free_channel_patch_ctx(struct channel_gk20a *c) +{ + struct gk20a_platform *platform = gk20a_get_platform(c->g->dev); + struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx; + struct vm_gk20a *ch_vm = c->vm; + + gk20a_dbg_fn(""); + + if (patch_ctx->gpu_va) { + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_GR_PATCH_CTX; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + + gk20a_vm_free_va(ch_vm, patch_ctx->gpu_va, patch_ctx->size, 0); + patch_ctx->gpu_va = 0; + } +} + +static void vgpu_gr_free_channel_ctx(struct channel_gk20a *c) +{ + gk20a_dbg_fn(""); + + vgpu_gr_unmap_global_ctx_buffers(c); + vgpu_gr_free_channel_patch_ctx(c); + if (!gk20a_is_channel_marked_as_tsg(c)) + vgpu_gr_free_channel_gr_ctx(c); + + /* zcull_ctx, pm_ctx */ + + memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a)); + + c->num_objects = 0; + c->first_init = false; +} + +static int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, + struct nvhost_alloc_obj_ctx_args *args) +{ + struct gk20a *g = c->g; + struct fifo_gk20a *f = &g->fifo; + struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct tsg_gk20a *tsg = NULL; + int err = 0; + + gk20a_dbg_fn(""); + + /* an address space needs to have been bound at this point.*/ + if (!gk20a_channel_as_bound(c)) { + gk20a_err(dev_from_gk20a(g), + "not bound to address space at time" + " of grctx allocation"); + return -EINVAL; + } + + if (!g->ops.gr.is_valid_class(g, args->class_num)) { + gk20a_err(dev_from_gk20a(g), + "invalid obj class 0x%x", args->class_num); + err = -EINVAL; + goto out; + } + c->obj_class = args->class_num; + + /* FIXME: add TSG support */ + if (gk20a_is_channel_marked_as_tsg(c)) + tsg = &f->tsg[c->tsgid]; + + /* allocate gr ctx buffer */ + if (!ch_ctx->gr_ctx) { + err = vgpu_gr_alloc_channel_gr_ctx(g, c); + if (err) { + gk20a_err(dev_from_gk20a(g), + "fail to allocate gr ctx buffer"); + goto out; + } + } else { + /*TBD: needs to be more subtle about which is + * being allocated as some are allowed to be + * allocated along same channel */ + gk20a_err(dev_from_gk20a(g), + "too many classes alloc'd on same channel"); + err = -EINVAL; + goto out; + } + + /* commit gr ctx buffer */ + err = vgpu_gr_commit_inst(c, ch_ctx->gr_ctx->gpu_va); + if (err) { + gk20a_err(dev_from_gk20a(g), + "fail to commit gr ctx buffer"); + goto out; + } + + /* allocate patch buffer */ + if (ch_ctx->patch_ctx.pages == NULL) { + err = vgpu_gr_alloc_channel_patch_ctx(g, c); + if (err) { + gk20a_err(dev_from_gk20a(g), + "fail to allocate patch buffer"); + goto out; + } + } + + /* map global buffer to channel gpu_va and commit */ + if (!ch_ctx->global_ctx_buffer_mapped) { + err = vgpu_gr_map_global_ctx_buffers(g, c); + if (err) { + gk20a_err(dev_from_gk20a(g), + "fail to map global ctx buffer"); + goto out; + } + gr_gk20a_elpg_protected_call(g, + vgpu_gr_commit_global_ctx_buffers(g, c, true)); + } + + /* load 
golden image */ + if (!c->first_init) { + err = gr_gk20a_elpg_protected_call(g, + vgpu_gr_load_golden_ctx_image(g, c)); + if (err) { + gk20a_err(dev_from_gk20a(g), + "fail to load golden ctx image"); + goto out; + } + c->first_init = true; + } + + c->num_objects++; + + gk20a_dbg_fn("done"); + return 0; +out: + /* 1. gr_ctx, patch_ctx and global ctx buffer mapping + can be reused so no need to release them. + 2. golden image load is a one time thing so if + they pass, no need to undo. */ + gk20a_err(dev_from_gk20a(g), "fail"); + return err; +} + +static int vgpu_gr_free_obj_ctx(struct channel_gk20a *c, + struct nvhost_free_obj_ctx_args *args) +{ + unsigned long timeout = gk20a_get_gr_idle_timeout(c->g); + + gk20a_dbg_fn(""); + + if (c->num_objects == 0) + return 0; + + c->num_objects--; + + if (c->num_objects == 0) { + c->first_init = false; + gk20a_disable_channel(c, + !c->has_timedout, + timeout); + } + + return 0; +} + +static int vgpu_gr_init_gr_config(struct gk20a *g, struct gr_gk20a *gr) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + + gk20a_dbg_fn(""); + + if (vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_GPC_COUNT, &gr->gpc_count)) + return -ENOMEM; + + if (vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_MAX_TPC_PER_GPC_COUNT, + &gr->max_tpc_per_gpc_count)) + return -ENOMEM; + + if (vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_MAX_TPC_COUNT, + &gr->max_tpc_count)) + return -ENOMEM; + + g->ops.gr.bundle_cb_defaults(g); + g->ops.gr.cb_size_default(g); + g->ops.gr.calc_global_ctx_buffer_size(g); + return 0; +} + +static int vgpu_gr_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr, + struct channel_gk20a *c, u64 zcull_va, + u32 mode) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_zcull_bind_params *p = &msg.params.zcull_bind; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_BIND_ZCULL; + msg.handle = platform->virt_handle; + p->handle = c->virt_ctx; + p->zcull_va = zcull_va; + p->mode = mode; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + return (err || msg.ret) ? 
-ENOMEM : 0; +} + +static int vgpu_gr_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr, + struct gr_zcull_info *zcull_params) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_zcull_info_params *p = &msg.params.zcull_info; + int err; + + gk20a_dbg_fn(""); + + msg.cmd = TEGRA_VGPU_CMD_GET_ZCULL_INFO; + msg.handle = platform->virt_handle; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) + return -ENOMEM; + + zcull_params->width_align_pixels = p->width_align_pixels; + zcull_params->height_align_pixels = p->height_align_pixels; + zcull_params->pixel_squares_by_aliquots = p->pixel_squares_by_aliquots; + zcull_params->aliquot_total = p->aliquot_total; + zcull_params->region_byte_multiplier = p->region_byte_multiplier; + zcull_params->region_header_size = p->region_header_size; + zcull_params->subregion_header_size = p->subregion_header_size; + zcull_params->subregion_width_align_pixels = + p->subregion_width_align_pixels; + zcull_params->subregion_height_align_pixels = + p->subregion_height_align_pixels; + zcull_params->subregion_count = p->subregion_count; + + return 0; +} + +static void vgpu_remove_gr_support(struct gr_gk20a *gr) +{ + gk20a_dbg_fn(""); + + gk20a_allocator_destroy(&gr->comp_tags); +} + +static int vgpu_gr_init_gr_setup_sw(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + int err; + + gk20a_dbg_fn(""); + + if (gr->sw_ready) { + gk20a_dbg_fn("skip init"); + return 0; + } + + gr->g = g; + + err = vgpu_gr_init_gr_config(g, gr); + if (err) + goto clean_up; + + err = vgpu_gr_init_ctx_state(g, gr); + if (err) + goto clean_up; + + err = g->ops.ltc.init_comptags(g, gr); + if (err) + goto clean_up; + + err = vgpu_gr_alloc_global_ctx_buffers(g); + if (err) + goto clean_up; + + mutex_init(&gr->ctx_mutex); + + gr->remove_support = vgpu_remove_gr_support; + gr->sw_ready = true; + + gk20a_dbg_fn("done"); + return 0; + +clean_up: + gk20a_err(dev_from_gk20a(g), "fail"); + vgpu_remove_gr_support(gr); + return err; +} + +int vgpu_init_gr_support(struct gk20a *g) +{ + gk20a_dbg_fn(""); + + return vgpu_gr_init_gr_setup_sw(g); +} + +struct gr_isr_data { + u32 addr; + u32 data_lo; + u32 data_hi; + u32 curr_ctx; + u32 chid; + u32 offset; + u32 sub_chan; + u32 class_num; +}; + +static int vgpu_gr_handle_notify_pending(struct gk20a *g, + struct gr_isr_data *isr_data) +{ + struct fifo_gk20a *f = &g->fifo; + struct channel_gk20a *ch = &f->channel[isr_data->chid]; + + gk20a_dbg_fn(""); + wake_up(&ch->notifier_wq); + return 0; +} + +int vgpu_gr_isr(struct gk20a *g, struct tegra_vgpu_gr_intr_info *info) +{ + struct gr_isr_data isr_data; + + gk20a_dbg_fn(""); + + isr_data.chid = info->chid; + + if (info->type == TEGRA_VGPU_GR_INTR_NOTIFY) + vgpu_gr_handle_notify_pending(g, &isr_data); + + return 0; +} + +void vgpu_init_gr_ops(struct gpu_ops *gops) +{ + gops->gr.free_channel_ctx = vgpu_gr_free_channel_ctx; + gops->gr.alloc_obj_ctx = vgpu_gr_alloc_obj_ctx; + gops->gr.free_obj_ctx = vgpu_gr_free_obj_ctx; + gops->gr.bind_ctxsw_zcull = vgpu_gr_bind_ctxsw_zcull; + gops->gr.get_zcull_info = vgpu_gr_get_zcull_info; +} diff --git a/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c b/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c new file mode 100644 index 00000000..ddff23b7 --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c @@ -0,0 +1,55 @@ +/* + * Virtualized GPU L2 + * + * Copyright (c) 2014 NVIDIA CORPORATION. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "vgpu/vgpu.h" + +static int vgpu_determine_L2_size_bytes(struct gk20a *g) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + u32 cache_size = 0; + + gk20a_dbg_fn(""); + + if (vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_L2_SIZE, &cache_size)) + dev_err(dev_from_gk20a(g), "unable to get L2 size"); + + return cache_size; +} + +static int vgpu_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + u32 max_comptag_lines = 0; + + gk20a_dbg_fn(""); + + vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_COMPTAG_LINES, &max_comptag_lines); + if (max_comptag_lines < 2) + return -ENXIO; + + gk20a_allocator_init(&gr->comp_tags, "comptag", + 1, /* start */ + max_comptag_lines - 1, /* length*/ + 1); /* align */ + return 0; +} + +void vgpu_init_ltc_ops(struct gpu_ops *gops) +{ + gops->ltc.determine_L2_size_bytes = vgpu_determine_L2_size_bytes; + gops->ltc.init_comptags = vgpu_ltc_init_comptags; +} diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c new file mode 100644 index 00000000..6ed1dece --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c @@ -0,0 +1,425 @@ +/* + * Virtualized GPU Memory Management + * + * Copyright (c) 2014 NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include "vgpu/vgpu.h" + +/* note: keep the page sizes sorted lowest to highest here */ +static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K }; +static const u32 gmmu_page_shifts[gmmu_nr_page_sizes] = { 12, 17 }; + +static int vgpu_init_mm_setup_sw(struct gk20a *g) +{ + struct mm_gk20a *mm = &g->mm; + + gk20a_dbg_fn(""); + + if (mm->sw_ready) { + gk20a_dbg_fn("skip init"); + return 0; + } + + mm->g = g; + mm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; + mm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big]; + mm->pde_stride = mm->big_page_size << 10; + mm->pde_stride_shift = ilog2(mm->pde_stride); + BUG_ON(mm->pde_stride_shift > 31); /* we have assumptions about this */ + + /*TBD: make channel vm size configurable */ + mm->channel.size = 1ULL << NV_GMMU_VA_RANGE; + + gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20)); + + mm->sw_ready = true; + + return 0; +} + +int vgpu_init_mm_support(struct gk20a *g) +{ + gk20a_dbg_fn(""); + + return vgpu_init_mm_setup_sw(g); +} + +static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm, + u64 map_offset, + struct sg_table *sgt, + u64 buffer_offset, + u64 size, + int pgsz_idx, + u8 kind_v, + u32 ctag_offset, + u32 flags, + int rw_flag, + bool clear_ctags) +{ + int err = 0; + struct device *d = dev_from_vm(vm); + struct gk20a *g = gk20a_from_vm(vm); + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_map_params *p = &msg.params.as_map; + u64 addr = gk20a_mm_iova_addr(sgt->sgl); + u8 prot; + + gk20a_dbg_fn(""); + + /* Allocate (or validate when map_offset != 0) the virtual address. */ + if (!map_offset) { + map_offset = gk20a_vm_alloc_va(vm, size, + pgsz_idx); + if (!map_offset) { + gk20a_err(d, "failed to allocate va space"); + err = -ENOMEM; + goto fail; + } + } + + if (rw_flag == gk20a_mem_flag_read_only) + prot = TEGRA_VGPU_MAP_PROT_READ_ONLY; + else if (rw_flag == gk20a_mem_flag_write_only) + prot = TEGRA_VGPU_MAP_PROT_WRITE_ONLY; + else + prot = TEGRA_VGPU_MAP_PROT_NONE; + + msg.cmd = TEGRA_VGPU_CMD_AS_MAP; + msg.handle = platform->virt_handle; + p->handle = vm->handle; + p->addr = addr; + p->gpu_va = map_offset; + p->size = size; + p->pgsz_idx = pgsz_idx; + p->iova = mapping ? 1 : 0; + p->kind = kind_v; + p->cacheable = + (flags & NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE) ? 
1 : 0; + p->prot = prot; + p->ctag_offset = ctag_offset; + p->clear_ctags = clear_ctags; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) + goto fail; + + vm->tlb_dirty = true; + return map_offset; +fail: + gk20a_err(d, "%s: failed with err=%d\n", __func__, err); + return 0; +} + +static void vgpu_locked_gmmu_unmap(struct vm_gk20a *vm, + u64 vaddr, + u64 size, + int pgsz_idx, + bool va_allocated, + int rw_flag) +{ + struct gk20a *g = gk20a_from_vm(vm); + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_map_params *p = &msg.params.as_map; + int err; + + gk20a_dbg_fn(""); + + if (va_allocated) { + err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx); + if (err) { + dev_err(dev_from_vm(vm), + "failed to free va"); + return; + } + } + + msg.cmd = TEGRA_VGPU_CMD_AS_UNMAP; + msg.handle = platform->virt_handle; + p->handle = vm->handle; + p->gpu_va = vaddr; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) + dev_err(dev_from_vm(vm), + "failed to update gmmu ptes on unmap"); + + vm->tlb_dirty = true; +} + +static void vgpu_vm_remove_support(struct vm_gk20a *vm) +{ + struct gk20a *g = vm->mm->g; + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct mapped_buffer_node *mapped_buffer; + struct vm_reserved_va_node *va_node, *va_node_tmp; + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_share_params *p = &msg.params.as_share; + struct rb_node *node; + int err; + + gk20a_dbg_fn(""); + mutex_lock(&vm->update_gmmu_lock); + + /* TBD: add a flag here for the unmap code to recognize teardown + * and short-circuit any otherwise expensive operations. */ + + node = rb_first(&vm->mapped_buffers); + while (node) { + mapped_buffer = + container_of(node, struct mapped_buffer_node, node); + gk20a_vm_unmap_locked(mapped_buffer); + node = rb_first(&vm->mapped_buffers); + } + + /* destroy remaining reserved memory areas */ + list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list, + reserved_va_list) { + list_del(&va_node->reserved_va_list); + kfree(va_node); + } + + msg.cmd = TEGRA_VGPU_CMD_AS_FREE_SHARE; + msg.handle = platform->virt_handle; + p->handle = vm->handle; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + + gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]); + gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]); + + mutex_unlock(&vm->update_gmmu_lock); + + /* release zero page if used */ + if (vm->zero_page_cpuva) + dma_free_coherent(&g->dev->dev, vm->mm->big_page_size, + vm->zero_page_cpuva, vm->zero_page_iova); + + /* vm is not used anymore. release it. */ + kfree(vm); +} + +u64 vgpu_bar1_map(struct gk20a *g, struct sg_table **sgt, u64 size) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct dma_iommu_mapping *mapping = + to_dma_iommu_mapping(dev_from_gk20a(g)); + u64 addr = gk20a_mm_iova_addr((*sgt)->sgl); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_map_params *p = &msg.params.as_map; + int err; + + msg.cmd = TEGRA_VGPU_CMD_MAP_BAR1; + msg.handle = platform->virt_handle; + p->addr = addr; + p->size = size; + p->iova = mapping ? 
1 : 0; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) + addr = 0; + else + addr = p->gpu_va; + + return addr; +} + +/* address space interfaces for the gk20a module */ +static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share) +{ + struct gk20a_as *as = as_share->as; + struct gk20a *g = gk20a_from_as(as); + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_share_params *p = &msg.params.as_share; + struct mm_gk20a *mm = &g->mm; + struct vm_gk20a *vm; + u64 vma_size; + u32 num_pages, low_hole_pages; + char name[32]; + int err; + + gk20a_dbg_fn(""); + + vm = kzalloc(sizeof(*vm), GFP_KERNEL); + if (!vm) + return -ENOMEM; + + as_share->vm = vm; + + vm->mm = mm; + vm->as_share = as_share; + + vm->big_pages = true; + + vm->va_start = mm->pde_stride; /* create a one pde hole */ + vm->va_limit = mm->channel.size; /* note this means channel.size is + really just the max */ + + msg.cmd = TEGRA_VGPU_CMD_AS_ALLOC_SHARE; + msg.handle = platform->virt_handle; + p->size = vm->va_limit; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + if (err || msg.ret) + return -ENOMEM; + + vm->handle = p->handle; + + /* low-half: alloc small pages */ + /* high-half: alloc big pages */ + vma_size = mm->channel.size >> 1; + + snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, + gmmu_page_sizes[gmmu_page_size_small]>>10); + num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_small]); + + /* num_pages above is without regard to the low-side hole. */ + low_hole_pages = (vm->va_start >> + gmmu_page_shifts[gmmu_page_size_small]); + + gk20a_allocator_init(&vm->vma[gmmu_page_size_small], name, + low_hole_pages, /* start */ + num_pages - low_hole_pages, /* length */ + 1); /* align */ + + snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, + gmmu_page_sizes[gmmu_page_size_big]>>10); + + num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_big]); + gk20a_allocator_init(&vm->vma[gmmu_page_size_big], name, + num_pages, /* start */ + num_pages, /* length */ + 1); /* align */ + + vm->mapped_buffers = RB_ROOT; + + mutex_init(&vm->update_gmmu_lock); + kref_init(&vm->ref); + INIT_LIST_HEAD(&vm->reserved_va_list); + + vm->enable_ctag = true; + + return 0; +} + +static int vgpu_vm_bind_channel(struct gk20a_as_share *as_share, + struct channel_gk20a *ch) +{ + struct vm_gk20a *vm = as_share->vm; + struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_bind_share_params *p = &msg.params.as_bind_share; + int err; + + gk20a_dbg_fn(""); + + ch->vm = vm; + msg.cmd = TEGRA_VGPU_CMD_AS_BIND_SHARE; + msg.handle = platform->virt_handle; + p->as_handle = vm->handle; + p->chan_handle = ch->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + if (err || msg.ret) { + ch->vm = NULL; + err = -ENOMEM; + } + + return err; +} + +static void vgpu_cache_maint(u64 handle, u8 op) +{ + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_cache_maint_params *p = &msg.params.cache_maint; + int err; + + msg.cmd = TEGRA_VGPU_CMD_CACHE_MAINT; + msg.handle = handle; + p->op = op; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); +} + +static int vgpu_mm_fb_flush(struct gk20a *g) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + + gk20a_dbg_fn(""); + + vgpu_cache_maint(platform->virt_handle, TEGRA_VGPU_FB_FLUSH); + return 0; +} + +static void vgpu_mm_l2_invalidate(struct 
gk20a *g) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + + gk20a_dbg_fn(""); + + vgpu_cache_maint(platform->virt_handle, TEGRA_VGPU_L2_MAINT_INV); +} + +static void vgpu_mm_l2_flush(struct gk20a *g, bool invalidate) +{ + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + u8 op; + + gk20a_dbg_fn(""); + + if (invalidate) + op = TEGRA_VGPU_L2_MAINT_FLUSH_INV; + else + op = TEGRA_VGPU_L2_MAINT_FLUSH; + + vgpu_cache_maint(platform->virt_handle, op); +} + +static void vgpu_mm_tlb_invalidate(struct vm_gk20a *vm) +{ + struct gk20a *g = gk20a_from_vm(vm); + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_as_invalidate_params *p = &msg.params.as_invalidate; + int err; + + gk20a_dbg_fn(""); + + /* No need to invalidate if tlb is clean */ + mutex_lock(&vm->update_gmmu_lock); + if (!vm->tlb_dirty) { + mutex_unlock(&vm->update_gmmu_lock); + return; + } + + msg.cmd = TEGRA_VGPU_CMD_AS_INVALIDATE; + msg.handle = platform->virt_handle; + p->handle = vm->handle; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); + vm->tlb_dirty = false; + mutex_unlock(&vm->update_gmmu_lock); +} + +void vgpu_init_mm_ops(struct gpu_ops *gops) +{ + gops->mm.gmmu_map = vgpu_locked_gmmu_map; + gops->mm.gmmu_unmap = vgpu_locked_gmmu_unmap; + gops->mm.vm_remove = vgpu_vm_remove_support; + gops->mm.vm_alloc_share = vgpu_vm_alloc_share; + gops->mm.vm_bind_channel = vgpu_vm_bind_channel; + gops->mm.fb_flush = vgpu_mm_fb_flush; + gops->mm.l2_invalidate = vgpu_mm_l2_invalidate; + gops->mm.l2_flush = vgpu_mm_l2_flush; + gops->mm.tlb_invalidate = vgpu_mm_tlb_invalidate; +} diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c new file mode 100644 index 00000000..cfe307ff --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/vgpu.c @@ -0,0 +1,416 @@ +/* + * Virtualized GPU + * + * Copyright (c) 2014 NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include "vgpu/vgpu.h" +#include "gk20a/debug_gk20a.h" +#include "gk20a/hal_gk20a.h" +#include "gk20a/hw_mc_gk20a.h" + +static inline int vgpu_comm_init(struct platform_device *pdev) +{ + size_t queue_sizes[] = { TEGRA_VGPU_QUEUE_SIZES }; + + return tegra_gr_comm_init(pdev, TEGRA_GR_COMM_CTX_CLIENT, 3, + queue_sizes, TEGRA_VGPU_QUEUE_CMD, + ARRAY_SIZE(queue_sizes)); +} + +static inline void vgpu_comm_deinit(void) +{ + size_t queue_sizes[] = { TEGRA_VGPU_QUEUE_SIZES }; + + tegra_gr_comm_deinit(TEGRA_GR_COMM_CTX_CLIENT, TEGRA_VGPU_QUEUE_CMD, + ARRAY_SIZE(queue_sizes)); +} + +int vgpu_comm_sendrecv(struct tegra_vgpu_cmd_msg *msg, size_t size_in, + size_t size_out) +{ + void *handle; + size_t size = size_in; + void *data = msg; + int err; + + err = tegra_gr_comm_sendrecv(TEGRA_GR_COMM_CTX_CLIENT, + tegra_gr_comm_get_server_vmid(), + TEGRA_VGPU_QUEUE_CMD, &handle, &data, &size); + if (!err) { + WARN_ON(size < size_out); + memcpy(msg, data, size_out); + tegra_gr_comm_release(handle); + } + + return err; +} + +static u64 vgpu_connect(void) +{ + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_connect_params *p = &msg.params.connect; + int err; + + msg.cmd = TEGRA_VGPU_CMD_CONNECT; + p->module = TEGRA_VGPU_MODULE_GPU; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + return (err || msg.ret) ? 0 : p->handle; +} + +int vgpu_get_attribute(u64 handle, u32 attrib, u32 *value) +{ + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_attrib_params *p = &msg.params.attrib; + int err; + + msg.cmd = TEGRA_VGPU_CMD_GET_ATTRIBUTE; + msg.handle = handle; + p->attrib = attrib; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + + if (err || msg.ret) + return -1; + + *value = p->value; + return 0; +} + +static int vgpu_intr_thread(void *dev_id) +{ + struct gk20a *g = dev_id; + + while (true) { + struct tegra_vgpu_intr_msg *msg; + u32 sender; + void *handle; + size_t size; + int err; + + err = tegra_gr_comm_recv(TEGRA_GR_COMM_CTX_CLIENT, + TEGRA_VGPU_QUEUE_INTR, &handle, + (void **)&msg, &size, &sender); + if (WARN_ON(err)) + continue; + + if (msg->event == TEGRA_VGPU_EVENT_ABORT) { + tegra_gr_comm_release(handle); + break; + } + + if (msg->unit == TEGRA_VGPU_INTR_GR) + vgpu_gr_isr(g, &msg->info.gr_intr); + + tegra_gr_comm_release(handle); + } + + while (!kthread_should_stop()) + msleep(10); + return 0; +} + +static void vgpu_remove_support(struct platform_device *dev) +{ + struct gk20a *g = get_gk20a(dev); + struct gk20a_platform *platform = gk20a_get_platform(dev); + struct tegra_vgpu_intr_msg msg; + int err; + + if (g->pmu.remove_support) + g->pmu.remove_support(&g->pmu); + + if (g->gr.remove_support) + g->gr.remove_support(&g->gr); + + if (g->fifo.remove_support) + g->fifo.remove_support(&g->fifo); + + if (g->mm.remove_support) + g->mm.remove_support(&g->mm); + + msg.event = TEGRA_VGPU_EVENT_ABORT; + err = tegra_gr_comm_send(TEGRA_GR_COMM_CTX_CLIENT, + TEGRA_GR_COMM_ID_SELF, TEGRA_VGPU_QUEUE_INTR, + &msg, sizeof(msg)); + WARN_ON(err); + kthread_stop(platform->intr_handler); + + /* free mappings to registers, etc*/ + + if (g->bar1) { + iounmap(g->bar1); + g->bar1 = 0; + } +} + +static int vgpu_init_support(struct platform_device *dev) +{ + struct resource *r = platform_get_resource(dev, IORESOURCE_MEM, 0); + struct gk20a *g = get_gk20a(dev); + int err = 0; + + if (!r) { + dev_err(dev_from_gk20a(g), "faield to get gk20a bar1\n"); + err = -ENXIO; + goto fail; + } + + g->bar1 = devm_request_and_ioremap(&dev->dev, r); + if (!g->bar1) { + 
dev_err(dev_from_gk20a(g), "failed to remap gk20a bar1\n"); + err = -ENXIO; + goto fail; + } + + mutex_init(&g->dbg_sessions_lock); + mutex_init(&g->client_lock); + + g->remove_support = vgpu_remove_support; + return 0; + + fail: + vgpu_remove_support(dev); + return err; +} + +int vgpu_pm_prepare_poweroff(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct gk20a *g = get_gk20a(pdev); + int ret = 0; + + gk20a_dbg_fn(""); + + if (!g->power_on) + return 0; + + ret = gk20a_channel_suspend(g); + if (ret) + return ret; + + g->power_on = false; + + return ret; +} + +static void vgpu_detect_chip(struct gk20a *g) +{ + struct nvhost_gpu_characteristics *gpu = &g->gpu_characteristics; + struct gk20a_platform *platform = gk20a_get_platform(g->dev); + + u32 mc_boot_0_value; + + if (vgpu_get_attribute(platform->virt_handle, + TEGRA_VGPU_ATTRIB_PMC_BOOT_0, + &mc_boot_0_value)) { + gk20a_err(dev_from_gk20a(g), "failed to detect chip"); + return; + } + + gpu->arch = mc_boot_0_architecture_v(mc_boot_0_value) << + NVHOST_GPU_ARCHITECTURE_SHIFT; + gpu->impl = mc_boot_0_implementation_v(mc_boot_0_value); + gpu->rev = + (mc_boot_0_major_revision_v(mc_boot_0_value) << 4) | + mc_boot_0_minor_revision_v(mc_boot_0_value); + + gk20a_dbg_info("arch: %x, impl: %x, rev: %x\n", + g->gpu_characteristics.arch, + g->gpu_characteristics.impl, + g->gpu_characteristics.rev); +} + +static int vgpu_init_hal(struct gk20a *g) +{ + u32 ver = g->gpu_characteristics.arch + g->gpu_characteristics.impl; + + switch (ver) { + case GK20A_GPUID_GK20A: + gk20a_dbg_info("gk20a detected"); + /* init gk20a ops then override with virt extensions */ + gk20a_init_hal(&g->ops); + vgpu_init_fifo_ops(&g->ops); + vgpu_init_gr_ops(&g->ops); + vgpu_init_ltc_ops(&g->ops); + vgpu_init_mm_ops(&g->ops); + break; + default: + gk20a_err(&g->dev->dev, "no support for %x", ver); + return -ENODEV; + } + + return 0; +} + +int vgpu_pm_finalize_poweron(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct gk20a *g = get_gk20a(pdev); + int err; + + gk20a_dbg_fn(""); + + if (g->power_on) + return 0; + + g->power_on = true; + + vgpu_detect_chip(g); + err = vgpu_init_hal(g); + if (err) + goto done; + + err = vgpu_init_mm_support(g); + if (err) { + gk20a_err(dev, "failed to init gk20a mm"); + goto done; + } + + err = vgpu_init_fifo_support(g); + if (err) { + gk20a_err(dev, "failed to init gk20a fifo"); + goto done; + } + + err = vgpu_init_gr_support(g); + if (err) { + gk20a_err(dev, "failed to init gk20a gr"); + goto done; + } + + err = gk20a_init_gpu_characteristics(g); + if (err) { + gk20a_err(dev, "failed to init gk20a gpu characteristics"); + goto done; + } + + gk20a_channel_resume(g); + +done: + return err; +} + +static int vgpu_pm_init(struct platform_device *dev) +{ + int err = 0; + + gk20a_dbg_fn(""); + + pm_runtime_enable(&dev->dev); + return err; +} + +int vgpu_probe(struct platform_device *dev) +{ + struct gk20a *gk20a; + int err; + struct gk20a_platform *platform = gk20a_get_platform(dev); + + if (!platform) { + dev_err(&dev->dev, "no platform data\n"); + return -ENODATA; + } + + gk20a_dbg_fn(""); + + gk20a = kzalloc(sizeof(struct gk20a), GFP_KERNEL); + if (!gk20a) { + dev_err(&dev->dev, "couldn't allocate gk20a support"); + return -ENOMEM; + } + + platform->g = gk20a; + gk20a->dev = dev; + + err = gk20a_user_init(dev); + if (err) + return err; + + vgpu_init_support(dev); + + init_rwsem(&gk20a->busy_lock); + + spin_lock_init(&gk20a->mc_enable_lock); + + /* Initialize the platform 
interface. */ + err = platform->probe(dev); + if (err) { + dev_err(&dev->dev, "platform probe failed"); + return err; + } + + err = vgpu_pm_init(dev); + if (err) { + dev_err(&dev->dev, "pm init failed"); + return err; + } + + if (platform->late_probe) { + err = platform->late_probe(dev); + if (err) { + dev_err(&dev->dev, "late probe failed"); + return err; + } + } + + err = vgpu_comm_init(dev); + if (err) { + dev_err(&dev->dev, "failed to init comm interface\n"); + return -ENOSYS; + } + + platform->virt_handle = vgpu_connect(); + if (!platform->virt_handle) { + dev_err(&dev->dev, "failed to connect to server node\n"); + vgpu_comm_deinit(); + return -ENOSYS; + } + + platform->intr_handler = kthread_run(vgpu_intr_thread, gk20a, "gk20a"); + if (IS_ERR(platform->intr_handler)) + return -ENOMEM; + + gk20a_debug_init(dev); + + /* Set DMA parameters to allow larger sgt lists */ + dev->dev.dma_parms = &gk20a->dma_parms; + dma_set_max_seg_size(&dev->dev, UINT_MAX); + + gk20a->gr_idle_timeout_default = + CONFIG_GK20A_DEFAULT_TIMEOUT; + gk20a->timeouts_enabled = true; + + gk20a_create_sysfs(dev); + gk20a_init_gr(gk20a); + + return 0; +} + +int vgpu_remove(struct platform_device *dev) +{ + struct gk20a *g = get_gk20a(dev); + gk20a_dbg_fn(""); + + if (g->remove_support) + g->remove_support(dev); + + vgpu_comm_deinit(); + gk20a_user_deinit(dev); + gk20a_get_platform(dev)->g = NULL; + kfree(g); + return 0; +} diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.h b/drivers/gpu/nvgpu/vgpu/vgpu.h new file mode 100644 index 00000000..445a1c90 --- /dev/null +++ b/drivers/gpu/nvgpu/vgpu/vgpu.h @@ -0,0 +1,41 @@ +/* + * Virtualized GPU Interfaces + * + * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _VIRT_H_ +#define _VIRT_H_ + +#include +#include +#include "gk20a/gk20a.h" + +int vgpu_pm_prepare_poweroff(struct device *dev); +int vgpu_pm_finalize_poweron(struct device *dev); +int vgpu_probe(struct platform_device *dev); +int vgpu_remove(struct platform_device *dev); +u64 vgpu_bar1_map(struct gk20a *g, struct sg_table **sgt, u64 size); +int vgpu_gr_isr(struct gk20a *g, struct tegra_vgpu_gr_intr_info *info); +void vgpu_init_fifo_ops(struct gpu_ops *gops); +void vgpu_init_gr_ops(struct gpu_ops *gops); +void vgpu_init_ltc_ops(struct gpu_ops *gops); +void vgpu_init_mm_ops(struct gpu_ops *gops); +int vgpu_init_mm_support(struct gk20a *g); +int vgpu_init_gr_support(struct gk20a *g); +int vgpu_init_fifo_support(struct gk20a *g); + +int vgpu_get_attribute(u64 handle, u32 attrib, u32 *value); +int vgpu_comm_sendrecv(struct tegra_vgpu_cmd_msg *msg, size_t size_in, + size_t size_out); + +#endif -- cgit v1.2.2
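
A note on the communication pattern: every guest-side operation added by this patch follows one request/response convention. The driver fills in a struct tegra_vgpu_cmd_msg, addresses it with the connection handle returned by vgpu_connect(), round-trips it to the virtualization server with vgpu_comm_sendrecv(), and treats either a transport error or a non-zero msg.ret from the server as failure. The sketch below is modelled on vgpu_channel_disable() above; the helper name vgpu_do_channel_cmd() is hypothetical and only illustrates the shape of such a call, assuming the vgpu/vgpu.h and tegra_vgpu definitions introduced by this change.

/*
 * Illustrative sketch (not part of the patch): the guest->server RPC
 * convention used by the fifo/gr/mm/ltc code above. The message layout,
 * the command IDs and the vgpu_comm_sendrecv() signature are the ones
 * this change introduces; only the helper name is made up.
 */
#include "vgpu/vgpu.h"

static int vgpu_do_channel_cmd(struct channel_gk20a *ch, u32 cmd)
{
	struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
	struct tegra_vgpu_cmd_msg msg;
	struct tegra_vgpu_channel_config_params *p =
		&msg.params.channel_config;
	int err;

	msg.cmd = cmd;				/* e.g. TEGRA_VGPU_CMD_CHANNEL_DISABLE */
	msg.handle = platform->virt_handle;	/* connection from vgpu_connect() */
	p->handle = ch->virt_ctx;		/* handle from CHANNEL_ALLOC_HWCTX */

	/* blocking send; the server's reply is copied back into msg */
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));

	/* transport failure or a non-zero server status both mean failure */
	return (err || msg.ret) ? -ENOMEM : 0;
}

A caller would use it the way the code above does, e.g. vgpu_do_channel_cmd(ch, TEGRA_VGPU_CMD_CHANNEL_DISABLE); returning -ENOMEM on failure simply mirrors the error convention the rest of this patch follows.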