path: root/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
author	Arto Merilainen <amerilainen@nvidia.com>	2014-03-19 03:38:25 -0400
committer	Dan Willemsen <dwillemsen@nvidia.com>	2015-03-18 15:08:53 -0400
commit	a9785995d5f22aaeb659285f8aeb64d8b56982e0 (patch)
tree	cc75f75bcf43db316a002a7a240b81f299bf6d7f	/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
parent	61efaf843c22b85424036ec98015121c08f5f16c (diff)
gpu: nvgpu: Add NVIDIA GPU Driver
This patch moves the NVIDIA GPU driver to a new location.

Bug 1482562

Change-Id: I24293810b9d0f1504fd9be00135e21dad656ccb6
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Reviewed-on: http://git-master/r/383722
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/mm_gk20a.c	2984
1 files changed, 2984 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
new file mode 100644
index 00000000..b22df5e8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -0,0 +1,2984 @@
1/*
2 * drivers/video/tegra/host/gk20a/mm_gk20a.c
3 *
4 * GK20A memory management
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include <linux/delay.h>
23#include <linux/highmem.h>
24#include <linux/log2.h>
25#include <linux/nvhost.h>
26#include <linux/pm_runtime.h>
27#include <linux/scatterlist.h>
28#include <linux/nvmap.h>
29#include <linux/tegra-soc.h>
30#include <linux/vmalloc.h>
31#include <linux/dma-buf.h>
32#include <asm/cacheflush.h>
33
34#include "gk20a.h"
35#include "mm_gk20a.h"
36#include "hw_gmmu_gk20a.h"
37#include "hw_fb_gk20a.h"
38#include "hw_bus_gk20a.h"
39#include "hw_ram_gk20a.h"
40#include "hw_mc_gk20a.h"
41#include "hw_flush_gk20a.h"
42#include "hw_ltc_gk20a.h"
43
44#include "kind_gk20a.h"
45
46#ifdef CONFIG_ARM64
47#define outer_flush_range(a, b)
48#define __cpuc_flush_dcache_area __flush_dcache_area
49#endif
50
51/*
52 * GPU mapping life cycle
53 * ======================
54 *
55 * Kernel mappings
56 * ---------------
57 *
58 * Kernel mappings are created through vm.map(..., false):
59 *
60 * - Mappings to the same allocations are reused and refcounted.
61 * - This path does not support deferred unmapping (i.e. kernel must wait for
62 * all hw operations on the buffer to complete before unmapping).
63 * - References to dmabuf are owned and managed by the (kernel) clients of
64 * the gk20a_vm layer.
65 *
66 *
67 * User space mappings
68 * -------------------
69 *
70 * User space mappings are created through as.map_buffer -> vm.map(..., true):
71 *
72 * - Mappings to the same allocations are reused and refcounted.
73 * - This path supports deferred unmapping (i.e. we delay the actual unmapping
74 * until all hw operations have completed).
75 * - References to dmabuf are owned and managed by the vm_gk20a
76 * layer itself. vm.map acquires these refs, and sets
77 * mapped_buffer->own_mem_ref to record that we must release the refs when we
78 * actually unmap.
79 *
80 */
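/*
 * Minimal usage sketch of the kernel path described above (illustration
 * only, not part of this file; the 0 flag/kind values are assumptions for
 * a plain pitch mapping, and error handling is omitted):
 *
 *	struct sg_table *sgt;
 *	u64 gpu_va;
 *
 *	gpu_va = gk20a_vm_map(vm, dmabuf,
 *			      0,      (offset_align: let the vm pick the va)
 *			      0,      (flags: no NVHOST_AS_MAP_BUFFER_FLAGS_*)
 *			      0,      (kind)
 *			      &sgt,
 *			      false,  (kernel mapping, i.e. not user_mapped)
 *			      gk20a_mem_flag_none);
 *
 *	... wait for all hw work on the buffer (no deferred unmap here) ...
 *
 *	gk20a_vm_unmap(vm, gpu_va);
 */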
81
82static inline int vm_aspace_id(struct vm_gk20a *vm)
83{
84 /* -1 is bar1 or pmu, etc. */
85 return vm->as_share ? vm->as_share->id : -1;
86}
87static inline u32 hi32(u64 f)
88{
89 return (u32)(f >> 32);
90}
91static inline u32 lo32(u64 f)
92{
93 return (u32)(f & 0xffffffff);
94}
95
96#define FLUSH_CPU_DCACHE(va, pa, size) \
97 do { \
98 __cpuc_flush_dcache_area((void *)(va), (size_t)(size)); \
99 outer_flush_range(pa, pa + (size_t)(size)); \
100 } while (0)
101
102static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
103static struct mapped_buffer_node *find_mapped_buffer_locked(
104 struct rb_root *root, u64 addr);
105static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
106 struct rb_root *root, struct dma_buf *dmabuf,
107 u32 kind);
108static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
109 enum gmmu_pgsz_gk20a pgsz_idx,
110 struct sg_table *sgt,
111 u64 first_vaddr, u64 last_vaddr,
112 u8 kind_v, u32 ctag_offset, bool cacheable,
113 int rw_flag);
114static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
115static void gk20a_vm_remove_support(struct vm_gk20a *vm);
116
117
118/* note: keep the page sizes sorted lowest to highest here */
119static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
120static const u32 gmmu_page_shifts[gmmu_nr_page_sizes] = { 12, 17 };
121static const u64 gmmu_page_offset_masks[gmmu_nr_page_sizes] = { 0xfffLL,
122 0x1ffffLL };
123static const u64 gmmu_page_masks[gmmu_nr_page_sizes] = { ~0xfffLL, ~0x1ffffLL };
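/*
 * e.g. for the big-page entry: 1 << 17 = SZ_128K, offset mask
 * (1 << 17) - 1 = 0x1ffff, page mask = ~0x1ffff; the 4K entry follows the
 * same shift/mask relationship with shift 12.
 */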
124
125struct gk20a_comptags {
126 u32 offset;
127 u32 lines;
128};
129
130struct gk20a_dmabuf_priv {
131 struct mutex lock;
132
133 struct gk20a_allocator *comptag_allocator;
134 struct gk20a_comptags comptags;
135
136 struct dma_buf_attachment *attach;
137 struct sg_table *sgt;
138
139 int pin_count;
140};
141
142static void gk20a_mm_delete_priv(void *_priv)
143{
144 struct gk20a_dmabuf_priv *priv = _priv;
145 if (!priv)
146 return;
147
148 if (priv->comptags.lines) {
149 BUG_ON(!priv->comptag_allocator);
150 priv->comptag_allocator->free(priv->comptag_allocator,
151 priv->comptags.offset,
152 priv->comptags.lines);
153 }
154
155 kfree(priv);
156}
157
158struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
159{
160 struct gk20a_dmabuf_priv *priv;
161
162 priv = dma_buf_get_drvdata(dmabuf, dev);
163 if (WARN_ON(!priv))
164 return ERR_PTR(-EINVAL);
165
166 mutex_lock(&priv->lock);
167
168 if (priv->pin_count == 0) {
169 priv->attach = dma_buf_attach(dmabuf, dev);
170 if (IS_ERR(priv->attach)) {
171 mutex_unlock(&priv->lock);
172 return (struct sg_table *)priv->attach;
173 }
174
175 priv->sgt = dma_buf_map_attachment(priv->attach,
176 DMA_BIDIRECTIONAL);
177 if (IS_ERR(priv->sgt)) {
178 dma_buf_detach(dmabuf, priv->attach);
179 mutex_unlock(&priv->lock);
180 return priv->sgt;
181 }
182 }
183
184 priv->pin_count++;
185 mutex_unlock(&priv->lock);
186 return priv->sgt;
187}
188
189void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
190 struct sg_table *sgt)
191{
192 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
193 dma_addr_t dma_addr;
194
195 if (IS_ERR(priv) || !priv)
196 return;
197
198 mutex_lock(&priv->lock);
199 WARN_ON(priv->sgt != sgt);
200 priv->pin_count--;
201 WARN_ON(priv->pin_count < 0);
202 dma_addr = sg_dma_address(priv->sgt->sgl);
203 if (priv->pin_count == 0) {
204 dma_buf_unmap_attachment(priv->attach, priv->sgt,
205 DMA_BIDIRECTIONAL);
206 dma_buf_detach(dmabuf, priv->attach);
207 }
208 mutex_unlock(&priv->lock);
209}
210
211
212static void gk20a_get_comptags(struct device *dev,
213 struct dma_buf *dmabuf,
214 struct gk20a_comptags *comptags)
215{
216 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
217
218 if (!comptags)
219 return;
220
221 if (!priv) {
222 comptags->lines = 0;
223 comptags->offset = 0;
224 return;
225 }
226
227 *comptags = priv->comptags;
228}
229
230static int gk20a_alloc_comptags(struct device *dev,
231 struct dma_buf *dmabuf,
232 struct gk20a_allocator *allocator,
233 int lines)
234{
235 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
236 u32 offset = 0;
237 int err;
238
239 if (!priv)
240 return -ENOSYS;
241
242 if (!lines)
243 return -EINVAL;
244
245 /* store the allocator so we can use it when we free the ctags */
246 priv->comptag_allocator = allocator;
247 err = allocator->alloc(allocator, &offset, lines);
248 if (!err) {
249 priv->comptags.lines = lines;
250 priv->comptags.offset = offset;
251 }
252 return err;
253}
254
255
256
257
258static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
259{
260 gk20a_dbg_fn("");
261 if (g->ops.fb.reset)
262 g->ops.fb.reset(g);
263
264 if (g->ops.fb.init_fs_state)
265 g->ops.fb.init_fs_state(g);
266
267 return 0;
268}
269
270void gk20a_remove_mm_support(struct mm_gk20a *mm)
271{
272 struct gk20a *g = mm->g;
273 struct device *d = dev_from_gk20a(g);
274 struct vm_gk20a *vm = &mm->bar1.vm;
275 struct inst_desc *inst_block = &mm->bar1.inst_block;
276
277 gk20a_dbg_fn("");
278
279 if (inst_block->cpuva)
280 dma_free_coherent(d, inst_block->size,
281 inst_block->cpuva, inst_block->iova);
282 inst_block->cpuva = NULL;
283 inst_block->iova = 0;
284
285 gk20a_vm_remove_support(vm);
286}
287
288int gk20a_init_mm_setup_sw(struct gk20a *g)
289{
290 struct mm_gk20a *mm = &g->mm;
291 int i;
292
293 gk20a_dbg_fn("");
294
295 if (mm->sw_ready) {
296 gk20a_dbg_fn("skip init");
297 return 0;
298 }
299
300 mm->g = g;
301 mutex_init(&mm->tlb_lock);
302 mutex_init(&mm->l2_op_lock);
303 mm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
304 mm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big];
305 mm->pde_stride = mm->big_page_size << 10;
306 mm->pde_stride_shift = ilog2(mm->pde_stride);
307 BUG_ON(mm->pde_stride_shift > 31); /* we have assumptions about this */
308
309 for (i = 0; i < ARRAY_SIZE(gmmu_page_sizes); i++) {
310
311 u32 num_ptes, pte_space, num_pages;
312
313 /* assuming "full" page tables */
314 num_ptes = mm->pde_stride / gmmu_page_sizes[i];
315
316 pte_space = num_ptes * gmmu_pte__size_v();
317 /* allocate whole pages */
318 pte_space = roundup(pte_space, PAGE_SIZE);
319
320 num_pages = pte_space / PAGE_SIZE;
321 /* make sure "order" is viable */
322 BUG_ON(!is_power_of_2(num_pages));
323
324 mm->page_table_sizing[i].num_ptes = num_ptes;
325 mm->page_table_sizing[i].order = ilog2(num_pages);
326 }
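	/*
	 * Worked example of the sizing above (taking the 8-byte pte noted in
	 * pte_space_page_offset_from_index() below): with a 128KB big page,
	 * pde_stride = 128KB << 10 = 128MB, so
	 *   4KB ptes:   num_ptes = 128MB/4KB   = 32768 -> 256KB of ptes
	 *                                        = 64 pages -> order 6
	 *   128KB ptes: num_ptes = 128MB/128KB = 1024  -> 8KB of ptes
	 *                                        = 2 pages  -> order 1
	 */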
327
328 /*TBD: make channel vm size configurable */
329 mm->channel.size = 1ULL << NV_GMMU_VA_RANGE;
330
331 gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20));
332
333 gk20a_dbg_info("small page-size (%dKB) pte array: %dKB",
334 gmmu_page_sizes[gmmu_page_size_small] >> 10,
335 (mm->page_table_sizing[gmmu_page_size_small].num_ptes *
336 gmmu_pte__size_v()) >> 10);
337
338 gk20a_dbg_info("big page-size (%dKB) pte array: %dKB",
339 gmmu_page_sizes[gmmu_page_size_big] >> 10,
340 (mm->page_table_sizing[gmmu_page_size_big].num_ptes *
341 gmmu_pte__size_v()) >> 10);
342
343
344 gk20a_init_bar1_vm(mm);
345
346 mm->remove_support = gk20a_remove_mm_support;
347 mm->sw_ready = true;
348
349 gk20a_dbg_fn("done");
350 return 0;
351}
352
353/* make sure gk20a_init_mm_support is called before */
354static int gk20a_init_mm_setup_hw(struct gk20a *g)
355{
356 struct mm_gk20a *mm = &g->mm;
357 struct inst_desc *inst_block = &mm->bar1.inst_block;
358 phys_addr_t inst_pa = inst_block->cpu_pa;
359
360 gk20a_dbg_fn("");
361
362 /* set large page size in fb
363 * note this is very early on, can we defer it ? */
364 {
365 u32 fb_mmu_ctrl = gk20a_readl(g, fb_mmu_ctrl_r());
366
367 if (gmmu_page_sizes[gmmu_page_size_big] == SZ_128K)
368 fb_mmu_ctrl = (fb_mmu_ctrl &
369 ~fb_mmu_ctrl_vm_pg_size_f(~0x0)) |
370 fb_mmu_ctrl_vm_pg_size_128kb_f();
371 else
372 BUG_ON(1); /* no support/testing for larger ones yet */
373
374 gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
375 }
376
377 inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
378 gk20a_dbg_info("bar1 inst block ptr: 0x%08x", (u32)inst_pa);
379
380 /* this is very early in init... can we defer this? */
381 {
382 gk20a_writel(g, bus_bar1_block_r(),
383 bus_bar1_block_target_vid_mem_f() |
384 bus_bar1_block_mode_virtual_f() |
385 bus_bar1_block_ptr_f(inst_pa));
386 }
387
388 gk20a_dbg_fn("done");
389 return 0;
390}
391
392int gk20a_init_mm_support(struct gk20a *g)
393{
394 u32 err;
395
396 err = gk20a_init_mm_reset_enable_hw(g);
397 if (err)
398 return err;
399
400 err = gk20a_init_mm_setup_sw(g);
401 if (err)
402 return err;
403
404 err = gk20a_init_mm_setup_hw(g);
405 if (err)
406 return err;
407
408 return err;
409}
410
411#ifdef CONFIG_GK20A_PHYS_PAGE_TABLES
412static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
413 void **handle,
414 struct sg_table **sgt,
415 size_t *size)
416{
417 u32 num_pages = 1 << order;
418 u32 len = num_pages * PAGE_SIZE;
419 int err;
420 struct page *pages;
421
422 gk20a_dbg_fn("");
423
424 pages = alloc_pages(GFP_KERNEL, order);
425 if (!pages) {
426 gk20a_dbg(gpu_dbg_pte, "alloc_pages failed\n");
427 goto err_out;
428 }
429 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
430 if (!(*sgt)) {
431 gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
432 goto err_alloced;
433 }
434 err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
435 if (err) {
436 gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed\n");
437 goto err_sg_table;
438 }
439 sg_set_page((*sgt)->sgl, pages, len, 0);
440 *handle = page_address(pages);
441 memset(*handle, 0, len);
442 *size = len;
443 FLUSH_CPU_DCACHE(*handle, sg_phys((*sgt)->sgl), len);
444
445 return 0;
446
447err_sg_table:
448 kfree(*sgt);
449err_alloced:
450 __free_pages(pages, order);
451err_out:
452 return -ENOMEM;
453}
454
455static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
456 struct sg_table *sgt, u32 order,
457 size_t size)
458{
459 gk20a_dbg_fn("");
460 BUG_ON(sgt == NULL);
461 free_pages((unsigned long)handle, order);
462 sg_free_table(sgt);
463 kfree(sgt);
464}
465
466static int map_gmmu_pages(void *handle, struct sg_table *sgt,
467 void **va, size_t size)
468{
469 FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
470 *va = handle;
471 return 0;
472}
473
474static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
475{
476 FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
477}
478#else
479static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
480 void **handle,
481 struct sg_table **sgt,
482 size_t *size)
483{
484 struct device *d = dev_from_vm(vm);
485 u32 num_pages = 1 << order;
486 u32 len = num_pages * PAGE_SIZE;
487 dma_addr_t iova;
488 DEFINE_DMA_ATTRS(attrs);
489 struct page **pages;
490 int err = 0;
491
492 gk20a_dbg_fn("");
493
494 *size = len;
495 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
496 pages = dma_alloc_attrs(d, len, &iova, GFP_KERNEL, &attrs);
497 if (!pages) {
498 gk20a_err(d, "memory allocation failed\n");
499 goto err_out;
500 }
501
502 err = gk20a_get_sgtable_from_pages(d, sgt, pages,
503 iova, len);
504 if (err) {
505 gk20a_err(d, "sgt allocation failed\n");
506 goto err_free;
507 }
508
509 *handle = (void *)pages;
510
511 return 0;
512
513err_free:
514 dma_free_attrs(d, len, pages, iova, &attrs);
515 pages = NULL;
516 iova = 0;
517err_out:
518 return -ENOMEM;
519}
520
521static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
522 struct sg_table *sgt, u32 order,
523 size_t size)
524{
525 struct device *d = dev_from_vm(vm);
526 u64 iova;
527 DEFINE_DMA_ATTRS(attrs);
528 struct page **pages = (struct page **)handle;
529
530 gk20a_dbg_fn("");
531 BUG_ON(sgt == NULL);
532
533 iova = sg_dma_address(sgt->sgl);
534
535 gk20a_free_sgtable(&sgt);
536
537 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
538 dma_free_attrs(d, size, pages, iova, &attrs);
539 pages = NULL;
540 iova = 0;
541}
542
543static int map_gmmu_pages(void *handle, struct sg_table *sgt,
544 void **kva, size_t size)
545{
546 int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
547 struct page **pages = (struct page **)handle;
548 gk20a_dbg_fn("");
549
550 *kva = vmap(pages, count, 0, pgprot_dmacoherent(PAGE_KERNEL));
551 if (!(*kva))
552 return -ENOMEM;
553
554 return 0;
555}
556
557static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
558{
559 gk20a_dbg_fn("");
560 vunmap(va);
561}
562#endif
563
564/* allocate a phys contig region big enough for a full
565 * sized gmmu page table for the given gmmu_page_size.
566 * the whole range is zeroed so it's "invalid"/will fault
567 */
568
569static int zalloc_gmmu_page_table_gk20a(struct vm_gk20a *vm,
570 enum gmmu_pgsz_gk20a gmmu_pgsz_idx,
571 struct page_table_gk20a *pte)
572{
573 int err;
574 u32 pte_order;
575 void *handle = NULL;
576 struct sg_table *sgt;
577 size_t size;
578
579 gk20a_dbg_fn("");
580
581 /* allocate enough pages for the table */
582 pte_order = vm->mm->page_table_sizing[gmmu_pgsz_idx].order;
583
584 err = alloc_gmmu_pages(vm, pte_order, &handle, &sgt, &size);
585 if (err)
586 return err;
587
588 gk20a_dbg(gpu_dbg_pte, "pte = 0x%p, addr=%08llx, size %d",
589 pte, gk20a_mm_iova_addr(sgt->sgl), pte_order);
590
591 pte->ref = handle;
592 pte->sgt = sgt;
593 pte->size = size;
594
595 return 0;
596}
597
598/* given address range (inclusive) determine the pdes crossed */
599static inline void pde_range_from_vaddr_range(struct vm_gk20a *vm,
600 u64 addr_lo, u64 addr_hi,
601 u32 *pde_lo, u32 *pde_hi)
602{
603 *pde_lo = (u32)(addr_lo >> vm->mm->pde_stride_shift);
604 *pde_hi = (u32)(addr_hi >> vm->mm->pde_stride_shift);
605 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
606 addr_lo, addr_hi, vm->mm->pde_stride_shift);
607 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
608 *pde_lo, *pde_hi);
609}
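/*
 * e.g. with the 128MB pde stride configured in gk20a_init_mm_setup_sw()
 * (pde_stride_shift = 27): addr_lo = 0x0 and addr_hi = 0x0fffffff
 * (256MB - 1) give pde_lo = 0 and pde_hi = 1, i.e. the range crosses
 * two pdes.
 */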
610
611static inline u32 *pde_from_index(struct vm_gk20a *vm, u32 i)
612{
613 return (u32 *) (((u8 *)vm->pdes.kv) + i*gmmu_pde__size_v());
614}
615
616static inline u32 pte_index_from_vaddr(struct vm_gk20a *vm,
617 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
618{
619 u32 ret;
620 /* mask off pde part */
621 addr = addr & ((((u64)1) << vm->mm->pde_stride_shift) - ((u64)1));
622 /* shift over to get pte index. note assumption that pte index
623 * doesn't leak over into the high 32b */
624 ret = (u32)(addr >> gmmu_page_shifts[pgsz_idx]);
625
626 gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
627 return ret;
628}
629
630static inline void pte_space_page_offset_from_index(u32 i, u32 *pte_page,
631 u32 *pte_offset)
632{
633 /* ptes are 8B regardless of pagesize */
634 /* pte space pages are 4KB. so 512 ptes per 4KB page*/
635 *pte_page = i >> 9;
636
637 /* this offset is a pte offset, not a byte offset */
638 *pte_offset = i & ((1<<9)-1);
639
640 gk20a_dbg(gpu_dbg_pte, "i=0x%x pte_page=0x%x pte_offset=0x%x",
641 i, *pte_page, *pte_offset);
642}
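/*
 * e.g. pte index i = 1000: pte_page = 1000 >> 9 = 1,
 * pte_offset = 1000 & 511 = 488.
 */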
643
644
645/*
646 * given a pde index/page table number, make sure it has
647 * backing store and, if not, allocate it and
648 * record it in the appropriate pde
649 */
650static int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
651 u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
652{
653 int err;
654 struct page_table_gk20a *pte =
655 vm->pdes.ptes[gmmu_pgsz_idx] + i;
656
657 gk20a_dbg_fn("");
658
659 /* if it's already in place it's valid */
660 if (pte->ref)
661 return 0;
662
663 gk20a_dbg(gpu_dbg_pte, "alloc %dKB ptes for pde %d",
664 gmmu_page_sizes[gmmu_pgsz_idx]/1024, i);
665
666 err = zalloc_gmmu_page_table_gk20a(vm, gmmu_pgsz_idx, pte);
667 if (err)
668 return err;
669
670 /* rewrite pde */
671 update_gmmu_pde_locked(vm, i);
672
673 return 0;
674}
675
676static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
677 u64 addr)
678{
679 struct vm_reserved_va_node *va_node;
680 list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
681 if (addr >= va_node->vaddr_start &&
682 addr < (u64)va_node->vaddr_start + (u64)va_node->size)
683 return va_node;
684
685 return NULL;
686}
687
688int gk20a_vm_get_buffers(struct vm_gk20a *vm,
689 struct mapped_buffer_node ***mapped_buffers,
690 int *num_buffers)
691{
692 struct mapped_buffer_node *mapped_buffer;
693 struct mapped_buffer_node **buffer_list;
694 struct rb_node *node;
695 int i = 0;
696
697 mutex_lock(&vm->update_gmmu_lock);
698
699 buffer_list = kzalloc(sizeof(*buffer_list) *
700 vm->num_user_mapped_buffers, GFP_KERNEL);
701 if (!buffer_list) {
702 mutex_unlock(&vm->update_gmmu_lock);
703 return -ENOMEM;
704 }
705
706 node = rb_first(&vm->mapped_buffers);
707 while (node) {
708 mapped_buffer =
709 container_of(node, struct mapped_buffer_node, node);
710 if (mapped_buffer->user_mapped) {
711 buffer_list[i] = mapped_buffer;
712 kref_get(&mapped_buffer->ref);
713 i++;
714 }
715 node = rb_next(&mapped_buffer->node);
716 }
717
718 BUG_ON(i != vm->num_user_mapped_buffers);
719
720 *num_buffers = vm->num_user_mapped_buffers;
721 *mapped_buffers = buffer_list;
722
723 mutex_unlock(&vm->update_gmmu_lock);
724
725 return 0;
726}
727
728static void gk20a_vm_unmap_locked_kref(struct kref *ref)
729{
730 struct mapped_buffer_node *mapped_buffer =
731 container_of(ref, struct mapped_buffer_node, ref);
732 gk20a_vm_unmap_locked(mapped_buffer);
733}
734
735void gk20a_vm_put_buffers(struct vm_gk20a *vm,
736 struct mapped_buffer_node **mapped_buffers,
737 int num_buffers)
738{
739 int i;
740
741 mutex_lock(&vm->update_gmmu_lock);
742
743 for (i = 0; i < num_buffers; ++i)
744 kref_put(&mapped_buffers[i]->ref,
745 gk20a_vm_unmap_locked_kref);
746
747 mutex_unlock(&vm->update_gmmu_lock);
748
749 kfree(mapped_buffers);
750}
751
752static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
753{
754 struct device *d = dev_from_vm(vm);
755 int retries;
756 struct mapped_buffer_node *mapped_buffer;
757
758 mutex_lock(&vm->update_gmmu_lock);
759
760 mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
761 if (!mapped_buffer) {
762 mutex_unlock(&vm->update_gmmu_lock);
763 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
764 return;
765 }
766
767 if (mapped_buffer->flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
768 mutex_unlock(&vm->update_gmmu_lock);
769
770 retries = 1000;
771 while (retries) {
772 if (atomic_read(&mapped_buffer->ref.refcount) == 1)
773 break;
774 retries--;
775 udelay(50);
776 }
777 if (!retries)
778 gk20a_err(d, "sync-unmap failed on 0x%llx",
779 offset);
780 mutex_lock(&vm->update_gmmu_lock);
781 }
782
783 mapped_buffer->user_mapped--;
784 if (mapped_buffer->user_mapped == 0)
785 vm->num_user_mapped_buffers--;
786 kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
787
788 mutex_unlock(&vm->update_gmmu_lock);
789}
790
791static u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
792 u64 size,
793 enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
794
795{
796 struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
797 int err;
798 u64 offset;
799 u32 start_page_nr = 0, num_pages;
800 u64 gmmu_page_size = gmmu_page_sizes[gmmu_pgsz_idx];
801
802 if (gmmu_pgsz_idx >= ARRAY_SIZE(gmmu_page_sizes)) {
803 dev_warn(dev_from_vm(vm),
804 "invalid page size requested in gk20a vm alloc");
805 return -EINVAL;
806 }
807
808 if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
809 dev_warn(dev_from_vm(vm),
810 "unsupported page size requested");
811 return -EINVAL;
812
813 }
814
815 /* be certain we round up to gmmu_page_size if needed */
816 /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
817 size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
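	/* e.g. size = 0x12345 rounds up to 0x13000 with 4KB pages and to
	 * 0x20000 with 128KB pages. */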
818
819 gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
820 gmmu_page_sizes[gmmu_pgsz_idx]>>10);
821
822 /* The vma allocator represents page accounting. */
823 num_pages = size >> gmmu_page_shifts[gmmu_pgsz_idx];
824
825 err = vma->alloc(vma, &start_page_nr, num_pages);
826
827 if (err) {
828 gk20a_err(dev_from_vm(vm),
829 "%s oom: sz=0x%llx", vma->name, size);
830 return 0;
831 }
832
833 offset = (u64)start_page_nr << gmmu_page_shifts[gmmu_pgsz_idx];
834 gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
835
836 return offset;
837}
838
839static int gk20a_vm_free_va(struct vm_gk20a *vm,
840 u64 offset, u64 size,
841 enum gmmu_pgsz_gk20a pgsz_idx)
842{
843 struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
844 u32 page_size = gmmu_page_sizes[pgsz_idx];
845 u32 page_shift = gmmu_page_shifts[pgsz_idx];
846 u32 start_page_nr, num_pages;
847 int err;
848
849 gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
850 vma->name, offset, size);
851
852 start_page_nr = (u32)(offset >> page_shift);
853 num_pages = (u32)((size + page_size - 1) >> page_shift);
854
855 err = vma->free(vma, start_page_nr, num_pages);
856 if (err) {
857 gk20a_err(dev_from_vm(vm),
858 "not found: offset=0x%llx, sz=0x%llx",
859 offset, size);
860 }
861
862 return err;
863}
864
865static int insert_mapped_buffer(struct rb_root *root,
866 struct mapped_buffer_node *mapped_buffer)
867{
868 struct rb_node **new_node = &(root->rb_node), *parent = NULL;
869
870 /* Figure out where to put new node */
871 while (*new_node) {
872 struct mapped_buffer_node *cmp_with =
873 container_of(*new_node, struct mapped_buffer_node,
874 node);
875
876 parent = *new_node;
877
878 if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
879 new_node = &((*new_node)->rb_left);
880 else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
881 new_node = &((*new_node)->rb_right);
882 else
883 return -EINVAL; /* no fair dup'ing */
884 }
885
886 /* Add new node and rebalance tree. */
887 rb_link_node(&mapped_buffer->node, parent, new_node);
888 rb_insert_color(&mapped_buffer->node, root);
889
890 return 0;
891}
892
893static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
894 struct rb_root *root, struct dma_buf *dmabuf,
895 u32 kind)
896{
897 struct rb_node *node = rb_first(root);
898 while (node) {
899 struct mapped_buffer_node *mapped_buffer =
900 container_of(node, struct mapped_buffer_node, node);
901 if (mapped_buffer->dmabuf == dmabuf &&
902 kind == mapped_buffer->kind)
903 return mapped_buffer;
904 node = rb_next(&mapped_buffer->node);
905 }
906 return 0;
907}
908
909static struct mapped_buffer_node *find_mapped_buffer_locked(
910 struct rb_root *root, u64 addr)
911{
912
913 struct rb_node *node = root->rb_node;
914 while (node) {
915 struct mapped_buffer_node *mapped_buffer =
916 container_of(node, struct mapped_buffer_node, node);
917 if (mapped_buffer->addr > addr) /* u64 cmp */
918 node = node->rb_left;
919 else if (mapped_buffer->addr != addr) /* u64 cmp */
920 node = node->rb_right;
921 else
922 return mapped_buffer;
923 }
924 return 0;
925}
926
927static struct mapped_buffer_node *find_mapped_buffer_range_locked(
928 struct rb_root *root, u64 addr)
929{
930 struct rb_node *node = root->rb_node;
931 while (node) {
932 struct mapped_buffer_node *m =
933 container_of(node, struct mapped_buffer_node, node);
934 if (m->addr <= addr && m->addr + m->size > addr)
935 return m;
936 else if (m->addr > addr) /* u64 cmp */
937 node = node->rb_left;
938 else
939 node = node->rb_right;
940 }
941 return 0;
942}
943
944#define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
945
946struct buffer_attrs {
947 struct sg_table *sgt;
948 u64 size;
949 u64 align;
950 u32 ctag_offset;
951 u32 ctag_lines;
952 int pgsz_idx;
953 u8 kind_v;
954 u8 uc_kind_v;
955};
956
957static void gmmu_select_page_size(struct buffer_attrs *bfr)
958{
959 int i;
960 /* choose the biggest first (top->bottom) */
961 for (i = (gmmu_nr_page_sizes-1); i >= 0; i--)
962 if (!(gmmu_page_offset_masks[i] & bfr->align)) {
963 /* would like to add this too but nvmap returns the
964 * original requested size not the allocated size.
965 * (!(gmmu_page_offset_masks[i] & bfr->size)) */
966 bfr->pgsz_idx = i;
967 break;
968 }
969}
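/*
 * e.g. a buffer whose IOVA is 128KB-aligned has bfr->align >= 0x20000, so
 * (0x1ffff & align) == 0 and the big (128KB) size is picked on the first
 * iteration; a buffer that is only 4KB-aligned (align == 0x1000) intersects
 * 0x1ffff and falls through to the 4KB entry.
 */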
970
971static int setup_buffer_kind_and_compression(struct device *d,
972 u32 flags,
973 struct buffer_attrs *bfr,
974 enum gmmu_pgsz_gk20a pgsz_idx)
975{
976 bool kind_compressible;
977
978 if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
979 bfr->kind_v = gmmu_pte_kind_pitch_v();
980
981 if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
982 gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
983 return -EINVAL;
984 }
985
986 bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
987 /* find a suitable uncompressed kind if it becomes necessary later */
988 kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
989 if (kind_compressible) {
990 bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
991 if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
992 /* shouldn't happen, but it is worth cross-checking */
993 gk20a_err(d, "comptag kind 0x%x can't be"
994 " downgraded to uncompressed kind",
995 bfr->kind_v);
996 return -EINVAL;
997 }
998 }
999 /* comptags only supported for suitable kinds, 128KB pagesize */
1000 if (unlikely(kind_compressible &&
1001 (gmmu_page_sizes[pgsz_idx] != 128*1024))) {
1002 /*
1003 gk20a_warn(d, "comptags specified"
1004 " but pagesize being used doesn't support it");*/
1005 /* it is safe to fall back to uncompressed as
1006 functionality is not harmed */
1007 bfr->kind_v = bfr->uc_kind_v;
1008 kind_compressible = false;
1009 }
1010 if (kind_compressible)
1011 bfr->ctag_lines = ALIGN(bfr->size, COMP_TAG_LINE_SIZE) >>
1012 COMP_TAG_LINE_SIZE_SHIFT;
1013 else
1014 bfr->ctag_lines = 0;
1015
1016 return 0;
1017}
1018
1019static int validate_fixed_buffer(struct vm_gk20a *vm,
1020 struct buffer_attrs *bfr,
1021 u64 map_offset)
1022{
1023 struct device *dev = dev_from_vm(vm);
1024 struct vm_reserved_va_node *va_node;
1025 struct mapped_buffer_node *buffer;
1026
1027 if (map_offset & gmmu_page_offset_masks[bfr->pgsz_idx]) {
1028 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
1029 map_offset);
1030 return -EINVAL;
1031 }
1032
1033 /* find the space reservation */
1034 va_node = addr_to_reservation(vm, map_offset);
1035 if (!va_node) {
1036 gk20a_warn(dev, "fixed offset mapping without space allocation");
1037 return -EINVAL;
1038 }
1039
1040 /* check that this mapping does not collide with existing
1041 * mappings by checking the overlapping area between the current
1042 * buffer and all other mapped buffers */
1043
1044 list_for_each_entry(buffer,
1045 &va_node->va_buffers_list, va_buffers_list) {
1046 s64 begin = max(buffer->addr, map_offset);
1047 s64 end = min(buffer->addr +
1048 buffer->size, map_offset + bfr->size);
1049 if (end - begin > 0) {
1050 gk20a_warn(dev, "overlapping buffer map requested");
1051 return -EINVAL;
1052 }
1053 }
1054
1055 return 0;
1056}
1057
1058static u64 __locked_gmmu_map(struct vm_gk20a *vm,
1059 u64 map_offset,
1060 struct sg_table *sgt,
1061 u64 size,
1062 int pgsz_idx,
1063 u8 kind_v,
1064 u32 ctag_offset,
1065 u32 flags,
1066 int rw_flag)
1067{
1068 int err = 0, i = 0;
1069 u32 pde_lo, pde_hi;
1070 struct device *d = dev_from_vm(vm);
1071
1072 /* Allocate (or validate when map_offset != 0) the virtual address. */
1073 if (!map_offset) {
1074 map_offset = gk20a_vm_alloc_va(vm, size,
1075 pgsz_idx);
1076 if (!map_offset) {
1077 gk20a_err(d, "failed to allocate va space");
1078 err = -ENOMEM;
1079 goto fail;
1080 }
1081 }
1082
1083 pde_range_from_vaddr_range(vm,
1084 map_offset,
1085 map_offset + size - 1,
1086 &pde_lo, &pde_hi);
1087
1088 /* mark the addr range valid (but with 0 phys addr, which will fault) */
1089 for (i = pde_lo; i <= pde_hi; i++) {
1090 err = validate_gmmu_page_table_gk20a_locked(vm, i,
1091 pgsz_idx);
1092 if (err) {
1093 gk20a_err(d, "failed to validate page table %d: %d",
1094 i, err);
1095 goto fail;
1096 }
1097 }
1098
1099 err = update_gmmu_ptes_locked(vm, pgsz_idx,
1100 sgt,
1101 map_offset, map_offset + size - 1,
1102 kind_v,
1103 ctag_offset,
1104 flags &
1105 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1106 rw_flag);
1107 if (err) {
1108 gk20a_err(d, "failed to update ptes on map");
1109 goto fail;
1110 }
1111
1112 return map_offset;
1113 fail:
1114 gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
1115 return 0;
1116}
1117
1118static void __locked_gmmu_unmap(struct vm_gk20a *vm,
1119 u64 vaddr,
1120 u64 size,
1121 int pgsz_idx,
1122 bool va_allocated,
1123 int rw_flag)
1124{
1125 int err = 0;
1126 struct gk20a *g = gk20a_from_vm(vm);
1127
1128 if (va_allocated) {
1129 err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
1130 if (err) {
1131 dev_err(dev_from_vm(vm),
1132 "failed to free va");
1133 return;
1134 }
1135 }
1136
1137 /* unmap here needs to know the page size we assigned at mapping */
1138 err = update_gmmu_ptes_locked(vm,
1139 pgsz_idx,
1140 0, /* n/a for unmap */
1141 vaddr,
1142 vaddr + size - 1,
1143 0, 0, false /* n/a for unmap */,
1144 rw_flag);
1145 if (err)
1146 dev_err(dev_from_vm(vm),
1147 "failed to update gmmu ptes on unmap");
1148
1149 /* detect which if any pdes/ptes can now be released */
1150
1151 /* flush l2 so any dirty lines are written out *now*.
1152 * also as we could potentially be switching this buffer
1153 * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
1154 * some point in the future we need to invalidate l2. e.g. switching
1155 * from a render buffer unmap (here) to later using the same memory
1156 * for gmmu ptes. note the positioning of this relative to any smmu
1157 * unmapping (below). */
1158
1159 gk20a_mm_l2_flush(g, true);
1160}
1161
1162static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
1163 struct dma_buf *dmabuf,
1164 u64 offset_align,
1165 u32 flags,
1166 int kind,
1167 struct sg_table **sgt,
1168 bool user_mapped,
1169 int rw_flag)
1170{
1171 struct mapped_buffer_node *mapped_buffer = 0;
1172
1173 mapped_buffer =
1174 find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
1175 dmabuf, kind);
1176 if (!mapped_buffer)
1177 return 0;
1178
1179 if (mapped_buffer->flags != flags)
1180 return 0;
1181
1182 if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
1183 mapped_buffer->addr != offset_align)
1184 return 0;
1185
1186 BUG_ON(mapped_buffer->vm != vm);
1187
1188 /* mark the buffer as used */
1189 if (user_mapped) {
1190 if (mapped_buffer->user_mapped == 0)
1191 vm->num_user_mapped_buffers++;
1192 mapped_buffer->user_mapped++;
1193
1194 /* If the mapping comes from user space, we own
1195 * the handle ref. Since we reuse an
1196 * existing mapping here, we need to give back those
1197 * refs once in order not to leak.
1198 */
1199 if (mapped_buffer->own_mem_ref)
1200 dma_buf_put(mapped_buffer->dmabuf);
1201 else
1202 mapped_buffer->own_mem_ref = true;
1203 }
1204 kref_get(&mapped_buffer->ref);
1205
1206 gk20a_dbg(gpu_dbg_map,
1207 "reusing as=%d pgsz=%d flags=0x%x ctags=%d "
1208 "start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x "
1209 "own_mem_ref=%d user_mapped=%d",
1210 vm_aspace_id(vm), mapped_buffer->pgsz_idx,
1211 mapped_buffer->flags,
1212 mapped_buffer->ctag_lines,
1213 mapped_buffer->ctag_offset,
1214 hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1215 hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1216 lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1217 hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1218 lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1219 mapped_buffer->own_mem_ref, user_mapped);
1220
1221 if (sgt)
1222 *sgt = mapped_buffer->sgt;
1223 return mapped_buffer->addr;
1224}
1225
1226u64 gk20a_vm_map(struct vm_gk20a *vm,
1227 struct dma_buf *dmabuf,
1228 u64 offset_align,
1229 u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
1230 int kind,
1231 struct sg_table **sgt,
1232 bool user_mapped,
1233 int rw_flag)
1234{
1235 struct gk20a *g = gk20a_from_vm(vm);
1236 struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
1237 struct device *d = dev_from_vm(vm);
1238 struct mapped_buffer_node *mapped_buffer = 0;
1239 bool inserted = false, va_allocated = false;
1240 u32 gmmu_page_size = 0;
1241 u64 map_offset = 0;
1242 int err = 0;
1243 struct buffer_attrs bfr = {0};
1244 struct gk20a_comptags comptags;
1245
1246 mutex_lock(&vm->update_gmmu_lock);
1247
1248 /* check if this buffer is already mapped */
1249 map_offset = gk20a_vm_map_duplicate_locked(vm, dmabuf, offset_align,
1250 flags, kind, sgt,
1251 user_mapped, rw_flag);
1252 if (map_offset) {
1253 mutex_unlock(&vm->update_gmmu_lock);
1254 return map_offset;
1255 }
1256
1257 /* pin buffer to get phys/iovmm addr */
1258 bfr.sgt = gk20a_mm_pin(d, dmabuf);
1259 if (IS_ERR(bfr.sgt)) {
1260 /* Falling back to physical is actually possible
1261 * here in many cases if we use 4K phys pages in the
1262 * gmmu. However we have some regions which require
1263 * contig regions to work properly (either phys-contig
1264 * or contig through smmu io_vaspace). Until we can
1265 * track the difference between those two cases we have
1266 * to fail the mapping when we run out of SMMU space.
1267 */
1268 gk20a_warn(d, "oom allocating tracking buffer");
1269 goto clean_up;
1270 }
1271
1272 if (sgt)
1273 *sgt = bfr.sgt;
1274
1275 bfr.kind_v = kind;
1276 bfr.size = dmabuf->size;
1277 bfr.align = 1 << __ffs((u64)sg_dma_address(bfr.sgt->sgl));
1278 bfr.pgsz_idx = -1;
1279
1280 /* If FIXED_OFFSET is set, pgsz is determined. Otherwise, select
1281 * page size according to memory alignment */
1282 if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1283 bfr.pgsz_idx = NV_GMMU_VA_IS_UPPER(offset_align) ?
1284 gmmu_page_size_big : gmmu_page_size_small;
1285 } else {
1286 gmmu_select_page_size(&bfr);
1287 }
1288
1289 /* validate/adjust bfr attributes */
1290 if (unlikely(bfr.pgsz_idx == -1)) {
1291 gk20a_err(d, "unsupported page size detected");
1292 goto clean_up;
1293 }
1294
1295 if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
1296 bfr.pgsz_idx > gmmu_page_size_big)) {
1297 BUG_ON(1);
1298 err = -EINVAL;
1299 goto clean_up;
1300 }
1301 gmmu_page_size = gmmu_page_sizes[bfr.pgsz_idx];
1302
1303 /* Check if we should use a fixed offset for mapping this buffer */
1304 if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1305 err = validate_fixed_buffer(vm, &bfr, offset_align);
1306 if (err)
1307 goto clean_up;
1308
1309 map_offset = offset_align;
1310 va_allocated = false;
1311 } else
1312 va_allocated = true;
1313
1314 if (sgt)
1315 *sgt = bfr.sgt;
1316
1317 err = setup_buffer_kind_and_compression(d, flags, &bfr, bfr.pgsz_idx);
1318 if (unlikely(err)) {
1319 gk20a_err(d, "failure setting up kind and compression");
1320 goto clean_up;
1321 }
1322
1323 /* bar1 and pmu vm don't need ctag */
1324 if (!vm->enable_ctag)
1325 bfr.ctag_lines = 0;
1326
1327 gk20a_get_comptags(d, dmabuf, &comptags);
1328
1329 if (bfr.ctag_lines && !comptags.lines) {
1330 /* allocate compression resources if needed */
1331 err = gk20a_alloc_comptags(d, dmabuf, ctag_allocator,
1332 bfr.ctag_lines);
1333 if (err) {
1334 /* ok to fall back here if we ran out */
1335 /* TBD: we can partially alloc ctags as well... */
1336 bfr.ctag_lines = bfr.ctag_offset = 0;
1337 bfr.kind_v = bfr.uc_kind_v;
1338 } else {
1339 gk20a_get_comptags(d, dmabuf, &comptags);
1340
1341 /* init/clear the ctag buffer */
1342 g->ops.ltc.clear_comptags(g,
1343 comptags.offset,
1344 comptags.offset + comptags.lines - 1);
1345 }
1346 }
1347
1348 /* store the comptag info */
1349 bfr.ctag_offset = comptags.offset;
1350
1351 /* update gmmu ptes */
1352 map_offset = __locked_gmmu_map(vm, map_offset,
1353 bfr.sgt,
1354 bfr.size,
1355 bfr.pgsz_idx,
1356 bfr.kind_v,
1357 bfr.ctag_offset,
1358 flags, rw_flag);
1359 if (!map_offset)
1360 goto clean_up;
1361
1362 gk20a_dbg(gpu_dbg_map,
1363 "as=%d pgsz=%d "
1364 "kind=0x%x kind_uc=0x%x flags=0x%x "
1365 "ctags=%d start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x",
1366 vm_aspace_id(vm), gmmu_page_size,
1367 bfr.kind_v, bfr.uc_kind_v, flags,
1368 bfr.ctag_lines, bfr.ctag_offset,
1369 hi32(map_offset), lo32(map_offset),
1370 hi32((u64)sg_dma_address(bfr.sgt->sgl)),
1371 lo32((u64)sg_dma_address(bfr.sgt->sgl)),
1372 hi32((u64)sg_phys(bfr.sgt->sgl)),
1373 lo32((u64)sg_phys(bfr.sgt->sgl)));
1374
1375#if defined(NVHOST_DEBUG)
1376 {
1377 int i;
1378 struct scatterlist *sg = NULL;
1379 gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
1380 for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i ) {
1381 u64 da = sg_dma_address(sg);
1382 u64 pa = sg_phys(sg);
1383 u64 len = sg->length;
1384 gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
1385 i, hi32(pa), lo32(pa), hi32(da), lo32(da),
1386 hi32(len), lo32(len));
1387 }
1388 }
1389#endif
1390
1391 /* keep track of the buffer for unmapping */
1392 /* TBD: check for multiple mapping of same buffer */
1393 mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
1394 if (!mapped_buffer) {
1395 gk20a_warn(d, "oom allocating tracking buffer");
1396 goto clean_up;
1397 }
1398 mapped_buffer->dmabuf = dmabuf;
1399 mapped_buffer->sgt = bfr.sgt;
1400 mapped_buffer->addr = map_offset;
1401 mapped_buffer->size = bfr.size;
1402 mapped_buffer->pgsz_idx = bfr.pgsz_idx;
1403 mapped_buffer->ctag_offset = bfr.ctag_offset;
1404 mapped_buffer->ctag_lines = bfr.ctag_lines;
1405 mapped_buffer->vm = vm;
1406 mapped_buffer->flags = flags;
1407 mapped_buffer->kind = kind;
1408 mapped_buffer->va_allocated = va_allocated;
1409 mapped_buffer->user_mapped = user_mapped ? 1 : 0;
1410 mapped_buffer->own_mem_ref = user_mapped;
1411 INIT_LIST_HEAD(&mapped_buffer->unmap_list);
1412 INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
1413 kref_init(&mapped_buffer->ref);
1414
1415 err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
1416 if (err) {
1417 gk20a_err(d, "failed to insert into mapped buffer tree");
1418 goto clean_up;
1419 }
1420 inserted = true;
1421 if (user_mapped)
1422 vm->num_user_mapped_buffers++;
1423
1424 gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
1425
1426 if (!va_allocated) {
1427 struct vm_reserved_va_node *va_node;
1428
1429 /* find the space reservation */
1430 va_node = addr_to_reservation(vm, map_offset);
1431 list_add_tail(&mapped_buffer->va_buffers_list,
1432 &va_node->va_buffers_list);
1433 mapped_buffer->va_node = va_node;
1434 }
1435
1436 mutex_unlock(&vm->update_gmmu_lock);
1437
1438 /* Invalidate kernel mappings immediately */
1439 if (vm_aspace_id(vm) == -1)
1440 gk20a_mm_tlb_invalidate(vm);
1441
1442 return map_offset;
1443
1444clean_up:
1445 if (inserted) {
1446 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1447 if (user_mapped)
1448 vm->num_user_mapped_buffers--;
1449 }
1450 kfree(mapped_buffer);
1451 if (va_allocated)
1452 gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
1453 if (!IS_ERR(bfr.sgt))
1454 gk20a_mm_unpin(d, dmabuf, bfr.sgt);
1455
1456 mutex_unlock(&vm->update_gmmu_lock);
1457 gk20a_dbg_info("err=%d\n", err);
1458 return 0;
1459}
1460
1461u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1462 struct sg_table **sgt,
1463 u64 size,
1464 u32 flags,
1465 int rw_flag)
1466{
1467 u64 vaddr;
1468
1469 mutex_lock(&vm->update_gmmu_lock);
1470 vaddr = __locked_gmmu_map(vm, 0, /* already mapped? - No */
1471 *sgt, /* sg table */
1472 size,
1473 0, /* page size index = 0 i.e. SZ_4K */
1474 0, /* kind */
1475 0, /* ctag_offset */
1476 flags, rw_flag);
1477 mutex_unlock(&vm->update_gmmu_lock);
1478 if (!vaddr) {
1479 gk20a_err(dev_from_vm(vm), "failed to allocate va space");
1480 return 0;
1481 }
1482
1483 /* Invalidate kernel mappings immediately */
1484 gk20a_mm_tlb_invalidate(vm);
1485
1486 return vaddr;
1487}
1488
1489void gk20a_gmmu_unmap(struct vm_gk20a *vm,
1490 u64 vaddr,
1491 u64 size,
1492 int rw_flag)
1493{
1494 mutex_lock(&vm->update_gmmu_lock);
1495 __locked_gmmu_unmap(vm,
1496 vaddr,
1497 size,
1498 0, /* page size 4K */
1499 true, /*va_allocated */
1500 rw_flag);
1501 mutex_unlock(&vm->update_gmmu_lock);
1502}
1503
1504phys_addr_t gk20a_get_phys_from_iova(struct device *d,
1505 u64 dma_addr)
1506{
1507 phys_addr_t phys;
1508 u64 iova;
1509
1510 struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
1511 if (!mapping)
1512 return dma_addr;
1513
1514 iova = dma_addr & PAGE_MASK;
1515 phys = iommu_iova_to_phys(mapping->domain, iova);
1516 return phys;
1517}
1518
1519/* get sg_table from already allocated buffer */
1520int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
1521 void *cpuva, u64 iova,
1522 size_t size)
1523{
1524 int err = 0;
1525 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1526 if (!(*sgt)) {
1527 dev_err(d, "failed to allocate memory\n");
1528 err = -ENOMEM;
1529 goto fail;
1530 }
1531 err = dma_get_sgtable(d, *sgt,
1532 cpuva, iova,
1533 size);
1534 if (err) {
1535 dev_err(d, "failed to create sg table\n");
1536 goto fail;
1537 }
1538 sg_dma_address((*sgt)->sgl) = iova;
1539
1540 return 0;
1541 fail:
1542 if (*sgt) {
1543 kfree(*sgt);
1544 *sgt = NULL;
1545 }
1546 return err;
1547}
1548
1549int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
1550 struct page **pages, u64 iova,
1551 size_t size)
1552{
1553 int err = 0;
1554 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1555 if (!(*sgt)) {
1556 dev_err(d, "failed to allocate memory\n");
1557 err = -ENOMEM;
1558 goto fail;
1559 }
1560 err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
1561 if (err) {
1562 dev_err(d, "failed to allocate sg_table\n");
1563 goto fail;
1564 }
1565 sg_set_page((*sgt)->sgl, *pages, size, 0);
1566 sg_dma_address((*sgt)->sgl) = iova;
1567
1568 return 0;
1569 fail:
1570 if (*sgt) {
1571 kfree(*sgt);
1572 *sgt = NULL;
1573 }
1574 return err;
1575}
1576
1577void gk20a_free_sgtable(struct sg_table **sgt)
1578{
1579 sg_free_table(*sgt);
1580 kfree(*sgt);
1581 *sgt = NULL;
1582}
1583
1584u64 gk20a_mm_iova_addr(struct scatterlist *sgl)
1585{
1586 u64 result = sg_phys(sgl);
1587#ifdef CONFIG_TEGRA_IOMMU_SMMU
1588 if (sg_dma_address(sgl) == DMA_ERROR_CODE)
1589 result = 0;
1590 else if (sg_dma_address(sgl)) {
1591 result = sg_dma_address(sgl) |
1592 1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT;
1593 }
1594#endif
1595 return result;
1596}
1597
1598static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1599 enum gmmu_pgsz_gk20a pgsz_idx,
1600 struct sg_table *sgt,
1601 u64 first_vaddr, u64 last_vaddr,
1602 u8 kind_v, u32 ctag_offset,
1603 bool cacheable,
1604 int rw_flag)
1605{
1606 int err;
1607 u32 pde_lo, pde_hi, pde_i;
1608 struct scatterlist *cur_chunk;
1609 unsigned int cur_offset;
1610 u32 pte_w[2] = {0, 0}; /* invalid pte */
1611 u32 ctag = ctag_offset;
1612 u32 ctag_incr;
1613 u32 page_size = gmmu_page_sizes[pgsz_idx];
1614 u64 addr = 0;
1615
1616 pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
1617 &pde_lo, &pde_hi);
1618
1619 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
1620 pgsz_idx, pde_lo, pde_hi);
1621
1622 /* If ctag_offset !=0 add 1 else add 0. The idea is to avoid a branch
1623 * below (per-pte). Note: this doesn't work unless page size (when
1624 * comptags are active) is 128KB. We have checks elsewhere for that. */
1625 ctag_incr = !!ctag_offset;
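	/*
	 * e.g. ctag_offset = 16 gives ctag_incr = 1, so successive 128KB
	 * ptes get comptaglines 16, 17, 18, ...; ctag_offset = 0 gives
	 * ctag_incr = 0 and every pte keeps comptagline 0.
	 */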
1626
1627 if (sgt)
1628 cur_chunk = sgt->sgl;
1629 else
1630 cur_chunk = NULL;
1631
1632 cur_offset = 0;
1633
1634 for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
1635 u32 pte_lo, pte_hi;
1636 u32 pte_cur;
1637 void *pte_kv_cur;
1638
1639 struct page_table_gk20a *pte = vm->pdes.ptes[pgsz_idx] + pde_i;
1640
1641 if (pde_i == pde_lo)
1642 pte_lo = pte_index_from_vaddr(vm, first_vaddr,
1643 pgsz_idx);
1644 else
1645 pte_lo = 0;
1646
1647 if ((pde_i != pde_hi) && (pde_hi != pde_lo))
1648 pte_hi = vm->mm->page_table_sizing[pgsz_idx].num_ptes-1;
1649 else
1650 pte_hi = pte_index_from_vaddr(vm, last_vaddr,
1651 pgsz_idx);
1652
1653 /* get cpu access to the ptes */
1654 err = map_gmmu_pages(pte->ref, pte->sgt, &pte_kv_cur,
1655 pte->size);
1656 if (err) {
1657 gk20a_err(dev_from_vm(vm),
1658 "couldn't map ptes for update as=%d pte_ref_cnt=%d",
1659 vm_aspace_id(vm), pte->ref_cnt);
1660 goto clean_up;
1661 }
1662
1663 gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
1664 for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
1665
1666 if (likely(sgt)) {
1667 u64 new_addr = gk20a_mm_iova_addr(cur_chunk);
1668 if (new_addr) {
1669 addr = new_addr;
1670 addr += cur_offset;
1671 }
1672
1673 pte_w[0] = gmmu_pte_valid_true_f() |
1674 gmmu_pte_address_sys_f(addr
1675 >> gmmu_pte_address_shift_v());
1676 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1677 gmmu_pte_kind_f(kind_v) |
1678 gmmu_pte_comptagline_f(ctag);
1679
1680 if (rw_flag == gk20a_mem_flag_read_only) {
1681 pte_w[0] |= gmmu_pte_read_only_true_f();
1682 pte_w[1] |=
1683 gmmu_pte_write_disable_true_f();
1684 } else if (rw_flag ==
1685 gk20a_mem_flag_write_only) {
1686 pte_w[1] |=
1687 gmmu_pte_read_disable_true_f();
1688 }
1689
1690 if (!cacheable)
1691 pte_w[1] |= gmmu_pte_vol_true_f();
1692
1693 pte->ref_cnt++;
1694
1695 gk20a_dbg(gpu_dbg_pte,
1696 "pte_cur=%d addr=0x%x,%08x kind=%d"
1697 " ctag=%d vol=%d refs=%d"
1698 " [0x%08x,0x%08x]",
1699 pte_cur, hi32(addr), lo32(addr),
1700 kind_v, ctag, !cacheable,
1701 pte->ref_cnt, pte_w[1], pte_w[0]);
1702
1703 ctag += ctag_incr;
1704 cur_offset += page_size;
1705 addr += page_size;
1706 while (cur_chunk &&
1707 cur_offset >= cur_chunk->length) {
1708 cur_offset -= cur_chunk->length;
1709 cur_chunk = sg_next(cur_chunk);
1710 }
1711
1712 } else {
1713 pte->ref_cnt--;
1714 gk20a_dbg(gpu_dbg_pte,
1715 "pte_cur=%d ref=%d [0x0,0x0]",
1716 pte_cur, pte->ref_cnt);
1717 }
1718
1719 gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 0, pte_w[0]);
1720 gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 1, pte_w[1]);
1721 }
1722
1723 unmap_gmmu_pages(pte->ref, pte->sgt, pte_kv_cur);
1724
1725 if (pte->ref_cnt == 0) {
1726 /* It can make sense to keep around one page table for
1727 * each flavor (empty)... in case a new map is coming
1728 * right back to alloc (and fill it in) again.
1729 * But: deferring unmapping should help with pathologic
1730 * unmap/map/unmap/map cases where we'd trigger pte
1731 * free/alloc/free/alloc.
1732 */
1733 free_gmmu_pages(vm, pte->ref, pte->sgt,
1734 vm->mm->page_table_sizing[pgsz_idx].order,
1735 pte->size);
1736 pte->ref = NULL;
1737
1738 /* rewrite pde */
1739 update_gmmu_pde_locked(vm, pde_i);
1740 }
1741
1742 }
1743
1744 smp_mb();
1745 vm->tlb_dirty = true;
1746 gk20a_dbg_fn("set tlb dirty");
1747
1748 return 0;
1749
1750clean_up:
1751 /*TBD: potentially rewrite above to pre-map everything it needs to
1752 * as that's the only way it can fail */
1753 return err;
1754
1755}
1756
1757
1758/* for gk20a the "video memory" apertures here are misnomers. */
1759static inline u32 big_valid_pde0_bits(u64 pte_addr)
1760{
1761 u32 pde0_bits =
1762 gmmu_pde_aperture_big_video_memory_f() |
1763 gmmu_pde_address_big_sys_f(
1764 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1765 return pde0_bits;
1766}
1767static inline u32 small_valid_pde1_bits(u64 pte_addr)
1768{
1769 u32 pde1_bits =
1770 gmmu_pde_aperture_small_video_memory_f() |
1771 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1772 gmmu_pde_address_small_sys_f(
1773 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1774 return pde1_bits;
1775}
1776
1777/* Given the current state of the ptes associated with a pde,
1778 determine value and write it out. There's no checking
1779 here to determine whether or not a change was actually
1780 made. So, superfluous updates will cause unnecessary
1781 pde invalidations.
1782*/
1783static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i)
1784{
1785 bool small_valid, big_valid;
1786 u64 pte_addr[2] = {0, 0};
1787 struct page_table_gk20a *small_pte =
1788 vm->pdes.ptes[gmmu_page_size_small] + i;
1789 struct page_table_gk20a *big_pte =
1790 vm->pdes.ptes[gmmu_page_size_big] + i;
1791 u32 pde_v[2] = {0, 0};
1792 u32 *pde;
1793
1794 small_valid = small_pte && small_pte->ref;
1795 big_valid = big_pte && big_pte->ref;
1796
1797 if (small_valid)
1798 pte_addr[gmmu_page_size_small] =
1799 gk20a_mm_iova_addr(small_pte->sgt->sgl);
1800 if (big_valid)
1801 pte_addr[gmmu_page_size_big] =
1802 gk20a_mm_iova_addr(big_pte->sgt->sgl);
1803
1804 pde_v[0] = gmmu_pde_size_full_f();
1805 pde_v[0] |= big_valid ?
1806 big_valid_pde0_bits(pte_addr[gmmu_page_size_big])
1807 :
1808 (gmmu_pde_aperture_big_invalid_f());
1809
1810 pde_v[1] |= (small_valid ?
1811 small_valid_pde1_bits(pte_addr[gmmu_page_size_small])
1812 :
1813 (gmmu_pde_aperture_small_invalid_f() |
1814 gmmu_pde_vol_small_false_f())
1815 )
1816 |
1817 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1818 gmmu_pde_vol_big_false_f());
1819
1820 pde = pde_from_index(vm, i);
1821
1822 gk20a_mem_wr32(pde, 0, pde_v[0]);
1823 gk20a_mem_wr32(pde, 1, pde_v[1]);
1824
1825 smp_mb();
1826
1827 FLUSH_CPU_DCACHE(pde,
1828 sg_phys(vm->pdes.sgt->sgl) + (i*gmmu_pde__size_v()),
1829 sizeof(u32)*2);
1830
1831 gk20a_mm_l2_invalidate(vm->mm->g);
1832
1833 gk20a_dbg(gpu_dbg_pte, "pde:%d = 0x%x,0x%08x\n", i, pde_v[1], pde_v[0]);
1834
1835 vm->tlb_dirty = true;
1836}
1837
1838
1839static int gk20a_vm_put_empty(struct vm_gk20a *vm, u64 vaddr,
1840 u32 num_pages, u32 pgsz_idx)
1841{
1842 struct mm_gk20a *mm = vm->mm;
1843 struct gk20a *g = mm->g;
1844 u32 pgsz = gmmu_page_sizes[pgsz_idx];
1845 u32 i;
1846 dma_addr_t iova;
1847
1848 /* allocate the zero page if the va does not already have one */
1849 if (!vm->zero_page_cpuva) {
1850 int err = 0;
1851 vm->zero_page_cpuva = dma_alloc_coherent(&g->dev->dev,
1852 mm->big_page_size,
1853 &iova,
1854 GFP_KERNEL);
1855 if (!vm->zero_page_cpuva) {
1856 dev_err(&g->dev->dev, "failed to allocate zero page\n");
1857 return -ENOMEM;
1858 }
1859
1860 vm->zero_page_iova = iova;
1861 err = gk20a_get_sgtable(&g->dev->dev, &vm->zero_page_sgt,
1862 vm->zero_page_cpuva, vm->zero_page_iova,
1863 mm->big_page_size);
1864 if (err) {
1865 dma_free_coherent(&g->dev->dev, mm->big_page_size,
1866 vm->zero_page_cpuva,
1867 vm->zero_page_iova);
1868 vm->zero_page_iova = 0;
1869 vm->zero_page_cpuva = NULL;
1870
1871 dev_err(&g->dev->dev, "failed to create sg table for zero page\n");
1872 return -ENOMEM;
1873 }
1874 }
1875
1876 for (i = 0; i < num_pages; i++) {
1877 u64 page_vaddr = __locked_gmmu_map(vm, vaddr,
1878 vm->zero_page_sgt, pgsz, pgsz_idx, 0, 0,
1879 NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET,
1880 gk20a_mem_flag_none);
1881
1882 if (!page_vaddr) {
1883 gk20a_err(dev_from_vm(vm), "failed to remap clean buffers!");
1884 goto err_unmap;
1885 }
1886 vaddr += pgsz;
1887 }
1888
1889 gk20a_mm_l2_flush(mm->g, true);
1890
1891 return 0;
1892
1893err_unmap:
1894
1895 WARN_ON(1);
1896 /* something went wrong. unmap pages */
1897 while (i--) {
1898 vaddr -= pgsz;
1899 __locked_gmmu_unmap(vm, vaddr, pgsz, pgsz_idx, 0,
1900 gk20a_mem_flag_none);
1901 }
1902
1903 return -EINVAL;
1904}
1905
1906/* NOTE! mapped_buffers lock must be held */
1907static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
1908{
1909 struct vm_gk20a *vm = mapped_buffer->vm;
1910
1911 if (mapped_buffer->va_node &&
1912 mapped_buffer->va_node->sparse) {
1913 u64 vaddr = mapped_buffer->addr;
1914 u32 pgsz_idx = mapped_buffer->pgsz_idx;
1915 u32 num_pages = mapped_buffer->size >>
1916 gmmu_page_shifts[pgsz_idx];
1917
1918 /* there is little we can do if this fails... */
1919 gk20a_vm_put_empty(vm, vaddr, num_pages, pgsz_idx);
1920
1921 } else
1922 __locked_gmmu_unmap(vm,
1923 mapped_buffer->addr,
1924 mapped_buffer->size,
1925 mapped_buffer->pgsz_idx,
1926 mapped_buffer->va_allocated,
1927 gk20a_mem_flag_none);
1928
1929 gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
1930 vm_aspace_id(vm), gmmu_page_sizes[mapped_buffer->pgsz_idx],
1931 hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1932 mapped_buffer->own_mem_ref);
1933
1934 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
1935 mapped_buffer->sgt);
1936
1937 /* remove from mapped buffer tree and remove list, free */
1938 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1939 if (!list_empty(&mapped_buffer->va_buffers_list))
1940 list_del(&mapped_buffer->va_buffers_list);
1941
1942 /* keep track of mapped buffers */
1943 if (mapped_buffer->user_mapped)
1944 vm->num_user_mapped_buffers--;
1945
1946 if (mapped_buffer->own_mem_ref)
1947 dma_buf_put(mapped_buffer->dmabuf);
1948
1949 kfree(mapped_buffer);
1950
1951 return;
1952}
1953
1954void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
1955{
1956 struct device *d = dev_from_vm(vm);
1957 struct mapped_buffer_node *mapped_buffer;
1958
1959 mutex_lock(&vm->update_gmmu_lock);
1960 mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
1961 if (!mapped_buffer) {
1962 mutex_unlock(&vm->update_gmmu_lock);
1963 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
1964 return;
1965 }
1966 kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
1967 mutex_unlock(&vm->update_gmmu_lock);
1968}
1969
1970static void gk20a_vm_remove_support(struct vm_gk20a *vm)
1971{
1972 struct gk20a *g = vm->mm->g;
1973 struct mapped_buffer_node *mapped_buffer;
1974 struct vm_reserved_va_node *va_node, *va_node_tmp;
1975 struct rb_node *node;
1976
1977 gk20a_dbg_fn("");
1978 mutex_lock(&vm->update_gmmu_lock);
1979
1980 /* TBD: add a flag here for the unmap code to recognize teardown
1981 * and short-circuit any otherwise expensive operations. */
1982
1983 node = rb_first(&vm->mapped_buffers);
1984 while (node) {
1985 mapped_buffer =
1986 container_of(node, struct mapped_buffer_node, node);
1987 gk20a_vm_unmap_locked(mapped_buffer);
1988 node = rb_first(&vm->mapped_buffers);
1989 }
1990
1991 /* destroy remaining reserved memory areas */
1992 list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
1993 reserved_va_list) {
1994 list_del(&va_node->reserved_va_list);
1995 kfree(va_node);
1996 }
1997
1998 /* TBD: unmapping all buffers above may not actually free
1999 * all vm ptes. jettison them here for certain... */
2000
2001 unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
2002 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0, vm->pdes.size);
2003
2004 kfree(vm->pdes.ptes[gmmu_page_size_small]);
2005 kfree(vm->pdes.ptes[gmmu_page_size_big]);
2006 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
2007 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
2008
2009 mutex_unlock(&vm->update_gmmu_lock);
2010
2011 /* release zero page if used */
2012 if (vm->zero_page_cpuva)
2013 dma_free_coherent(&g->dev->dev, vm->mm->big_page_size,
2014 vm->zero_page_cpuva, vm->zero_page_iova);
2015
2016 /* vm is not used anymore. release it. */
2017 kfree(vm);
2018}
2019
2020static void gk20a_vm_remove_support_kref(struct kref *ref)
2021{
2022 struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
2023 gk20a_vm_remove_support(vm);
2024}
2025
2026void gk20a_vm_get(struct vm_gk20a *vm)
2027{
2028 kref_get(&vm->ref);
2029}
2030
2031void gk20a_vm_put(struct vm_gk20a *vm)
2032{
2033 kref_put(&vm->ref, gk20a_vm_remove_support_kref);
2034}
2035
2036/* address space interfaces for the gk20a module */
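/*
 * gk20a_vm_alloc_share() builds a fresh vm for an address-space share: it
 * sizes the page directory from the channel VA limit, allocates and maps the
 * page directory itself, then splits the VA range between a small-page
 * allocator (lower half, minus the one-PDE hole at the bottom) and a
 * big-page allocator (upper half). gk20a_vm_release_share() drops the
 * share's reference again.
 */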
2037int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
2038{
2039 struct gk20a_as *as = as_share->as;
2040 struct gk20a *g = gk20a_from_as(as);
2041 struct mm_gk20a *mm = &g->mm;
2042 struct vm_gk20a *vm;
2043 u64 vma_size;
2044 u32 num_pages, low_hole_pages;
2045 char name[32];
2046 int err;
2047
2048 gk20a_dbg_fn("");
2049
2050 vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2051 if (!vm)
2052 return -ENOMEM;
2053
2054 as_share->vm = vm;
2055
2056 vm->mm = mm;
2057 vm->as_share = as_share;
2058
2059 vm->big_pages = true;
2060
2061 vm->va_start = mm->pde_stride; /* create a one pde hole */
2062 vm->va_limit = mm->channel.size; /* note this means channel.size is
2063 really just the max */
2064 {
2065 u32 pde_lo, pde_hi;
2066 pde_range_from_vaddr_range(vm,
2067 0, vm->va_limit-1,
2068 &pde_lo, &pde_hi);
2069 vm->pdes.num_pdes = pde_hi + 1;
2070 }
2071
2072 vm->pdes.ptes[gmmu_page_size_small] =
2073 kzalloc(sizeof(struct page_table_gk20a) *
2074 vm->pdes.num_pdes, GFP_KERNEL);
2075
2076 vm->pdes.ptes[gmmu_page_size_big] =
2077 kzalloc(sizeof(struct page_table_gk20a) *
2078 vm->pdes.num_pdes, GFP_KERNEL);
2079
2080 if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2081 vm->pdes.ptes[gmmu_page_size_big]))
2082 return -ENOMEM;
2083
2084 gk20a_dbg_info("init space for va_limit=0x%llx num_pdes=%d",
2085 vm->va_limit, vm->pdes.num_pdes);
2086
2087 /* allocate the page table directory */
2088 err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2089 &vm->pdes.sgt, &vm->pdes.size);
2090 if (err)
2091 return -ENOMEM;
2092
2093 err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2094 vm->pdes.size);
2095 if (err) {
2096 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2097 vm->pdes.size);
2098 return -ENOMEM;
2099 }
2100 gk20a_dbg(gpu_dbg_pte, "pdes.kv = 0x%p, pdes.phys = 0x%llx",
2101 vm->pdes.kv,
2102 gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2103 /* we could release vm->pdes.kv but it's only one page... */
2104
2105
2106 /* low-half: alloc small pages */
2107 /* high-half: alloc big pages */
2108 vma_size = mm->channel.size >> 1;
2109
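	/*
	 * Both allocators below work in units of pages, not bytes: the
	 * small-page allocator spans the lower half of the channel VA
	 * (starting right after the low hole), while the big-page allocator
	 * spans the upper half; its start index equals its length, i.e. the
	 * halfway point expressed in big-page units.
	 */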
2110 snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2111 gmmu_page_sizes[gmmu_page_size_small]>>10);
2112 num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_small]);
2113
2114 /* num_pages above is without regard to the low-side hole. */
2115 low_hole_pages = (vm->va_start >>
2116 gmmu_page_shifts[gmmu_page_size_small]);
2117
2118 gk20a_allocator_init(&vm->vma[gmmu_page_size_small], name,
2119 low_hole_pages, /* start */
2120 num_pages - low_hole_pages, /* length */
2121 1); /* align */
2122
2123 snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2124 gmmu_page_sizes[gmmu_page_size_big]>>10);
2125
2126 num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_big]);
2127 gk20a_allocator_init(&vm->vma[gmmu_page_size_big], name,
2128 num_pages, /* start */
2129 num_pages, /* length */
2130 1); /* align */
2131
2132 vm->mapped_buffers = RB_ROOT;
2133
2134 mutex_init(&vm->update_gmmu_lock);
2135 kref_init(&vm->ref);
2136 INIT_LIST_HEAD(&vm->reserved_va_list);
2137
2138 vm->enable_ctag = true;
2139
2140 return 0;
2141}
2142
2143
2144int gk20a_vm_release_share(struct gk20a_as_share *as_share)
2145{
2146 struct vm_gk20a *vm = as_share->vm;
2147
2148 gk20a_dbg_fn("");
2149
2150 vm->as_share = NULL;
2151
2152 /* put as reference to vm */
2153 gk20a_vm_put(vm);
2154
2155 as_share->vm = NULL;
2156
2157 return 0;
2158}
2159
2160
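/*
 * Reserve a range of GPU VA on behalf of userspace (AS ioctl). The requested
 * page size selects which allocator to carve from, and FIXED_OFFSET requests
 * start at the supplied offset. SPARSE reservations (big pages only) are
 * immediately backed with zero-page mappings via gk20a_vm_put_empty().
 */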
2161int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2162 struct nvhost_as_alloc_space_args *args)
2163{
2164	int err = -ENOMEM;
2165	int pgsz_idx;
2166 u32 start_page_nr;
2167 struct gk20a_allocator *vma;
2168 struct vm_gk20a *vm = as_share->vm;
2169 struct vm_reserved_va_node *va_node;
2170 u64 vaddr_start = 0;
2171
2172 gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
2173 args->flags, args->page_size, args->pages,
2174 args->o_a.offset);
2175
2176 /* determine pagesz idx */
2177 for (pgsz_idx = gmmu_page_size_small;
2178 pgsz_idx < gmmu_nr_page_sizes;
2179 pgsz_idx++) {
2180 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2181 break;
2182 }
2183
2184 if (pgsz_idx >= gmmu_nr_page_sizes) {
2185 err = -EINVAL;
2186 goto clean_up;
2187 }
2188
2189 va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
2190 if (!va_node) {
2191 err = -ENOMEM;
2192 goto clean_up;
2193 }
2194
2195 if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE &&
2196 pgsz_idx != gmmu_page_size_big) {
2197 err = -ENOSYS;
2198 kfree(va_node);
2199 goto clean_up;
2200 }
2201
2202 start_page_nr = 0;
2203 if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
2204 start_page_nr = (u32)(args->o_a.offset >>
2205 gmmu_page_shifts[pgsz_idx]);
2206
2207 vma = &vm->vma[pgsz_idx];
2208 err = vma->alloc(vma, &start_page_nr, args->pages);
2209 if (err) {
2210 kfree(va_node);
2211 goto clean_up;
2212 }
2213
2214 vaddr_start = (u64)start_page_nr << gmmu_page_shifts[pgsz_idx];
2215
2216 va_node->vaddr_start = vaddr_start;
2217 va_node->size = (u64)args->page_size * (u64)args->pages;
2218	va_node->pgsz_idx = pgsz_idx;
2219 INIT_LIST_HEAD(&va_node->va_buffers_list);
2220 INIT_LIST_HEAD(&va_node->reserved_va_list);
2221
2222 mutex_lock(&vm->update_gmmu_lock);
2223
2224 /* mark that we need to use sparse mappings here */
2225 if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE) {
2226 err = gk20a_vm_put_empty(vm, vaddr_start, args->pages,
2227 pgsz_idx);
2228 if (err) {
2229 mutex_unlock(&vm->update_gmmu_lock);
2230 vma->free(vma, start_page_nr, args->pages);
2231 kfree(va_node);
2232 goto clean_up;
2233 }
2234
2235 va_node->sparse = true;
2236 }
2237
2238 list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
2239
2240 mutex_unlock(&vm->update_gmmu_lock);
2241
2242 args->o_a.offset = vaddr_start;
2243
2244clean_up:
2245 return err;
2246}
2247
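/*
 * Release a VA reservation made by gk20a_vm_alloc_space(). Buffers still
 * mapped inside the range are detached from the reservation but keep their
 * mappings; if the reservation was sparse, its zero-page backing is unmapped
 * here as well.
 */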
2248int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2249 struct nvhost_as_free_space_args *args)
2250{
2251 int err = -ENOMEM;
2252 int pgsz_idx;
2253 u32 start_page_nr;
2254 struct gk20a_allocator *vma;
2255 struct vm_gk20a *vm = as_share->vm;
2256 struct vm_reserved_va_node *va_node;
2257
2258 gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
2259 args->pages, args->offset);
2260
2261 /* determine pagesz idx */
2262 for (pgsz_idx = gmmu_page_size_small;
2263 pgsz_idx < gmmu_nr_page_sizes;
2264 pgsz_idx++) {
2265 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2266 break;
2267 }
2268
2269 if (pgsz_idx >= gmmu_nr_page_sizes) {
2270 err = -EINVAL;
2271 goto clean_up;
2272 }
2273
2274 start_page_nr = (u32)(args->offset >>
2275 gmmu_page_shifts[pgsz_idx]);
2276
2277 vma = &vm->vma[pgsz_idx];
2278 err = vma->free(vma, start_page_nr, args->pages);
2279
2280 if (err)
2281 goto clean_up;
2282
2283 mutex_lock(&vm->update_gmmu_lock);
2284 va_node = addr_to_reservation(vm, args->offset);
2285 if (va_node) {
2286		struct mapped_buffer_node *buffer, *n;
2287
2288		/* there is no need to unmap the buffers in the reserved
2289		 * range; just detach them so they become normal buffers
2290		 * again */
2291		list_for_each_entry_safe(buffer, n,
2292			&va_node->va_buffers_list, va_buffers_list)
2293			list_del_init(&buffer->va_buffers_list);
2294
2295 list_del(&va_node->reserved_va_list);
2296
2297 /* if this was a sparse mapping, free the va */
2298 if (va_node->sparse)
2299 __locked_gmmu_unmap(vm,
2300 va_node->vaddr_start,
2301 va_node->size,
2302 va_node->pgsz_idx,
2303 false,
2304 gk20a_mem_flag_none);
2305 kfree(va_node);
2306 }
2307 mutex_unlock(&vm->update_gmmu_lock);
2308
2309clean_up:
2310 return err;
2311}
2312
2313int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
2314 struct channel_gk20a *ch)
2315{
2316 int err = 0;
2317 struct vm_gk20a *vm = as_share->vm;
2318
2319 gk20a_dbg_fn("");
2320
2321 ch->vm = vm;
2322 err = channel_gk20a_commit_va(ch);
2323 if (err)
2324		ch->vm = NULL;
2325
2326 return err;
2327}
2328
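/*
 * Attach gk20a's per-dmabuf private data if it is not there yet. The double
 * check under a file-local mutex keeps two concurrent mappers of the same
 * dmabuf from each allocating a priv structure.
 */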
2329int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
2330{
2331 struct gk20a_dmabuf_priv *priv;
2332 static DEFINE_MUTEX(priv_lock);
2333
2334 priv = dma_buf_get_drvdata(dmabuf, dev);
2335 if (likely(priv))
2336 return 0;
2337
2338 mutex_lock(&priv_lock);
2339 priv = dma_buf_get_drvdata(dmabuf, dev);
2340 if (priv)
2341 goto priv_exist_or_err;
2342 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2343 if (!priv) {
2344 priv = ERR_PTR(-ENOMEM);
2345 goto priv_exist_or_err;
2346 }
2347 mutex_init(&priv->lock);
2348 dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
2349priv_exist_or_err:
2350 mutex_unlock(&priv_lock);
2351 if (IS_ERR(priv))
2352 return -ENOMEM;
2353
2354 return 0;
2355}
2356
2357
2358static int gk20a_dmabuf_get_kind(struct dma_buf *dmabuf)
2359{
2360 int kind = 0;
2361#ifdef CONFIG_TEGRA_NVMAP
2362 int err;
2363 u64 nvmap_param;
2364
2365 err = nvmap_get_dmabuf_param(dmabuf, NVMAP_HANDLE_PARAM_KIND,
2366 &nvmap_param);
2367 kind = err ? kind : nvmap_param;
2368#endif
2369 return kind;
2370}
2371
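/*
 * Map a userspace-supplied dmabuf fd into this address space and return the
 * resulting GPU VA through *offset_align. A kind of -1 means "query the
 * buffer itself" (via nvmap, when available). A minimal usage sketch
 * follows; as_share, dmabuf_fd and the flag/kind values are hypothetical and
 * error handling is omitted.
 */
#if 0	/* illustrative only, not built */
	u64 gpu_va = 0;	/* no fixed offset requested */
	err = gk20a_vm_map_buffer(as_share, dmabuf_fd, &gpu_va,
				  0 /* flags */, -1 /* kind */);
	/* ... use gpu_va on a channel bound to this vm ... */
	gk20a_vm_unmap_buffer(as_share, gpu_va);
#endif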
2372int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
2373 int dmabuf_fd,
2374 u64 *offset_align,
2375 u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
2376 int kind)
2377{
2378 int err = 0;
2379 struct vm_gk20a *vm = as_share->vm;
2380 struct dma_buf *dmabuf;
2381 u64 ret_va;
2382
2383 gk20a_dbg_fn("");
2384
2385 /* get ref to the mem handle (released on unmap_locked) */
2386 dmabuf = dma_buf_get(dmabuf_fd);
2387	if (IS_ERR(dmabuf))
2388		return PTR_ERR(dmabuf);
2389
2390 err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
2391 if (err) {
2392 dma_buf_put(dmabuf);
2393 return err;
2394 }
2395
2396 if (kind == -1)
2397 kind = gk20a_dmabuf_get_kind(dmabuf);
2398
2399 ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
2400 flags, kind, NULL, true,
2401 gk20a_mem_flag_none);
2402 *offset_align = ret_va;
2403 if (!ret_va) {
2404 dma_buf_put(dmabuf);
2405 err = -EINVAL;
2406 }
2407
2408 return err;
2409}
2410
2411int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
2412{
2413 struct vm_gk20a *vm = as_share->vm;
2414
2415 gk20a_dbg_fn("");
2416
2417 gk20a_vm_unmap_user(vm, offset);
2418 return 0;
2419}
2420
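/*
 * Set up the BAR1 vm used by the kernel: size the page directory for the
 * BAR1 aperture, allocate and map it, then build an instance block whose
 * RAMIN words point the hardware at that page directory and at the VA
 * limit. Only the small-page allocator is expected to be used; the big-page
 * one is initialized with throwaway values just in case.
 */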
2421int gk20a_init_bar1_vm(struct mm_gk20a *mm)
2422{
2423 int err;
2424 phys_addr_t inst_pa;
2425 void *inst_ptr;
2426 struct vm_gk20a *vm = &mm->bar1.vm;
2427 struct gk20a *g = gk20a_from_mm(mm);
2428 struct device *d = dev_from_gk20a(g);
2429 struct inst_desc *inst_block = &mm->bar1.inst_block;
2430 u64 pde_addr;
2431 u32 pde_addr_lo;
2432 u32 pde_addr_hi;
2433 dma_addr_t iova;
2434
2435 vm->mm = mm;
2436
2437 mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
2438
2439 gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
2440
2441 vm->va_start = mm->pde_stride * 1;
2442 vm->va_limit = mm->bar1.aperture_size;
2443
2444 {
2445 u32 pde_lo, pde_hi;
2446 pde_range_from_vaddr_range(vm,
2447 0, vm->va_limit-1,
2448 &pde_lo, &pde_hi);
2449 vm->pdes.num_pdes = pde_hi + 1;
2450 }
2451
2452 /* bar1 is likely only to ever use/need small page sizes. */
2453 /* But just in case, for now... arrange for both.*/
2454 vm->pdes.ptes[gmmu_page_size_small] =
2455 kzalloc(sizeof(struct page_table_gk20a) *
2456 vm->pdes.num_pdes, GFP_KERNEL);
2457
2458 vm->pdes.ptes[gmmu_page_size_big] =
2459 kzalloc(sizeof(struct page_table_gk20a) *
2460 vm->pdes.num_pdes, GFP_KERNEL);
2461
2462 if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2463 vm->pdes.ptes[gmmu_page_size_big]))
2464 return -ENOMEM;
2465
2466 gk20a_dbg_info("init space for bar1 va_limit=0x%llx num_pdes=%d",
2467 vm->va_limit, vm->pdes.num_pdes);
2468
2469
2470 /* allocate the page table directory */
2471 err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2472 &vm->pdes.sgt, &vm->pdes.size);
2473 if (err)
2474 goto clean_up;
2475
2476 err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2477 vm->pdes.size);
2478 if (err) {
2479 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2480 vm->pdes.size);
2481 goto clean_up;
2482 }
2483 gk20a_dbg(gpu_dbg_pte, "bar 1 pdes.kv = 0x%p, pdes.phys = 0x%llx",
2484 vm->pdes.kv, gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2485 /* we could release vm->pdes.kv but it's only one page... */
2486
2487 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2488 pde_addr_lo = u64_lo32(pde_addr >> 12);
2489 pde_addr_hi = u64_hi32(pde_addr);
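	/*
	 * Worked example with an illustrative address: a page directory at
	 * IOVA 0x0000000123456000 yields pde_addr_lo = 0x123456 (the address
	 * shifted right by 12) and pde_addr_hi = 0x1; these two values are
	 * what get programmed into the instance block below.
	 */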
2490
2491 gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2492 (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl),
2493 pde_addr_lo, pde_addr_hi);
2494
2495 /* allocate instance mem for bar1 */
2496 inst_block->size = ram_in_alloc_size_v();
2497 inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2498 &iova, GFP_KERNEL);
2499 if (!inst_block->cpuva) {
2500 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2501 err = -ENOMEM;
2502 goto clean_up;
2503 }
2504
2505 inst_block->iova = iova;
2506 inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2507 if (!inst_block->cpu_pa) {
2508 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2509 err = -ENOMEM;
2510 goto clean_up;
2511 }
2512
2513 inst_pa = inst_block->cpu_pa;
2514 inst_ptr = inst_block->cpuva;
2515
2516 gk20a_dbg_info("bar1 inst block physical phys = 0x%llx, kv = 0x%p",
2517 (u64)inst_pa, inst_ptr);
2518
2519 memset(inst_ptr, 0, ram_fc_size_val_v());
2520
2521 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2522 ram_in_page_dir_base_target_vid_mem_f() |
2523 ram_in_page_dir_base_vol_true_f() |
2524 ram_in_page_dir_base_lo_f(pde_addr_lo));
2525
2526 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2527 ram_in_page_dir_base_hi_f(pde_addr_hi));
2528
2529 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2530 u64_lo32(vm->va_limit) | 0xFFF);
2531
2532 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2533 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2534
2535 gk20a_dbg_info("bar1 inst block ptr: %08llx", (u64)inst_pa);
2536 gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_bar1",
2537 1,/*start*/
2538 (vm->va_limit >> 12) - 1 /* length*/,
2539 1); /* align */
2540 /* initialize just in case we try to use it anyway */
2541 gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_bar1-unused",
2542 0x0badc0de, /* start */
2543 1, /* length */
2544 1); /* align */
2545
2546 vm->mapped_buffers = RB_ROOT;
2547
2548 mutex_init(&vm->update_gmmu_lock);
2549 kref_init(&vm->ref);
2550 INIT_LIST_HEAD(&vm->reserved_va_list);
2551
2552 return 0;
2553
2554clean_up:
2555 /* free, etc */
2556 if (inst_block->cpuva)
2557 dma_free_coherent(d, inst_block->size,
2558 inst_block->cpuva, inst_block->iova);
2559 inst_block->cpuva = NULL;
2560 inst_block->iova = 0;
2561 return err;
2562}
2563
2564/* pmu vm, share channel_vm interfaces */
2565int gk20a_init_pmu_vm(struct mm_gk20a *mm)
2566{
2567 int err;
2568 phys_addr_t inst_pa;
2569 void *inst_ptr;
2570 struct vm_gk20a *vm = &mm->pmu.vm;
2571 struct gk20a *g = gk20a_from_mm(mm);
2572 struct device *d = dev_from_gk20a(g);
2573 struct inst_desc *inst_block = &mm->pmu.inst_block;
2574 u64 pde_addr;
2575 u32 pde_addr_lo;
2576 u32 pde_addr_hi;
2577 dma_addr_t iova;
2578
2579 vm->mm = mm;
2580
2581 mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
2582
2583 gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
2584
2585 vm->va_start = GK20A_PMU_VA_START;
2586 vm->va_limit = vm->va_start + mm->pmu.aperture_size;
2587
2588 {
2589 u32 pde_lo, pde_hi;
2590 pde_range_from_vaddr_range(vm,
2591 0, vm->va_limit-1,
2592 &pde_lo, &pde_hi);
2593 vm->pdes.num_pdes = pde_hi + 1;
2594 }
2595
2596 /* The pmu is likely only to ever use/need small page sizes. */
2597 /* But just in case, for now... arrange for both.*/
2598 vm->pdes.ptes[gmmu_page_size_small] =
2599 kzalloc(sizeof(struct page_table_gk20a) *
2600 vm->pdes.num_pdes, GFP_KERNEL);
2601
2602 vm->pdes.ptes[gmmu_page_size_big] =
2603 kzalloc(sizeof(struct page_table_gk20a) *
2604 vm->pdes.num_pdes, GFP_KERNEL);
2605
2606 if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2607 vm->pdes.ptes[gmmu_page_size_big]))
2608 return -ENOMEM;
2609
2610 gk20a_dbg_info("init space for pmu va_limit=0x%llx num_pdes=%d",
2611 vm->va_limit, vm->pdes.num_pdes);
2612
2613 /* allocate the page table directory */
2614 err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2615 &vm->pdes.sgt, &vm->pdes.size);
2616 if (err)
2617 goto clean_up;
2618
2619 err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2620 vm->pdes.size);
2621 if (err) {
2622 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2623 vm->pdes.size);
2624 goto clean_up;
2625 }
2626 gk20a_dbg_info("pmu pdes phys @ 0x%llx",
2627 (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2628 /* we could release vm->pdes.kv but it's only one page... */
2629
2630 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2631 pde_addr_lo = u64_lo32(pde_addr >> 12);
2632 pde_addr_hi = u64_hi32(pde_addr);
2633
2634 gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2635 (u64)pde_addr, pde_addr_lo, pde_addr_hi);
2636
2637 /* allocate instance mem for pmu */
2638 inst_block->size = GK20A_PMU_INST_SIZE;
2639 inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2640 &iova, GFP_KERNEL);
2641 if (!inst_block->cpuva) {
2642 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2643 err = -ENOMEM;
2644 goto clean_up;
2645 }
2646
2647 inst_block->iova = iova;
2648 inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2649 if (!inst_block->cpu_pa) {
2650 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2651 err = -ENOMEM;
2652 goto clean_up;
2653 }
2654
2655 inst_pa = inst_block->cpu_pa;
2656 inst_ptr = inst_block->cpuva;
2657
2658 gk20a_dbg_info("pmu inst block physical addr: 0x%llx", (u64)inst_pa);
2659
2660 memset(inst_ptr, 0, GK20A_PMU_INST_SIZE);
2661
2662 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2663 ram_in_page_dir_base_target_vid_mem_f() |
2664 ram_in_page_dir_base_vol_true_f() |
2665 ram_in_page_dir_base_lo_f(pde_addr_lo));
2666
2667 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2668 ram_in_page_dir_base_hi_f(pde_addr_hi));
2669
2670 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2671 u64_lo32(vm->va_limit) | 0xFFF);
2672
2673 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2674 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2675
2676 gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_pmu",
2677 (vm->va_start >> 12), /* start */
2678 (vm->va_limit - vm->va_start) >> 12, /*length*/
2679 1); /* align */
2680 /* initialize just in case we try to use it anyway */
2681 gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_pmu-unused",
2682 0x0badc0de, /* start */
2683 1, /* length */
2684 1); /* align */
2685
2686
2687 vm->mapped_buffers = RB_ROOT;
2688
2689 mutex_init(&vm->update_gmmu_lock);
2690 kref_init(&vm->ref);
2691 INIT_LIST_HEAD(&vm->reserved_va_list);
2692
2693 return 0;
2694
2695clean_up:
2696 /* free, etc */
2697 if (inst_block->cpuva)
2698 dma_free_coherent(d, inst_block->size,
2699 inst_block->cpuva, inst_block->iova);
2700 inst_block->cpuva = NULL;
2701 inst_block->iova = 0;
2702 return err;
2703}
2704
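/*
 * The flush/invalidate helpers below share one polling pattern: kick the
 * operation, then poll its status register with a 20-40us back-off between
 * reads. The retry budget only applies on silicon; pre-silicon platforms are
 * allowed to poll until the operation completes.
 */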
2705void gk20a_mm_fb_flush(struct gk20a *g)
2706{
2707 struct mm_gk20a *mm = &g->mm;
2708 u32 data;
2709 s32 retry = 100;
2710
2711 gk20a_dbg_fn("");
2712
2713 mutex_lock(&mm->l2_op_lock);
2714
2715 g->ops.ltc.elpg_flush(g);
2716
2717 /* Make sure all previous writes are committed to the L2. There's no
2718 guarantee that writes are to DRAM. This will be a sysmembar internal
2719 to the L2. */
2720 gk20a_writel(g, flush_fb_flush_r(),
2721 flush_fb_flush_pending_busy_f());
2722
2723 do {
2724 data = gk20a_readl(g, flush_fb_flush_r());
2725
2726 if (flush_fb_flush_outstanding_v(data) ==
2727 flush_fb_flush_outstanding_true_v() ||
2728 flush_fb_flush_pending_v(data) ==
2729 flush_fb_flush_pending_busy_v()) {
2730 gk20a_dbg_info("fb_flush 0x%x", data);
2731 retry--;
2732 usleep_range(20, 40);
2733 } else
2734 break;
2735 } while (retry >= 0 || !tegra_platform_is_silicon());
2736
2737 if (retry < 0)
2738 gk20a_warn(dev_from_gk20a(g),
2739 "fb_flush too many retries");
2740
2741 mutex_unlock(&mm->l2_op_lock);
2742}
2743
2744static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
2745{
2746 u32 data;
2747 s32 retry = 200;
2748
2749 /* Invalidate any clean lines from the L2 so subsequent reads go to
2750 DRAM. Dirty lines are not affected by this operation. */
2751 gk20a_writel(g, flush_l2_system_invalidate_r(),
2752 flush_l2_system_invalidate_pending_busy_f());
2753
2754 do {
2755 data = gk20a_readl(g, flush_l2_system_invalidate_r());
2756
2757 if (flush_l2_system_invalidate_outstanding_v(data) ==
2758 flush_l2_system_invalidate_outstanding_true_v() ||
2759 flush_l2_system_invalidate_pending_v(data) ==
2760 flush_l2_system_invalidate_pending_busy_v()) {
2761 gk20a_dbg_info("l2_system_invalidate 0x%x",
2762 data);
2763 retry--;
2764 usleep_range(20, 40);
2765 } else
2766 break;
2767 } while (retry >= 0 || !tegra_platform_is_silicon());
2768
2769 if (retry < 0)
2770 gk20a_warn(dev_from_gk20a(g),
2771 "l2_system_invalidate too many retries");
2772}
2773
2774void gk20a_mm_l2_invalidate(struct gk20a *g)
2775{
2776 struct mm_gk20a *mm = &g->mm;
2777 mutex_lock(&mm->l2_op_lock);
2778 gk20a_mm_l2_invalidate_locked(g);
2779 mutex_unlock(&mm->l2_op_lock);
2780}
2781
2782void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
2783{
2784 struct mm_gk20a *mm = &g->mm;
2785 u32 data;
2786 s32 retry = 200;
2787
2788 gk20a_dbg_fn("");
2789
2790 mutex_lock(&mm->l2_op_lock);
2791
2792 /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
2793 as clean, so subsequent reads might hit in the L2. */
2794 gk20a_writel(g, flush_l2_flush_dirty_r(),
2795 flush_l2_flush_dirty_pending_busy_f());
2796
2797 do {
2798 data = gk20a_readl(g, flush_l2_flush_dirty_r());
2799
2800 if (flush_l2_flush_dirty_outstanding_v(data) ==
2801 flush_l2_flush_dirty_outstanding_true_v() ||
2802 flush_l2_flush_dirty_pending_v(data) ==
2803 flush_l2_flush_dirty_pending_busy_v()) {
2804 gk20a_dbg_info("l2_flush_dirty 0x%x", data);
2805 retry--;
2806 usleep_range(20, 40);
2807 } else
2808 break;
2809 } while (retry >= 0 || !tegra_platform_is_silicon());
2810
2811 if (retry < 0)
2812 gk20a_warn(dev_from_gk20a(g),
2813 "l2_flush_dirty too many retries");
2814
2815 if (invalidate)
2816 gk20a_mm_l2_invalidate_locked(g);
2817
2818 mutex_unlock(&mm->l2_op_lock);
2819}
2820
2821
2822int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
2823 struct dma_buf **dmabuf,
2824 u64 *offset)
2825{
2826 struct mapped_buffer_node *mapped_buffer;
2827
2828 gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);
2829
2830 mutex_lock(&vm->update_gmmu_lock);
2831
2832 mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
2833 gpu_va);
2834 if (!mapped_buffer) {
2835 mutex_unlock(&vm->update_gmmu_lock);
2836 return -EINVAL;
2837 }
2838
2839 *dmabuf = mapped_buffer->dmabuf;
2840 *offset = gpu_va - mapped_buffer->addr;
2841
2842 mutex_unlock(&vm->update_gmmu_lock);
2843
2844 return 0;
2845}
2846
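/*
 * TLB invalidate for one vm: skipped when the GPU is powered off or when the
 * vm's page tables have not been dirtied since the last invalidate.
 * Otherwise wait for space in the MMU's PRI fifo, point the hardware at this
 * vm's page directory, trigger an invalidate of all PDBs/VAs, and wait for
 * the fifo to drain.
 */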
2847void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
2848{
2849 struct mm_gk20a *mm = vm->mm;
2850 struct gk20a *g = gk20a_from_vm(vm);
2851 u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->pdes.sgt->sgl) >> 12);
2852 u32 data;
2853 s32 retry = 200;
2854
2855 gk20a_dbg_fn("");
2856
2857 /* pagetables are considered sw states which are preserved after
2858 prepare_poweroff. When gk20a deinit releases those pagetables,
2859 common code in vm unmap path calls tlb invalidate that touches
2860 hw. Use the power_on flag to skip tlb invalidation when gpu
2861 power is turned off */
2862
2863 if (!g->power_on)
2864 return;
2865
2866 /* No need to invalidate if tlb is clean */
2867 mutex_lock(&vm->update_gmmu_lock);
2868 if (!vm->tlb_dirty) {
2869 mutex_unlock(&vm->update_gmmu_lock);
2870 return;
2871 }
2872 vm->tlb_dirty = false;
2873 mutex_unlock(&vm->update_gmmu_lock);
2874
2875 mutex_lock(&mm->tlb_lock);
2876 do {
2877 data = gk20a_readl(g, fb_mmu_ctrl_r());
2878 if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
2879 break;
2880 usleep_range(20, 40);
2881 retry--;
2882 } while (retry >= 0 || !tegra_platform_is_silicon());
2883
2884 if (retry < 0)
2885 gk20a_warn(dev_from_gk20a(g),
2886 "wait mmu fifo space too many retries");
2887
2888 gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
2889 fb_mmu_invalidate_pdb_addr_f(addr_lo) |
2890 fb_mmu_invalidate_pdb_aperture_vid_mem_f());
2891
2892 /* this is a sledgehammer, it would seem */
2893 gk20a_writel(g, fb_mmu_invalidate_r(),
2894 fb_mmu_invalidate_all_pdb_true_f() |
2895 fb_mmu_invalidate_all_va_true_f() |
2896 fb_mmu_invalidate_trigger_true_f());
2897
2898 do {
2899 data = gk20a_readl(g, fb_mmu_ctrl_r());
2900 if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
2901 fb_mmu_ctrl_pri_fifo_empty_false_f())
2902 break;
2903 retry--;
2904 usleep_range(20, 40);
2905 } while (retry >= 0 || !tegra_platform_is_silicon());
2906
2907 if (retry < 0)
2908 gk20a_warn(dev_from_gk20a(g),
2909 "mmu invalidate too many retries");
2910
2911 mutex_unlock(&mm->tlb_lock);
2912}
2913
2914int gk20a_mm_suspend(struct gk20a *g)
2915{
2916 gk20a_dbg_fn("");
2917
2918 gk20a_mm_fb_flush(g);
2919 gk20a_mm_l2_flush(g, true);
2920
2921 gk20a_dbg_fn("done");
2922 return 0;
2923}
2924
2925void gk20a_mm_ltc_isr(struct gk20a *g)
2926{
2927 u32 intr;
2928
2929 intr = gk20a_readl(g, ltc_ltc0_ltss_intr_r());
2930 gk20a_err(dev_from_gk20a(g), "ltc: %08x\n", intr);
2931 gk20a_writel(g, ltc_ltc0_ltss_intr_r(), intr);
2932}
2933
2934bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
2935{
2936 u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());
2937 return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
2938 fb_mmu_debug_ctrl_debug_enabled_v();
2939}
2940
2941static int gk20a_mm_mmu_vpr_info_fetch_wait(struct gk20a *g,
2942 const unsigned int msec)
2943{
2944 unsigned long timeout;
2945
2946 timeout = jiffies + msecs_to_jiffies(msec);
2947 while (1) {
2948 u32 val;
2949
2950 val = gk20a_readl(g, fb_mmu_vpr_info_r());
2951 if (fb_mmu_vpr_info_fetch_v(val) ==
2952 fb_mmu_vpr_info_fetch_false_v())
2953 break;
2954
2955 if (tegra_platform_is_silicon() &&
2956 WARN_ON(time_after(jiffies, timeout)))
2957 return -ETIME;
2958 }
2959
2960 return 0;
2961}
2962
2963int gk20a_mm_mmu_vpr_info_fetch(struct gk20a *g)
2964{
2965 int ret = 0;
2966
2967 gk20a_busy_noresume(g->dev);
2968 if (!pm_runtime_active(&g->dev->dev))
2969 goto fail;
2970
2971 if (gk20a_mm_mmu_vpr_info_fetch_wait(g, 5)) {
2972 ret = -ETIME;
2973 goto fail;
2974 }
2975
2976 gk20a_writel(g, fb_mmu_vpr_info_r(),
2977 fb_mmu_vpr_info_fetch_true_v());
2978
2979 ret = gk20a_mm_mmu_vpr_info_fetch_wait(g, 5);
2980
2981 fail:
2982 gk20a_idle(g->dev);
2983 return ret;
2984}