Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c  632
1 file changed, 632 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
new file mode 100644
index 00000000..759ef816
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -0,0 +1,632 @@
/*
 * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <trace/events/gk20a.h>

#include <nvgpu/mm.h>
#include <nvgpu/vm.h>
#include <nvgpu/vm_area.h>
#include <nvgpu/dma.h>
#include <nvgpu/kmem.h>
#include <nvgpu/timers.h>
#include <nvgpu/pramin.h>
#include <nvgpu/list.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/allocator.h>
#include <nvgpu/semaphore.h>
#include <nvgpu/page_allocator.h>
#include <nvgpu/log.h>
#include <nvgpu/bug.h>
#include <nvgpu/log2.h>
#include <nvgpu/enabled.h>
#include <nvgpu/vidmem.h>

#include "gk20a.h"
#include "mm_gk20a.h"
#include "fence_gk20a.h"
#include "bus_gk20a.h"

#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
#include <nvgpu/hw/gk20a/hw_pram_gk20a.h>
#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
#include <nvgpu/hw/gk20a/hw_bus_gk20a.h>
#include <nvgpu/hw/gk20a/hw_flush_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ltc_gk20a.h>

/*
 * GPU mapping life cycle
 * ======================
 *
 * Kernel mappings
 * ---------------
 *
 * Kernel mappings are created through vm.map(..., false):
 *
 * - Mappings to the same allocations are reused and refcounted.
 * - This path does not support deferred unmapping (i.e. the kernel must wait
 *   for all hw operations on the buffer to complete before unmapping).
 * - References to dmabuf are owned and managed by the (kernel) clients of
 *   the gk20a_vm layer.
 *
 *
 * User space mappings
 * -------------------
 *
 * User space mappings are created through as.map_buffer -> vm.map(..., true):
 *
 * - Mappings to the same allocations are reused and refcounted.
 * - This path supports deferred unmapping (i.e. we delay the actual unmapping
 *   until all hw operations have completed).
 * - References to dmabuf are owned and managed by the vm_gk20a layer itself.
 *   vm.map acquires these refs, and sets mapped_buffer->own_mem_ref to record
 *   that we must release the refs when we actually unmap.
 */

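/*
 * Illustrative sketch of the kernel path described above: not part of
 * this file's API. It assumes the nvgpu_gmmu_map()/nvgpu_gmmu_unmap()
 * helpers from <nvgpu/gmmu.h>; the exact signatures used here are an
 * assumption, so the block is compiled out.
 */
#if 0
static int example_kernel_map_cycle(struct vm_gk20a *vm,
				    struct nvgpu_mem *mem)
{
	u64 gpu_va;

	/* Kernel mappings are refcounted per allocation (see above). */
	gpu_va = nvgpu_gmmu_map(vm, mem, mem->size,
				0,			/* flags */
				gk20a_mem_flag_none,	/* rw_flag */
				false,			/* priv */
				mem->aperture);
	if (!gpu_va)
		return -ENOMEM;

	/*
	 * No deferred unmap on this path: all HW work against gpu_va
	 * must be complete before the unmap below.
	 */
	nvgpu_gmmu_unmap(vm, mem, gpu_va);
	return 0;
}
#endif
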
/* gk20a_init_mm_support must be called before this function */
int gk20a_init_mm_setup_hw(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	int err;

	gk20a_dbg_fn("");

	g->ops.fb.set_mmu_page_size(g);
	if (g->ops.fb.set_use_full_comp_tag_line)
		mm->use_full_comp_tag_line =
			g->ops.fb.set_use_full_comp_tag_line(g);

	g->ops.fb.init_hw(g);

	if (g->ops.bus.bar1_bind)
		g->ops.bus.bar1_bind(g, &mm->bar1.inst_block);

	if (g->ops.mm.init_bar2_mm_hw_setup) {
		err = g->ops.mm.init_bar2_mm_hw_setup(g);
		if (err)
			return err;
	}

	if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g))
		return -EBUSY;

	gk20a_dbg_fn("done");
	return 0;
}

int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
{
	return vm->mmu_levels[0].lo_bit[0];
}

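/*
 * Worked illustration (hypothetical helper, not driver code): with the
 * gk20a_mm_levels_128k[] table below, lo_bit[0] is 27, so each
 * top-level PDE covers 1ULL << 27 bytes = 128 MB of GPU VA; the 64 KB
 * big-page layout (lo_bit[0] == 26) halves that to 64 MB.
 */
static inline u64 example_pde_coverage_bytes(struct vm_gk20a *vm)
{
	return 1ULL << gk20a_mm_pde_coverage_bit_count(vm);
}
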
/* for gk20a the "video memory" apertures here are misnomers. */
static inline u32 big_valid_pde0_bits(struct gk20a *g,
				      struct nvgpu_gmmu_pd *pd, u64 addr)
{
	u32 pde0_bits =
		nvgpu_aperture_mask(g, pd->mem,
				    gmmu_pde_aperture_big_sys_mem_ncoh_f(),
				    gmmu_pde_aperture_big_video_memory_f()) |
		gmmu_pde_address_big_sys_f(
			(u32)(addr >> gmmu_pde_address_shift_v()));

	return pde0_bits;
}

static inline u32 small_valid_pde1_bits(struct gk20a *g,
					struct nvgpu_gmmu_pd *pd, u64 addr)
{
	u32 pde1_bits =
		nvgpu_aperture_mask(g, pd->mem,
				    gmmu_pde_aperture_small_sys_mem_ncoh_f(),
				    gmmu_pde_aperture_small_video_memory_f()) |
		gmmu_pde_vol_small_true_f() | /* tbd: why? */
		gmmu_pde_address_small_sys_f(
			(u32)(addr >> gmmu_pde_address_shift_v()));

	return pde1_bits;
}

static void update_gmmu_pde_locked(struct vm_gk20a *vm,
				   const struct gk20a_mmu_level *l,
				   struct nvgpu_gmmu_pd *pd,
				   u32 pd_idx,
				   u64 virt_addr,
				   u64 phys_addr,
				   struct nvgpu_gmmu_attrs *attrs)
{
	struct gk20a *g = gk20a_from_vm(vm);
	bool small_valid, big_valid;
	u32 pd_offset = pd_offset_from_index(l, pd_idx);
	u32 pde_v[2] = {0, 0};

	small_valid = attrs->pgsz == gmmu_page_size_small;
	big_valid = attrs->pgsz == gmmu_page_size_big;

	pde_v[0] = gmmu_pde_size_full_f();
	pde_v[0] |= big_valid ?
		big_valid_pde0_bits(g, pd, phys_addr) :
		gmmu_pde_aperture_big_invalid_f();

	pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) :
		     (gmmu_pde_aperture_small_invalid_f() |
		      gmmu_pde_vol_small_false_f()))
		    |
		    (big_valid ? (gmmu_pde_vol_big_true_f()) :
		     gmmu_pde_vol_big_false_f());

	pte_dbg(g, attrs,
		"PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
		"GPU %#-12llx phys %#-12llx "
		"[0x%08x, 0x%08x]",
		pd_idx, l->entry_size, pd_offset,
		small_valid ? 'S' : '-',
		big_valid ? 'B' : '-',
		virt_addr, phys_addr,
		pde_v[1], pde_v[0]);

	pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]);
	pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]);
}

static void __update_pte_sparse(u32 *pte_w)
{
	pte_w[0] = gmmu_pte_valid_false_f();
	pte_w[1] |= gmmu_pte_vol_true_f();
}

static void __update_pte(struct vm_gk20a *vm,
			 u32 *pte_w,
			 u64 phys_addr,
			 struct nvgpu_gmmu_attrs *attrs)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
	u32 pte_valid = attrs->valid ?
		gmmu_pte_valid_true_f() :
		gmmu_pte_valid_false_f();
	u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v();
	u32 addr = attrs->aperture == APERTURE_SYSMEM ?
		gmmu_pte_address_sys_f(phys_shifted) :
		gmmu_pte_address_vid_f(phys_shifted);
	int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));

	pte_w[0] = pte_valid | addr;

	if (attrs->priv)
		pte_w[0] |= gmmu_pte_privilege_true_f();

	pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture,
					 gmmu_pte_aperture_sys_mem_ncoh_f(),
					 gmmu_pte_aperture_video_memory_f()) |
		gmmu_pte_kind_f(attrs->kind_v) |
		gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift));

	if (attrs->ctag && vm->mm->use_full_comp_tag_line &&
	    phys_addr & 0x10000)
		pte_w[1] |= gmmu_pte_comptagline_f(
			1 << (gmmu_pte_comptagline_s() - 1));

	if (attrs->rw_flag == gk20a_mem_flag_read_only) {
		pte_w[0] |= gmmu_pte_read_only_true_f();
		pte_w[1] |= gmmu_pte_write_disable_true_f();
	} else if (attrs->rw_flag == gk20a_mem_flag_write_only) {
		pte_w[1] |= gmmu_pte_read_disable_true_f();
	}

	if (!attrs->cacheable)
		pte_w[1] |= gmmu_pte_vol_true_f();

	if (attrs->ctag)
		attrs->ctag += page_size;
}

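/*
 * Worked example for __update_pte() above (illustrative; assumes
 * gmmu_pte_address_shift_v() == 12, i.e. 4 KB-aligned PTE addresses):
 * a valid SYSMEM page at phys 0x00200000 gives phys_shifted = 0x200,
 * so pte_w[0] = gmmu_pte_valid_true_f() | gmmu_pte_address_sys_f(0x200);
 * aperture, kind and comptagline are packed into pte_w[1].
 */
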
static void update_gmmu_pte_locked(struct vm_gk20a *vm,
				   const struct gk20a_mmu_level *l,
				   struct nvgpu_gmmu_pd *pd,
				   u32 pd_idx,
				   u64 virt_addr,
				   u64 phys_addr,
				   struct nvgpu_gmmu_attrs *attrs)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
	u32 pd_offset = pd_offset_from_index(l, pd_idx);
	u32 pte_w[2] = {0, 0};
	int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));

	if (phys_addr)
		__update_pte(vm, pte_w, phys_addr, attrs);
	else if (attrs->sparse)
		__update_pte_sparse(pte_w);

	pte_dbg(g, attrs,
		"PTE: i=%-4u size=%-2u offs=%-4u | "
		"GPU %#-12llx phys %#-12llx "
		"pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c%c "
		"ctag=0x%08x "
		"[0x%08x, 0x%08x]",
		pd_idx, l->entry_size, pd_offset,
		virt_addr, phys_addr,
		page_size >> 10,
		nvgpu_gmmu_perm_str(attrs->rw_flag),
		attrs->kind_v,
		nvgpu_aperture_str(attrs->aperture),
		attrs->cacheable ? 'C' : 'v',
		attrs->sparse ? 'S' : '-',
		attrs->priv ? 'P' : '-',
		attrs->coherent ? 'c' : '-',
		attrs->valid ? 'V' : '-',
		(u32)attrs->ctag >> ctag_shift,
		pte_w[1], pte_w[0]);

	pd_write(g, pd, pd_offset + 0, pte_w[0]);
	pd_write(g, pd, pd_offset + 1, pte_w[1]);
}

enum gmmu_pgsz_gk20a gk20a_get_pde_pgsz(struct gk20a *g,
					struct nvgpu_gmmu_pd *pd, u32 pd_idx)
{
	/*
	 * big and small page sizes are the same
	 */
	return gmmu_page_size_small;
}

enum gmmu_pgsz_gk20a gk20a_get_pte_pgsz(struct gk20a *g,
					struct nvgpu_gmmu_pd *pd, u32 pd_idx)
{
	/*
	 * return invalid
	 */
	return gmmu_nr_page_sizes;
}

const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
	{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
	 .lo_bit = {26, 26},
	 .update_entry = update_gmmu_pde_locked,
	 .entry_size = 8,
	 .get_pgsz = gk20a_get_pde_pgsz},
	{.hi_bit = {25, 25},
	 .lo_bit = {12, 16},
	 .update_entry = update_gmmu_pte_locked,
	 .entry_size = 8,
	 .get_pgsz = gk20a_get_pte_pgsz},
	{.update_entry = NULL}
};

const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
	{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
	 .lo_bit = {27, 27},
	 .update_entry = update_gmmu_pde_locked,
	 .entry_size = 8,
	 .get_pgsz = gk20a_get_pde_pgsz},
	{.hi_bit = {26, 26},
	 .lo_bit = {12, 17},
	 .update_entry = update_gmmu_pte_locked,
	 .entry_size = 8,
	 .get_pgsz = gk20a_get_pte_pgsz},
	{.update_entry = NULL}
};

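/*
 * Illustration (hypothetical helpers, not driver code): under the
 * 128 KB big-page layout above, a GPU VA splits into a PDE index from
 * bits [NV_GMMU_VA_RANGE-1:27] and a PTE index from bits [26:12] for
 * 4 KB small pages (bits [26:17] for 128 KB big pages).
 */
static inline u32 example_pde_index_128k(u64 va)
{
	return (u32)(va >> 27);
}

static inline u32 example_small_pte_index_128k(u64 va)
{
	return (u32)((va & ((1ULL << 27) - 1)) >> 12);
}
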
int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
{
	int err = 0;

	gk20a_dbg_fn("");

	nvgpu_vm_get(vm);
	ch->vm = vm;
	err = channel_gk20a_commit_va(ch);
	if (err)
		ch->vm = NULL;

	nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s",
		  ch->chid, vm->name);

	return err;
}

int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
			  struct channel_gk20a *ch)
{
	return __gk20a_vm_bind_channel(as_share->vm, ch);
}

void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
		       struct vm_gk20a *vm)
{
	u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
	u32 pdb_addr_hi = u64_hi32(pdb_addr);

	gk20a_dbg_info("pde pa=0x%llx", pdb_addr);

	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
		       nvgpu_aperture_mask(g, vm->pdb.mem,
			       ram_in_page_dir_base_target_sys_mem_ncoh_f(),
			       ram_in_page_dir_base_target_vid_mem_f()) |
		       ram_in_page_dir_base_vol_true_f() |
		       ram_in_page_dir_base_lo_f(pdb_addr_lo));

	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_hi_w(),
		       ram_in_page_dir_base_hi_f(pdb_addr_hi));
}

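/*
 * Worked example for gk20a_mm_init_pdb() above (illustrative; assumes
 * ram_in_base_shift_v() == 12): for pdb_addr = 0x100001000 the low
 * word is written with ram_in_page_dir_base_lo_f(0x100001), while the
 * high word takes the unshifted upper bits,
 * ram_in_page_dir_base_hi_f(0x1).
 */
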
void gk20a_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm,
			   u32 big_page_size)
{
	struct gk20a *g = gk20a_from_vm(vm);

	gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p",
		       nvgpu_inst_block_addr(g, inst_block),
		       inst_block->cpu_va);

	g->ops.mm.init_pdb(g, inst_block, vm);

	nvgpu_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(),
		       u64_lo32(vm->va_limit - 1) & ~0xfff);

	nvgpu_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(),
		       ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1)));

	if (big_page_size && g->ops.mm.set_big_page_size)
		g->ops.mm.set_big_page_size(g, inst_block, big_page_size);
}

int gk20a_alloc_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block)
{
	int err;

	gk20a_dbg_fn("");

	err = nvgpu_dma_alloc(g, ram_in_alloc_size_v(), inst_block);
	if (err) {
		nvgpu_err(g, "%s: memory allocation failed", __func__);
		return err;
	}

	gk20a_dbg_fn("done");
	return 0;
}

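/*
 * Usage sketch (hypothetical caller, not driver code): the two helpers
 * above combine as follows; passing 0 for big_page_size leaves the
 * default in place, per the check in gk20a_init_inst_block().
 */
static inline int example_setup_inst_block(struct gk20a *g,
					   struct vm_gk20a *vm,
					   struct nvgpu_mem *inst_block)
{
	int err = gk20a_alloc_inst_block(g, inst_block);

	if (err)
		return err;

	gk20a_init_inst_block(inst_block, vm, 0);
	return 0;
}
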
int gk20a_mm_fb_flush(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	int ret = 0;
	struct nvgpu_timeout timeout;
	u32 retries;

	gk20a_dbg_fn("");

	gk20a_busy_noresume(g);
	if (!g->power_on) {
		gk20a_idle_nosuspend(g);
		return 0;
	}

	retries = 100;

	if (g->ops.mm.get_flush_retries)
		retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_FB);

	nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);

	nvgpu_mutex_acquire(&mm->l2_op_lock);

	/*
	 * Make sure all previous writes are committed to the L2. There's no
	 * guarantee that writes are to DRAM. This will be a sysmembar
	 * internal to the L2.
	 */

	trace_gk20a_mm_fb_flush(g->name);

	gk20a_writel(g, flush_fb_flush_r(),
		     flush_fb_flush_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_fb_flush_r());

		if (flush_fb_flush_outstanding_v(data) ==
		    flush_fb_flush_outstanding_true_v() ||
		    flush_fb_flush_pending_v(data) ==
		    flush_fb_flush_pending_busy_v()) {
			gk20a_dbg_info("fb_flush 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired(&timeout));

	if (nvgpu_timeout_peek_expired(&timeout)) {
		if (g->ops.fb.dump_vpr_wpr_info)
			g->ops.fb.dump_vpr_wpr_info(g);
		ret = -EBUSY;
	}

	trace_gk20a_mm_fb_flush_done(g->name);

	nvgpu_mutex_release(&mm->l2_op_lock);

	gk20a_idle_nosuspend(g);

	return ret;
}

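/*
 * gk20a_mm_fb_flush() above and the L2/CBC maintenance routines below
 * all share one poll pattern: kick the operation by writing its
 * "pending busy" value, then re-read the register until neither
 * "outstanding" nor "pending" is set, backing off 5 us per iteration
 * under an nvgpu_timeout. A condensed sketch of that pattern
 * (hypothetical helper, not driver code):
 */
static inline int example_poll_until_idle(struct gk20a *g, u32 reg,
					  bool (*busy)(u32 data),
					  u32 retries)
{
	struct nvgpu_timeout timeout;
	u32 data;

	nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);

	do {
		data = gk20a_readl(g, reg);
		if (!busy(data))
			return 0;
		nvgpu_udelay(5);
	} while (!nvgpu_timeout_expired(&timeout));

	return -EBUSY;
}
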
static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
{
	u32 data;
	struct nvgpu_timeout timeout;
	u32 retries = 200;

	trace_gk20a_mm_l2_invalidate(g->name);

	if (g->ops.mm.get_flush_retries)
		retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_L2_INV);

	nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);

	/*
	 * Invalidate any clean lines from the L2 so subsequent reads go to
	 * DRAM. Dirty lines are not affected by this operation.
	 */
	gk20a_writel(g, flush_l2_system_invalidate_r(),
		     flush_l2_system_invalidate_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_system_invalidate_r());

		if (flush_l2_system_invalidate_outstanding_v(data) ==
		    flush_l2_system_invalidate_outstanding_true_v() ||
		    flush_l2_system_invalidate_pending_v(data) ==
		    flush_l2_system_invalidate_pending_busy_v()) {
			gk20a_dbg_info("l2_system_invalidate 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired(&timeout));

	if (nvgpu_timeout_peek_expired(&timeout))
		nvgpu_warn(g, "l2_system_invalidate too many retries");

	trace_gk20a_mm_l2_invalidate_done(g->name);
}

void gk20a_mm_l2_invalidate(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;

	gk20a_busy_noresume(g);
	if (g->power_on) {
		nvgpu_mutex_acquire(&mm->l2_op_lock);
		gk20a_mm_l2_invalidate_locked(g);
		nvgpu_mutex_release(&mm->l2_op_lock);
	}
	gk20a_idle_nosuspend(g);
}

void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	struct nvgpu_timeout timeout;
	u32 retries = 2000;

	gk20a_dbg_fn("");

	gk20a_busy_noresume(g);
	if (!g->power_on)
		goto hw_was_off;

	if (g->ops.mm.get_flush_retries)
		retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_L2_FLUSH);

	nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);

	nvgpu_mutex_acquire(&mm->l2_op_lock);

	trace_gk20a_mm_l2_flush(g->name);

	/*
	 * Flush all dirty lines from the L2 to DRAM. Lines are left in the
	 * L2 as clean, so subsequent reads might hit in the L2.
	 */
	gk20a_writel(g, flush_l2_flush_dirty_r(),
		     flush_l2_flush_dirty_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_flush_dirty_r());

		if (flush_l2_flush_dirty_outstanding_v(data) ==
		    flush_l2_flush_dirty_outstanding_true_v() ||
		    flush_l2_flush_dirty_pending_v(data) ==
		    flush_l2_flush_dirty_pending_busy_v()) {
			gk20a_dbg_info("l2_flush_dirty 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired_msg(&timeout,
					    "l2_flush_dirty too many retries"));

	trace_gk20a_mm_l2_flush_done(g->name);

	if (invalidate)
		gk20a_mm_l2_invalidate_locked(g);

	nvgpu_mutex_release(&mm->l2_op_lock);

hw_was_off:
	gk20a_idle_nosuspend(g);
}

void gk20a_mm_cbc_clean(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	struct nvgpu_timeout timeout;
	u32 retries = 200;

	gk20a_dbg_fn("");

	gk20a_busy_noresume(g);
	if (!g->power_on)
		goto hw_was_off;

	if (g->ops.mm.get_flush_retries)
		retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_CBC_CLEAN);

	nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);

	nvgpu_mutex_acquire(&mm->l2_op_lock);

	/* Flush all dirty lines from the CBC to L2 */
	gk20a_writel(g, flush_l2_clean_comptags_r(),
		     flush_l2_clean_comptags_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_clean_comptags_r());

		if (flush_l2_clean_comptags_outstanding_v(data) ==
		    flush_l2_clean_comptags_outstanding_true_v() ||
		    flush_l2_clean_comptags_pending_v(data) ==
		    flush_l2_clean_comptags_pending_busy_v()) {
			gk20a_dbg_info("l2_clean_comptags 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired_msg(&timeout,
					    "l2_clean_comptags too many retries"));

	nvgpu_mutex_release(&mm->l2_op_lock);

hw_was_off:
	gk20a_idle_nosuspend(g);
}

u32 gk20a_mm_get_iommu_bit(struct gk20a *g)
{
	return 34;
}

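/*
 * Illustration (hypothetical helper, not driver code): physical
 * addresses that have been translated by the SoC IOMMU/SMMU carry this
 * bit when presented to the GMMU, e.g.:
 */
static inline u64 example_mark_iommu_translated(struct gk20a *g, u64 phys)
{
	return phys | (1ULL << gk20a_mm_get_iommu_bit(g));
}
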
const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
						      u32 big_page_size)
{
	return (big_page_size == SZ_64K) ?
		gk20a_mm_levels_64k : gk20a_mm_levels_128k;
}