/*
 * GK20A memory management
 *
 * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "gk20a.h"
#include "platform_gk20a.h"
#include "mm_gk20a.h"
#include "fence_gk20a.h"
#include "kind_gk20a.h"
#include "bus_gk20a.h"

#include "common/linux/os_linux.h"

#include
#include
#include
#include
#include
#include
#include

/*
 * GPU mapping life cycle
 * ======================
 *
 * Kernel mappings
 * ---------------
 *
 * Kernel mappings are created through vm.map(..., false):
 *
 *  - Mappings to the same allocations are reused and refcounted.
 *  - This path does not support deferred unmapping (i.e. the kernel must wait
 *    for all hw operations on the buffer to complete before unmapping).
 *  - References to dmabuf are owned and managed by the (kernel) clients of
 *    the gk20a_vm layer.
 *
 *
 * User space mappings
 * -------------------
 *
 * User space mappings are created through as.map_buffer -> vm.map(..., true):
 *
 *  - Mappings to the same allocations are reused and refcounted.
 *  - This path supports deferred unmapping (i.e. we delay the actual
 *    unmapping until all hw operations have completed).
 *  - References to dmabuf are owned and managed by the vm_gk20a layer itself.
 *    vm.map acquires these refs, and sets mapped_buffer->own_mem_ref to
 *    record that we must release the refs when we actually unmap.
 */
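/*
 * A minimal sketch of the refcounting contract above, using the conceptual
 * vm.map()/unmap notation from this comment rather than concrete driver
 * entry points:
 *
 *   map(buf)   -> mapping created, refcount = 1
 *   map(buf)   -> same mapping reused, refcount = 2
 *   unmap(buf) -> refcount = 1, mapping stays live
 *   unmap(buf) -> refcount = 0; the kernel path unmaps immediately (the
 *                 caller must already have waited for hw), the user path
 *                 defers the unmap until outstanding hw work completes and
 *                 then drops the dmabuf refs taken in vm.map().
 */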
static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
static int __must_check gk20a_init_cde_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_ce_vm(struct mm_gk20a *mm);

static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
{
	gk20a_dbg_fn("");

	if (g->ops.fb.reset)
		g->ops.fb.reset(g);

	if (g->ops.clock_gating.slcg_fb_load_gating_prod)
		g->ops.clock_gating.slcg_fb_load_gating_prod(g,
				g->slcg_enabled);
	if (g->ops.clock_gating.slcg_ltc_load_gating_prod)
		g->ops.clock_gating.slcg_ltc_load_gating_prod(g,
				g->slcg_enabled);
	if (g->ops.clock_gating.blcg_fb_load_gating_prod)
		g->ops.clock_gating.blcg_fb_load_gating_prod(g,
				g->blcg_enabled);
	if (g->ops.clock_gating.blcg_ltc_load_gating_prod)
		g->ops.clock_gating.blcg_ltc_load_gating_prod(g,
				g->blcg_enabled);

	if (g->ops.fb.init_fs_state)
		g->ops.fb.init_fs_state(g);

	return 0;
}

static void gk20a_remove_mm_ce_support(struct mm_gk20a *mm)
{
	struct gk20a *g = gk20a_from_mm(mm);

	if (mm->vidmem.ce_ctx_id != (u32)~0)
		gk20a_ce_delete_context_priv(g, mm->vidmem.ce_ctx_id);

	mm->vidmem.ce_ctx_id = (u32)~0;

	nvgpu_vm_put(mm->ce.vm);
}

static void gk20a_remove_mm_support(struct mm_gk20a *mm)
{
	struct gk20a *g = gk20a_from_mm(mm);

	if (g->ops.mm.fault_info_mem_destroy)
		g->ops.mm.fault_info_mem_destroy(g);

	if (g->ops.mm.remove_bar2_vm)
		g->ops.mm.remove_bar2_vm(g);

	if (g->ops.mm.is_bar1_supported(g)) {
		gk20a_free_inst_block(g, &mm->bar1.inst_block);
		nvgpu_vm_put(mm->bar1.vm);
	}

	gk20a_free_inst_block(g, &mm->pmu.inst_block);
	gk20a_free_inst_block(g, &mm->hwpm.inst_block);
	nvgpu_vm_put(mm->pmu.vm);
	nvgpu_vm_put(mm->cde.vm);

	nvgpu_semaphore_sea_destroy(g);
	nvgpu_vidmem_destroy(g);
	nvgpu_pd_cache_fini(g);
}

static int gk20a_alloc_sysmem_flush(struct gk20a *g)
{
	return nvgpu_dma_alloc_sys(g, SZ_4K, &g->mm.sysmem_flush);
}

int gk20a_init_mm_setup_sw(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	int err;

	gk20a_dbg_fn("");

	if (mm->sw_ready) {
		gk20a_dbg_fn("skip init");
		return 0;
	}

	mm->g = g;
	nvgpu_mutex_init(&mm->l2_op_lock);

	/* TBD: make channel vm size configurable */
	mm->channel.user_size = NV_MM_DEFAULT_USER_SIZE -
		NV_MM_DEFAULT_KERNEL_SIZE;
	mm->channel.kernel_size = NV_MM_DEFAULT_KERNEL_SIZE;

	gk20a_dbg_info("channel vm size: user %dMB  kernel %dMB",
		       (int)(mm->channel.user_size >> 20),
		       (int)(mm->channel.kernel_size >> 20));

	nvgpu_init_pramin(mm);

	mm->vidmem.ce_ctx_id = (u32)~0;

	err = nvgpu_vidmem_init(mm);
	if (err)
		return err;

	/*
	 * This requires fixed allocations in vidmem which must be
	 * allocated before all other buffers.
	 */
	if (g->ops.pmu.alloc_blob_space &&
	    !nvgpu_is_enabled(g, NVGPU_MM_UNIFIED_MEMORY)) {
		err = g->ops.pmu.alloc_blob_space(g, 0, &g->acr.ucode_blob);
		if (err)
			return err;
	}

	err = gk20a_alloc_sysmem_flush(g);
	if (err)
		return err;

	if (g->ops.mm.is_bar1_supported(g)) {
		err = gk20a_init_bar1_vm(mm);
		if (err)
			return err;
	}
	if (g->ops.mm.init_bar2_vm) {
		err = g->ops.mm.init_bar2_vm(g);
		if (err)
			return err;
	}
	err = gk20a_init_system_vm(mm);
	if (err)
		return err;

	err = gk20a_init_hwpm(mm);
	if (err)
		return err;

	err = gk20a_init_cde_vm(mm);
	if (err)
		return err;

	err = gk20a_init_ce_vm(mm);
	if (err)
		return err;

	mm->remove_support = gk20a_remove_mm_support;
	mm->remove_ce_support = gk20a_remove_mm_ce_support;

	mm->sw_ready = true;

	gk20a_dbg_fn("done");
	return 0;
}

/* make sure gk20a_init_mm_support is called before */
int gk20a_init_mm_setup_hw(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	int err;

	gk20a_dbg_fn("");
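	/*
	 * Program the FB page size and (where supported) the full comptag
	 * line mode, bring up the FB, bind the BAR1/BAR2 instance blocks,
	 * and finish with an initial FB flush.
	 */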
	g->ops.fb.set_mmu_page_size(g);
	if (g->ops.fb.set_use_full_comp_tag_line)
		mm->use_full_comp_tag_line =
			g->ops.fb.set_use_full_comp_tag_line(g);

	g->ops.fb.init_hw(g);

	if (g->ops.bus.bar1_bind)
		g->ops.bus.bar1_bind(g, &mm->bar1.inst_block);

	if (g->ops.mm.init_bar2_mm_hw_setup) {
		err = g->ops.mm.init_bar2_mm_hw_setup(g);
		if (err)
			return err;
	}

	if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g))
		return -EBUSY;

	gk20a_dbg_fn("done");
	return 0;
}

int gk20a_init_mm_support(struct gk20a *g)
{
	int err;

	err = gk20a_init_mm_reset_enable_hw(g);
	if (err)
		return err;

	err = gk20a_init_mm_setup_sw(g);
	if (err)
		return err;

	if (g->ops.mm.init_mm_setup_hw)
		err = g->ops.mm.init_mm_setup_hw(g);

	return err;
}

void gk20a_init_mm_ce_context(struct gk20a *g)
{
#if defined(CONFIG_GK20A_VIDMEM)
	if (g->mm.vidmem.size && (g->mm.vidmem.ce_ctx_id == (u32)~0)) {
		g->mm.vidmem.ce_ctx_id =
			gk20a_ce_create_context_with_cb(g,
				gk20a_fifo_get_fast_ce_runlist_id(g),
				-1,
				-1,
				-1,
				NULL);

		if (g->mm.vidmem.ce_ctx_id == (u32)~0)
			nvgpu_err(g,
				"Failed to allocate CE context for vidmem page clearing support");
	}
#endif
}

int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
{
	return vm->mmu_levels[0].lo_bit[0];
}

/* for gk20a the "video memory" apertures here are misnomers. */
static inline u32 big_valid_pde0_bits(struct gk20a *g,
				      struct nvgpu_gmmu_pd *pd, u64 addr)
{
	u32 pde0_bits =
		nvgpu_aperture_mask(g, pd->mem,
				    gmmu_pde_aperture_big_sys_mem_ncoh_f(),
				    gmmu_pde_aperture_big_video_memory_f()) |
		gmmu_pde_address_big_sys_f(
			(u32)(addr >> gmmu_pde_address_shift_v()));

	return pde0_bits;
}

static inline u32 small_valid_pde1_bits(struct gk20a *g,
					struct nvgpu_gmmu_pd *pd, u64 addr)
{
	u32 pde1_bits =
		nvgpu_aperture_mask(g, pd->mem,
				    gmmu_pde_aperture_small_sys_mem_ncoh_f(),
				    gmmu_pde_aperture_small_video_memory_f()) |
		gmmu_pde_vol_small_true_f() | /* tbd: why? */
		gmmu_pde_address_small_sys_f(
			(u32)(addr >> gmmu_pde_address_shift_v()));

	return pde1_bits;
}

static void update_gmmu_pde_locked(struct vm_gk20a *vm,
				   const struct gk20a_mmu_level *l,
				   struct nvgpu_gmmu_pd *pd,
				   u32 pd_idx,
				   u64 virt_addr,
				   u64 phys_addr,
				   struct nvgpu_gmmu_attrs *attrs)
{
	struct gk20a *g = gk20a_from_vm(vm);
	bool small_valid, big_valid;
	u32 pd_offset = pd_offset_from_index(l, pd_idx);
	u32 pde_v[2] = {0, 0};

	small_valid = attrs->pgsz == gmmu_page_size_small;
	big_valid   = attrs->pgsz == gmmu_page_size_big;

	pde_v[0] = gmmu_pde_size_full_f();
	pde_v[0] |= big_valid ?
		big_valid_pde0_bits(g, pd, phys_addr) :
		gmmu_pde_aperture_big_invalid_f();

	pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) :
		     (gmmu_pde_aperture_small_invalid_f() |
		      gmmu_pde_vol_small_false_f()))
		|
		(big_valid ? (gmmu_pde_vol_big_true_f()) :
		 gmmu_pde_vol_big_false_f());

	pte_dbg(g, attrs,
		"PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
		"GPU %#-12llx  phys %#-12llx "
		"[0x%08x, 0x%08x]",
		pd_idx, l->entry_size, pd_offset,
		small_valid ? 'S' : '-',
		big_valid ?   'B' : '-',
		virt_addr, phys_addr,
		pde_v[1], pde_v[0]);

	pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]);
	pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]);
}

static void __update_pte_sparse(u32 *pte_w)
{
	pte_w[0] = gmmu_pte_valid_false_f();
	pte_w[1] |= gmmu_pte_vol_true_f();
}

static void __update_pte(struct vm_gk20a *vm,
			 u32 *pte_w,
			 u64 phys_addr,
			 struct nvgpu_gmmu_attrs *attrs)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
	u32 pte_valid = attrs->valid ?
		gmmu_pte_valid_true_f() :
		gmmu_pte_valid_false_f();
	u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v();
	u32 addr = attrs->aperture == APERTURE_SYSMEM ?
		   gmmu_pte_address_sys_f(phys_shifted) :
		   gmmu_pte_address_vid_f(phys_shifted);
	int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));

	pte_w[0] = pte_valid | addr;

	if (attrs->priv)
		pte_w[0] |= gmmu_pte_privilege_true_f();

	pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture,
					 gmmu_pte_aperture_sys_mem_ncoh_f(),
					 gmmu_pte_aperture_video_memory_f()) |
		gmmu_pte_kind_f(attrs->kind_v) |
		gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift));

	if (attrs->ctag && vm->mm->use_full_comp_tag_line &&
	    phys_addr & 0x10000)
		pte_w[1] |= gmmu_pte_comptagline_f(
			1 << (gmmu_pte_comptagline_s() - 1));

	if (attrs->rw_flag == gk20a_mem_flag_read_only) {
		pte_w[0] |= gmmu_pte_read_only_true_f();
		pte_w[1] |= gmmu_pte_write_disable_true_f();
	} else if (attrs->rw_flag == gk20a_mem_flag_write_only) {
		pte_w[1] |= gmmu_pte_read_disable_true_f();
	}

	if (!attrs->cacheable)
		pte_w[1] |= gmmu_pte_vol_true_f();

	if (attrs->ctag)
		attrs->ctag += page_size;
}

static void update_gmmu_pte_locked(struct vm_gk20a *vm,
				   const struct gk20a_mmu_level *l,
				   struct nvgpu_gmmu_pd *pd,
				   u32 pd_idx,
				   u64 virt_addr,
				   u64 phys_addr,
				   struct nvgpu_gmmu_attrs *attrs)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
	u32 pd_offset = pd_offset_from_index(l, pd_idx);
	u32 pte_w[2] = {0, 0};
	int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));

	if (phys_addr)
		__update_pte(vm, pte_w, phys_addr, attrs);
	else if (attrs->sparse)
		__update_pte_sparse(pte_w);

	pte_dbg(g, attrs,
		"PTE: i=%-4u size=%-2u offs=%-4u | "
		"GPU %#-12llx  phys %#-12llx "
		"pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c%c "
		"ctag=0x%08x "
		"[0x%08x, 0x%08x]",
		pd_idx, l->entry_size, pd_offset,
		virt_addr, phys_addr,
		page_size >> 10,
		nvgpu_gmmu_perm_str(attrs->rw_flag),
		attrs->kind_v,
		nvgpu_aperture_str(attrs->aperture),
		attrs->cacheable ? 'C' : 'v',
		attrs->sparse    ? 'S' : '-',
		attrs->priv      ? 'P' : '-',
		attrs->coherent  ? 'c' : '-',
		attrs->valid     ? 'V' : '-',
		(u32)attrs->ctag >> ctag_shift,
		pte_w[1], pte_w[0]);

	pd_write(g, pd, pd_offset + 0, pte_w[0]);
	pd_write(g, pd, pd_offset + 1, pte_w[1]);
}

const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
	{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
	 .lo_bit = {26, 26},
	 .update_entry = update_gmmu_pde_locked,
	 .entry_size = 8},
	{.hi_bit = {25, 25},
	 .lo_bit = {12, 16},
	 .update_entry = update_gmmu_pte_locked,
	 .entry_size = 8},
	{.update_entry = NULL}
};

const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
	{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
	 .lo_bit = {27, 27},
	 .update_entry = update_gmmu_pde_locked,
	 .entry_size = 8},
	{.hi_bit = {26, 26},
	 .lo_bit = {12, 17},
	 .update_entry = update_gmmu_pte_locked,
	 .entry_size = 8},
	{.update_entry = NULL}
};

/*
 * Attempt to find a reserved memory area to determine PTE size for the passed
 * mapping. If no reserved area can be found use small pages.
 */
enum gmmu_pgsz_gk20a __get_pte_size_fixed_map(struct vm_gk20a *vm,
					      u64 base, u64 size)
{
	struct nvgpu_vm_area *vm_area;

	vm_area = nvgpu_vm_area_find(vm, base);
	if (!vm_area)
		return gmmu_page_size_small;

	return vm_area->pgsz_idx;
}

/*
 * This is for when the address space does not support unified address spaces.
 */
static enum gmmu_pgsz_gk20a __get_pte_size_split_addr(struct vm_gk20a *vm,
						      u64 base, u64 size)
{
	if (!base) {
		if (size >= vm->gmmu_page_sizes[gmmu_page_size_big])
			return gmmu_page_size_big;
		return gmmu_page_size_small;
	} else {
		if (base < __nv_gmmu_va_small_page_limit())
			return gmmu_page_size_small;
		else
			return gmmu_page_size_big;
	}
}

/*
 * This determines the PTE size for a given alloc. Used by both the GVA space
 * allocator and the mm core code so that agreement can be reached on how to
 * map allocations.
 *
 * The page size of a buffer is this:
 *
 *   o  If the VM doesn't support large pages then obviously small pages
 *      must be used.
 *   o  If the base address is non-zero (fixed address map):
 *      -  Attempt to find a reserved memory area and use the page size
 *         based on that.
 *      -  If no reserved page size is available, default to small pages.
 *   o  If the base is zero:
 *      -  If the size is larger than or equal to the big page size, use big
 *         pages.
 *      -  Otherwise use small pages.
 */
enum gmmu_pgsz_gk20a __get_pte_size(struct vm_gk20a *vm, u64 base, u64 size)
{
	struct gk20a *g = gk20a_from_vm(vm);

	if (!vm->big_pages)
		return gmmu_page_size_small;

	if (!nvgpu_is_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES))
		return __get_pte_size_split_addr(vm, base, size);

	if (base)
		return __get_pte_size_fixed_map(vm, base, size);

	if (size >= vm->gmmu_page_sizes[gmmu_page_size_big])
		return gmmu_page_size_big;
	return gmmu_page_size_small;
}
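/*
 * A worked example of the rules above, assuming a unified address space and
 * a 64KB big page size:
 *
 *   base = 0,        size = 4KB    -> gmmu_page_size_small
 *   base = 0,        size = 128KB  -> gmmu_page_size_big (size >= big page)
 *   base = fixed VA, any size      -> page size of the reserved VM area
 *                                     containing the VA, or small pages if
 *                                     no reservation is found
 *
 * VMs without big page support always use gmmu_page_size_small.
 */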
int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
{
	int err = 0;

	gk20a_dbg_fn("");

	nvgpu_vm_get(vm);
	ch->vm = vm;
	err = channel_gk20a_commit_va(ch);
	if (err)
		ch->vm = NULL;

	nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s",
		  ch->chid, vm->name);

	return err;
}

int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
			  struct channel_gk20a *ch)
{
	return __gk20a_vm_bind_channel(as_share->vm, ch);
}

int gk20a_alloc_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block)
{
	int err;

	gk20a_dbg_fn("");

	err = nvgpu_dma_alloc(g, ram_in_alloc_size_v(), inst_block);
	if (err) {
		nvgpu_err(g, "%s: memory allocation failed", __func__);
		return err;
	}

	gk20a_dbg_fn("done");
	return 0;
}

void gk20a_free_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block)
{
	if (inst_block->size)
		nvgpu_dma_free(g, inst_block);
}

u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block)
{
	if (g->mm.has_physical_mode)
		return nvgpu_mem_get_phys_addr(g, inst_block);
	else
		return nvgpu_mem_get_addr(g, inst_block);
}

static int gk20a_init_bar1_vm(struct mm_gk20a *mm)
{
	int err;
	struct gk20a *g = gk20a_from_mm(mm);
	struct nvgpu_mem *inst_block = &mm->bar1.inst_block;
	u32 big_page_size = g->ops.mm.get_default_big_page_size();

	mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
	gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
	mm->bar1.vm = nvgpu_vm_init(g,
				    big_page_size,
				    SZ_4K,
				    mm->bar1.aperture_size - SZ_4K,
				    mm->bar1.aperture_size,
				    true, false,
				    "bar1");
	if (!mm->bar1.vm)
		return -ENOMEM;

	err = gk20a_alloc_inst_block(g, inst_block);
	if (err)
		goto clean_up_vm;
	g->ops.mm.init_inst_block(inst_block, mm->bar1.vm, big_page_size);

	return 0;

clean_up_vm:
	nvgpu_vm_put(mm->bar1.vm);
	return err;
}

/* pmu vm, share channel_vm interfaces */
static int gk20a_init_system_vm(struct mm_gk20a *mm)
{
	int err;
	struct gk20a *g = gk20a_from_mm(mm);
	struct nvgpu_mem *inst_block = &mm->pmu.inst_block;
	u32 big_page_size = g->ops.mm.get_default_big_page_size();
	u32 low_hole, aperture_size;

	/*
	 * No user region - so we will pass that as zero sized.
	 */
	low_hole = SZ_4K * 16;
	aperture_size = GK20A_PMU_VA_SIZE * 2;

	mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
	gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);

	mm->pmu.vm = nvgpu_vm_init(g, big_page_size,
				   low_hole,
				   aperture_size - low_hole,
				   aperture_size,
				   true,
				   false,
				   "system");
	if (!mm->pmu.vm)
		return -ENOMEM;

	err = gk20a_alloc_inst_block(g, inst_block);
	if (err)
		goto clean_up_vm;
	g->ops.mm.init_inst_block(inst_block, mm->pmu.vm, big_page_size);

	return 0;

clean_up_vm:
	nvgpu_vm_put(mm->pmu.vm);
	return err;
}

static int gk20a_init_hwpm(struct mm_gk20a *mm)
{
	int err;
	struct gk20a *g = gk20a_from_mm(mm);
	struct nvgpu_mem *inst_block = &mm->hwpm.inst_block;

	err = gk20a_alloc_inst_block(g, inst_block);
	if (err)
		return err;
	g->ops.mm.init_inst_block(inst_block, mm->pmu.vm, 0);

	return 0;
}

static int gk20a_init_cde_vm(struct mm_gk20a *mm)
{
	struct gk20a *g = gk20a_from_mm(mm);
	u32 big_page_size = g->ops.mm.get_default_big_page_size();

	mm->cde.vm = nvgpu_vm_init(g, big_page_size,
				   big_page_size << 10,
				   NV_MM_DEFAULT_KERNEL_SIZE,
				   NV_MM_DEFAULT_KERNEL_SIZE +
						NV_MM_DEFAULT_USER_SIZE,
				   false, false, "cde");
	if (!mm->cde.vm)
		return -ENOMEM;
	return 0;
}

static int gk20a_init_ce_vm(struct mm_gk20a *mm)
{
	struct gk20a *g = gk20a_from_mm(mm);
	u32 big_page_size = g->ops.mm.get_default_big_page_size();

	mm->ce.vm = nvgpu_vm_init(g, big_page_size,
				  big_page_size << 10,
				  NV_MM_DEFAULT_KERNEL_SIZE,
				  NV_MM_DEFAULT_KERNEL_SIZE +
						NV_MM_DEFAULT_USER_SIZE,
				  false, false, "ce");
	if (!mm->ce.vm)
		return -ENOMEM;
	return 0;
}

void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
		       struct vm_gk20a *vm)
{
	u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
	u32 pdb_addr_hi = u64_hi32(pdb_addr);

	gk20a_dbg_info("pde pa=0x%llx", pdb_addr);

	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
		nvgpu_aperture_mask(g, vm->pdb.mem,
				ram_in_page_dir_base_target_sys_mem_ncoh_f(),
				ram_in_page_dir_base_target_vid_mem_f()) |
		ram_in_page_dir_base_vol_true_f() |
		ram_in_page_dir_base_lo_f(pdb_addr_lo));

	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_hi_w(),
		ram_in_page_dir_base_hi_f(pdb_addr_hi));
}

void gk20a_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm,
			   u32 big_page_size)
{
	struct gk20a *g = gk20a_from_vm(vm);

	gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p",
		       gk20a_mm_inst_block_addr(g, inst_block),
		       inst_block->cpu_va);

	g->ops.mm.init_pdb(g, inst_block, vm);

	nvgpu_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(),
		u64_lo32(vm->va_limit - 1) & ~0xfff);

	nvgpu_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(),
		ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1)));

	if (big_page_size && g->ops.mm.set_big_page_size)
		g->ops.mm.set_big_page_size(g, inst_block, big_page_size);
}

int gk20a_mm_fb_flush(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	int ret = 0;
	struct nvgpu_timeout timeout;

	gk20a_dbg_fn("");

	gk20a_busy_noresume(g);
	if (!g->power_on) {
		gk20a_idle_nosuspend(g);
		return 0;
	}

	nvgpu_timeout_init(g, &timeout, 100, NVGPU_TIMER_RETRY_TIMER);

	nvgpu_mutex_acquire(&mm->l2_op_lock);

	/*
	 * Make sure all previous writes are committed to the L2. There's no
	 * guarantee that writes are to DRAM. This will be a sysmembar
	 * internal to the L2.
	 */
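	/*
	 * Kick off the FB flush and poll flush_fb_flush_r() until the
	 * operation is neither outstanding nor pending, or until the retry
	 * timeout expires.
	 */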
	trace_gk20a_mm_fb_flush(g->name);

	gk20a_writel(g, flush_fb_flush_r(),
		     flush_fb_flush_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_fb_flush_r());

		if (flush_fb_flush_outstanding_v(data) ==
			flush_fb_flush_outstanding_true_v() ||
		    flush_fb_flush_pending_v(data) ==
			flush_fb_flush_pending_busy_v()) {
			gk20a_dbg_info("fb_flush 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired(&timeout));

	if (nvgpu_timeout_peek_expired(&timeout)) {
		if (g->ops.fb.dump_vpr_wpr_info)
			g->ops.fb.dump_vpr_wpr_info(g);
		ret = -EBUSY;
	}

	trace_gk20a_mm_fb_flush_done(g->name);

	nvgpu_mutex_release(&mm->l2_op_lock);

	gk20a_idle_nosuspend(g);

	return ret;
}

static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
{
	u32 data;
	struct nvgpu_timeout timeout;

	trace_gk20a_mm_l2_invalidate(g->name);

	nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);

	/*
	 * Invalidate any clean lines from the L2 so subsequent reads go to
	 * DRAM. Dirty lines are not affected by this operation.
	 */
	gk20a_writel(g, flush_l2_system_invalidate_r(),
		     flush_l2_system_invalidate_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_system_invalidate_r());

		if (flush_l2_system_invalidate_outstanding_v(data) ==
			flush_l2_system_invalidate_outstanding_true_v() ||
		    flush_l2_system_invalidate_pending_v(data) ==
			flush_l2_system_invalidate_pending_busy_v()) {
			gk20a_dbg_info("l2_system_invalidate 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired(&timeout));

	if (nvgpu_timeout_peek_expired(&timeout))
		nvgpu_warn(g, "l2_system_invalidate too many retries");

	trace_gk20a_mm_l2_invalidate_done(g->name);
}

void gk20a_mm_l2_invalidate(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;

	gk20a_busy_noresume(g);
	if (g->power_on) {
		nvgpu_mutex_acquire(&mm->l2_op_lock);
		gk20a_mm_l2_invalidate_locked(g);
		nvgpu_mutex_release(&mm->l2_op_lock);
	}
	gk20a_idle_nosuspend(g);
}

void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	struct nvgpu_timeout timeout;

	gk20a_dbg_fn("");

	gk20a_busy_noresume(g);
	if (!g->power_on)
		goto hw_was_off;

	nvgpu_timeout_init(g, &timeout, 2000, NVGPU_TIMER_RETRY_TIMER);

	nvgpu_mutex_acquire(&mm->l2_op_lock);

	trace_gk20a_mm_l2_flush(g->name);

	/*
	 * Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
	 * as clean, so subsequent reads might hit in the L2.
	 */
	gk20a_writel(g, flush_l2_flush_dirty_r(),
		     flush_l2_flush_dirty_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_flush_dirty_r());

		if (flush_l2_flush_dirty_outstanding_v(data) ==
			flush_l2_flush_dirty_outstanding_true_v() ||
		    flush_l2_flush_dirty_pending_v(data) ==
			flush_l2_flush_dirty_pending_busy_v()) {
			gk20a_dbg_info("l2_flush_dirty 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired_msg(&timeout,
					    "l2_flush_dirty too many retries"));

	trace_gk20a_mm_l2_flush_done(g->name);

	if (invalidate)
		gk20a_mm_l2_invalidate_locked(g);

	nvgpu_mutex_release(&mm->l2_op_lock);

hw_was_off:
	gk20a_idle_nosuspend(g);
}

void gk20a_mm_cbc_clean(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	struct nvgpu_timeout timeout;

	gk20a_dbg_fn("");

	gk20a_busy_noresume(g);
	if (!g->power_on)
		goto hw_was_off;

	nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);

	nvgpu_mutex_acquire(&mm->l2_op_lock);

	/* Flush all dirty lines from the CBC to L2 */
	gk20a_writel(g, flush_l2_clean_comptags_r(),
		     flush_l2_clean_comptags_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_clean_comptags_r());

		if (flush_l2_clean_comptags_outstanding_v(data) ==
			flush_l2_clean_comptags_outstanding_true_v() ||
		    flush_l2_clean_comptags_pending_v(data) ==
			flush_l2_clean_comptags_pending_busy_v()) {
			gk20a_dbg_info("l2_clean_comptags 0x%x", data);
			nvgpu_udelay(5);
		} else
			break;
	} while (!nvgpu_timeout_expired_msg(&timeout,
					 "l2_clean_comptags too many retries"));

	nvgpu_mutex_release(&mm->l2_op_lock);

hw_was_off:
	gk20a_idle_nosuspend(g);
}

int gk20a_mm_suspend(struct gk20a *g)
{
	gk20a_dbg_fn("");

#if defined(CONFIG_GK20A_VIDMEM)
	cancel_work_sync(&g->mm.vidmem.clear_mem_worker);
#endif

	g->ops.mm.cbc_clean(g);
	g->ops.mm.l2_flush(g, false);

	gk20a_dbg_fn("done");

	return 0;
}

u32 gk20a_mm_get_iommu_bit(struct gk20a *g)
{
	return 34;
}

const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
						      u32 big_page_size)
{
	return (big_page_size == SZ_64K) ?
		 gk20a_mm_levels_64k : gk20a_mm_levels_128k;
}