From 3e431e26c5c3aba6da8a6555ec3d7b7df53f534a Mon Sep 17 00:00:00 2001 From: Konsta Holtta Date: Mon, 16 May 2016 11:33:38 +0300 Subject: gpu: nvgpu: add PRAMIN support for mem accessors To support vidmem, implement a way to access buffers via the PRAMIN window instead of just kernel-mapped sysmem buffers for iGPU as of now. Depending on the buffer aperture, choose between the two access types in the buffer memory accessor functions. vmap()/vunmap() pairs are no-ops for buffers that can't be cpu-mapped. Two uses of DMA_ATTR_READ_ONLY are removed in the ucode loading path to support writing to them too via the indirection in addition to cpu. JIRA DNVGPU-23 Change-Id: I282dba6741c6b8224bc12e69c1fb3936bde7e6ed Signed-off-by: Konsta Holtta Reviewed-on: http://git-master/r/1141314 Reviewed-by: Terje Bergstrom Tested-by: Terje Bergstrom --- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 3 +- drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h | 24 ++++++++ drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h | 57 ++++++++++++++++++ drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 102 +++++++++++++++++++++++++++++--- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 15 +++++ drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 3 +- drivers/gpu/nvgpu/gm20b/hw_bus_gm20b.h | 24 ++++++++ drivers/gpu/nvgpu/gm20b/hw_pram_gm20b.h | 57 ++++++++++++++++++ 8 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h create mode 100644 drivers/gpu/nvgpu/gm20b/hw_pram_gm20b.h diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index a5caf048..076ddd12 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -2036,8 +2036,7 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32), g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32)); - err = gk20a_gmmu_alloc_attr(g, DMA_ATTR_READ_ONLY, ucode_size, - &ucode_info->surface_desc); + err = gk20a_gmmu_alloc(g, ucode_size, &ucode_info->surface_desc); if 
(err) goto clean_up; diff --git a/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h index 8a69c573..2c902f52 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h @@ -50,6 +50,30 @@ #ifndef _hw_bus_gk20a_h_ #define _hw_bus_gk20a_h_ +static inline u32 bus_bar0_window_r(void) +{ + return 0x00001700; +} +static inline u32 bus_bar0_window_base_f(u32 v) +{ + return (v & 0xffffff) << 0; +} +static inline u32 bus_bar0_window_target_vid_mem_f(void) +{ + return 0x0; +} +static inline u32 bus_bar0_window_target_sys_mem_coherent_f(void) +{ + return 0x2000000; +} +static inline u32 bus_bar0_window_target_sys_mem_noncoherent_f(void) +{ + return 0x3000000; +} +static inline u32 bus_bar0_window_target_bar0_window_base_shift_v(void) +{ + return 0x00000010; +} static inline u32 bus_bar1_block_r(void) { return 0x00001704; diff --git a/drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h new file mode 100644 index 00000000..918dad9a --- /dev/null +++ b/drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +/* + * Function naming determines intended use: + * + * <x>_r(void) : Returns the offset for register <x>. + * + * <x>_o(void) : Returns the offset for element <x>. + * + * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ * + * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits. + * + * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted + * and masked to place it at field <y> of register <x>. This value + * can be |'d with others to produce a full register value for + * register <x>. + * + * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This + * value can be ~'d and then &'d to clear the value of field <y> for + * register <x>. + * + * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted + * to place it at field <y> of register <x>. This value can be |'d + * with others to produce a full register value for <x>. + * + * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register + * <x> value 'r' after being shifted to place its LSB at bit 0. + * This value is suitable for direct comparison with other unshifted + * values appropriate for use in field <y> of register <x>. + * + * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for + * field <y> of register <x>. This value is suitable for direct + * comparison with unshifted values appropriate for use in field <y> + * of register <x>. + */ +#ifndef _hw_pram_gk20a_h_ +#define _hw_pram_gk20a_h_ + +static inline u32 pram_data032_r(u32 i) +{ + return 0x00700000 + i*4; +} +#endif diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index eb4f01e0..ec946fb6 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -37,6 +37,7 @@ #include "hw_fb_gk20a.h" #include "hw_bus_gk20a.h" #include "hw_ram_gk20a.h" +#include "hw_pram_gk20a.h" #include "hw_mc_gk20a.h" #include "hw_flush_gk20a.h" #include "hw_ltc_gk20a.h" @@ -44,10 +45,20 @@ #include "kind_gk20a.h" #include "semaphore_gk20a.h" +/* + * Flip this to force all gk20a_mem* accesses via PRAMIN from the start of the + * boot, even for buffers that would work via cpu_va. In runtime, the flag is + * in debugfs, called "force_pramin".
+ */ +#define GK20A_FORCE_PRAMIN_DEFAULT false + int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) { void *cpu_va; + if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin) + return 0; + if (WARN_ON(mem->cpu_va)) { gk20a_warn(dev_from_gk20a(g), "nested %s", __func__); return -EBUSY; @@ -66,20 +77,66 @@ int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem) { + if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin) + return; + vunmap(mem->cpu_va); mem->cpu_va = NULL; } +/* WARNING: returns pramin_base_lock taken, complement with pramin_exit() */ +static u32 gk20a_pramin_enter(struct gk20a *g, struct mem_desc *mem, u32 w) +{ + u64 bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0); + u64 addr = bufbase + w * sizeof(u32); + u32 hi = (u32)((addr & ~(u64)0xfffff) + >> bus_bar0_window_target_bar0_window_base_shift_v()); + u32 lo = (addr & 0xfffff); + + gk20a_dbg(gpu_dbg_mem, "0x%08x:%08x begin for %p", hi, lo, mem); + + WARN_ON(!bufbase); + spin_lock(&g->mm.pramin_base_lock); + if (g->mm.pramin_base != hi) { + gk20a_writel(g, bus_bar0_window_r(), + (g->mm.vidmem_is_vidmem + && mem->aperture == APERTURE_SYSMEM ? 
+ bus_bar0_window_target_sys_mem_noncoherent_f() : + bus_bar0_window_target_vid_mem_f()) | + bus_bar0_window_base_f(hi)); + gk20a_readl(g, bus_bar0_window_r()); + g->mm.pramin_base = hi; + } + + return lo; +} + +static void gk20a_pramin_exit(struct gk20a *g, struct mem_desc *mem) +{ + gk20a_dbg(gpu_dbg_mem, "end for %p", mem); + spin_unlock(&g->mm.pramin_base_lock); +} + u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w) { - u32 *ptr = mem->cpu_va; - u32 data; + u32 data = 0; + + if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) { + u32 *ptr = mem->cpu_va; - WARN_ON(!ptr); - data = ptr[w]; + WARN_ON(!ptr); + data = ptr[w]; #ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); #endif + } else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) { + u32 addr = gk20a_pramin_enter(g, mem, w); + data = gk20a_readl(g, pram_data032_r(addr / sizeof(u32))); + gk20a_pramin_exit(g, mem); + } else { + WARN_ON("Accessing unallocated mem_desc"); + } + return data; } @@ -106,13 +163,23 @@ void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data) { - u32 *ptr = mem->cpu_va; + if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) { + u32 *ptr = mem->cpu_va; - WARN_ON(!ptr); + WARN_ON(!ptr); #ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); #endif - ptr[w] = data; + ptr[w] = data; + } else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) { + u32 addr = gk20a_pramin_enter(g, mem, w); + gk20a_writel(g, pram_data032_r(addr / sizeof(u32)), data); + /* read back to synchronize accesses*/ + gk20a_readl(g, pram_data032_r(addr / sizeof(u32))); + gk20a_pramin_exit(g, mem); + } else { + WARN_ON("Accessing unallocated mem_desc"); + } } void gk20a_mem_wr(struct gk20a *g, struct 
mem_desc *mem, u32 offset, u32 data) @@ -535,6 +602,13 @@ static int gk20a_alloc_sysmem_flush(struct gk20a *g) return gk20a_gmmu_alloc(g, SZ_4K, &g->mm.sysmem_flush); } +static void gk20a_init_pramin(struct mm_gk20a *mm) +{ + mm->pramin_base = 0; + spin_lock_init(&mm->pramin_base_lock); + mm->force_pramin = GK20A_FORCE_PRAMIN_DEFAULT; +} + int gk20a_init_mm_setup_sw(struct gk20a *g) { struct mm_gk20a *mm = &g->mm; @@ -558,6 +632,8 @@ int gk20a_init_mm_setup_sw(struct gk20a *g) (int)(mm->channel.user_size >> 20), (int)(mm->channel.kernel_size >> 20)); + gk20a_init_pramin(mm); + err = gk20a_alloc_sysmem_flush(g); if (err) return err; @@ -586,6 +662,7 @@ int gk20a_init_mm_setup_sw(struct gk20a *g) /* set vm_alloc_share op here as gk20a_as_alloc_share needs it */ g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share; mm->remove_support = gk20a_remove_mm_support; + mm->sw_ready = true; gk20a_dbg_fn("done"); @@ -690,6 +767,7 @@ static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order, entry->mem.cpu_va = page_address(pages); memset(entry->mem.cpu_va, 0, len); entry->mem.size = len; + entry->mem.aperture = APERTURE_SYSMEM; FLUSH_CPU_DCACHE(entry->mem.cpu_va, sg_phys(entry->mem.sgt->sgl), len); return 0; @@ -716,6 +794,7 @@ static void free_gmmu_phys_pages(struct vm_gk20a *vm, kfree(entry->mem.sgt); entry->mem.sgt = NULL; entry->mem.size = 0; + entry->mem.aperture = APERTURE_INVALID; } static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry) @@ -2164,6 +2243,7 @@ int gk20a_gmmu_alloc_attr(struct gk20a *g, enum dma_attr attr, size_t size, stru goto fail_free; mem->size = size; + mem->aperture = APERTURE_SYSMEM; gk20a_dbg_fn("done"); @@ -2210,6 +2290,7 @@ void gk20a_gmmu_free_attr(struct gk20a *g, enum dma_attr attr, gk20a_free_sgtable(&mem->sgt); mem->size = 0; + mem->aperture = APERTURE_INVALID; } void gk20a_gmmu_free(struct gk20a *g, struct mem_desc *mem) @@ -4015,6 +4096,9 @@ void gk20a_mm_debugfs_init(struct device *dev) debugfs_create_x64("separate_fixed_allocs", 
0664, gpu_root, &g->separate_fixed_allocs); + + debugfs_create_bool("force_pramin", 0664, gpu_root, + &g->mm.force_pramin); } void gk20a_init_mm(struct gpu_ops *gops) diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index d943b231..c58a4fec 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -40,10 +40,17 @@ outer_flush_range(pa, pa + (size_t)(size)); \ } while (0) +enum gk20a_aperture { + APERTURE_INVALID, /* e.g., unallocated */ + APERTURE_SYSMEM, + APERTURE_VIDMEM +}; + struct mem_desc { void *cpu_va; struct page **pages; struct sg_table *sgt; + enum gk20a_aperture aperture; size_t size; u64 gpu_va; }; @@ -357,6 +364,14 @@ struct mm_gk20a { bool vidmem_is_vidmem; struct mem_desc sysmem_flush; + + u32 pramin_base; + spinlock_t pramin_base_lock; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,4,0) + u32 force_pramin; /* via debugfs */ +#else + bool force_pramin; /* via debugfs */ +#endif }; int gk20a_mm_init(struct mm_gk20a *mm); diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index 8bf382fd..08ef7738 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c @@ -2443,8 +2443,7 @@ static int gk20a_prepare_ucode(struct gk20a *g) pmu->ucode_image = (u32 *)((u8 *)pmu->desc + pmu->desc->descriptor_size); - err = gk20a_gmmu_alloc_map_attr(vm, DMA_ATTR_READ_ONLY, - GK20A_PMU_UCODE_SIZE_MAX, &pmu->ucode); + err = gk20a_gmmu_alloc_map(vm, GK20A_PMU_UCODE_SIZE_MAX, &pmu->ucode); if (err) goto err_release_fw; diff --git a/drivers/gpu/nvgpu/gm20b/hw_bus_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_bus_gm20b.h index e69275e0..0b4eefe0 100644 --- a/drivers/gpu/nvgpu/gm20b/hw_bus_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/hw_bus_gm20b.h @@ -50,6 +50,30 @@ #ifndef _hw_bus_gm20b_h_ #define _hw_bus_gm20b_h_ +static inline u32 bus_bar0_window_r(void) +{ + return 0x00001700; +} +static inline u32 bus_bar0_window_base_f(u32 v) +{ + return (v & 0xffffff) << 
0; +} +static inline u32 bus_bar0_window_target_vid_mem_f(void) +{ + return 0x0; +} +static inline u32 bus_bar0_window_target_sys_mem_coherent_f(void) +{ + return 0x2000000; +} +static inline u32 bus_bar0_window_target_sys_mem_noncoherent_f(void) +{ + return 0x3000000; +} +static inline u32 bus_bar0_window_target_bar0_window_base_shift_v(void) +{ + return 0x00000010; +} static inline u32 bus_bar1_block_r(void) { return 0x00001704; diff --git a/drivers/gpu/nvgpu/gm20b/hw_pram_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_pram_gm20b.h new file mode 100644 index 00000000..f9c6f3d4 --- /dev/null +++ b/drivers/gpu/nvgpu/gm20b/hw_pram_gm20b.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +/* + * Function naming determines intended use: + * + * <x>_r(void) : Returns the offset for register <x>. + * + * <x>_o(void) : Returns the offset for element <x>. + * + * <x>_w(void) : Returns the word offset for word (4 byte) element <x>. + * + * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits. + * + * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted + * and masked to place it at field <y> of register <x>. This value + * can be |'d with others to produce a full register value for + * register <x>. + * + * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This + * value can be ~'d and then &'d to clear the value of field <y> for + * register <x>.
+ * + * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted + * to place it at field <y> of register <x>. This value can be |'d + * with others to produce a full register value for <x>. + * + * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register + * <x> value 'r' after being shifted to place its LSB at bit 0. + * This value is suitable for direct comparison with other unshifted + * values appropriate for use in field <y> of register <x>. + * + * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for + * field <y> of register <x>. This value is suitable for direct + * comparison with unshifted values appropriate for use in field <y> + * of register <x>. + */ +#ifndef _hw_pram_gm20b_h_ +#define _hw_pram_gm20b_h_ + +static inline u32 pram_data032_r(u32 i) +{ + return 0x00700000 + i*4; +} +#endif -- cgit v1.2.2