summary | refs | log | tree | commit | diff | stats
path: root/drivers/gpu/nvgpu/gk20a
diff options
context:
space:
mode:
authorKonsta Holtta <kholtta@nvidia.com>2016-05-16 04:33:38 -0400
committerTerje Bergstrom <tbergstrom@nvidia.com>2016-05-24 15:39:06 -0400
commit3e431e26c5c3aba6da8a6555ec3d7b7df53f534a (patch)
treede7baabb5bbc4a5d27af36d62c00827b7bad3f54 /drivers/gpu/nvgpu/gk20a
parentdc7af18bf8056c213165d4cd1c55ea0fba9f1341 (diff)
gpu: nvgpu: add PRAMIN support for mem accessors
To support vidmem, implement a way to access buffers via the PRAMIN window instead of just kernel-mapped sysmem buffers for iGPU as of now. Depending on the buffer aperture, choose between the two access types in the buffer memory accessor functions. vmap()/vunmap() pairs are no-ops for buffers that can't be cpu-mapped. Two uses of DMA_ATTR_READ_ONLY are removed in the ucode loading path to support writing to them too via the indirection in addition to cpu. JIRA DNVGPU-23 Change-Id: I282dba6741c6b8224bc12e69c1fb3936bde7e6ed Signed-off-by: Konsta Holtta <kholtta@nvidia.com> Reviewed-on: http://git-master/r/1141314 Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r--drivers/gpu/nvgpu/gk20a/gr_gk20a.c3
-rw-r--r--drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h24
-rw-r--r--drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h57
-rw-r--r--drivers/gpu/nvgpu/gk20a/mm_gk20a.c102
-rw-r--r--drivers/gpu/nvgpu/gk20a/mm_gk20a.h15
-rw-r--r--drivers/gpu/nvgpu/gk20a/pmu_gk20a.c3
6 files changed, 191 insertions, 13 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index a5caf048..076ddd12 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -2036,8 +2036,7 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2036 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32), 2036 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
2037 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32)); 2037 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
2038 2038
2039 err = gk20a_gmmu_alloc_attr(g, DMA_ATTR_READ_ONLY, ucode_size, 2039 err = gk20a_gmmu_alloc(g, ucode_size, &ucode_info->surface_desc);
2040 &ucode_info->surface_desc);
2041 if (err) 2040 if (err)
2042 goto clean_up; 2041 goto clean_up;
2043 2042
diff --git a/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
index 8a69c573..2c902f52 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
@@ -50,6 +50,30 @@
50#ifndef _hw_bus_gk20a_h_ 50#ifndef _hw_bus_gk20a_h_
51#define _hw_bus_gk20a_h_ 51#define _hw_bus_gk20a_h_
52 52
53static inline u32 bus_bar0_window_r(void)
54{
55 return 0x00001700;
56}
57static inline u32 bus_bar0_window_base_f(u32 v)
58{
59 return (v & 0xffffff) << 0;
60}
61static inline u32 bus_bar0_window_target_vid_mem_f(void)
62{
63 return 0x0;
64}
65static inline u32 bus_bar0_window_target_sys_mem_coherent_f(void)
66{
67 return 0x2000000;
68}
69static inline u32 bus_bar0_window_target_sys_mem_noncoherent_f(void)
70{
71 return 0x3000000;
72}
73static inline u32 bus_bar0_window_target_bar0_window_base_shift_v(void)
74{
75 return 0x00000010;
76}
53static inline u32 bus_bar1_block_r(void) 77static inline u32 bus_bar1_block_r(void)
54{ 78{
55 return 0x00001704; 79 return 0x00001704;
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h
new file mode 100644
index 00000000..918dad9a
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pram_gk20a.h
@@ -0,0 +1,57 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_pram_gk20a_h_
51#define _hw_pram_gk20a_h_
52
53static inline u32 pram_data032_r(u32 i)
54{
55 return 0x00700000 + i*4;
56}
57#endif
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index eb4f01e0..ec946fb6 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -37,6 +37,7 @@
37#include "hw_fb_gk20a.h" 37#include "hw_fb_gk20a.h"
38#include "hw_bus_gk20a.h" 38#include "hw_bus_gk20a.h"
39#include "hw_ram_gk20a.h" 39#include "hw_ram_gk20a.h"
40#include "hw_pram_gk20a.h"
40#include "hw_mc_gk20a.h" 41#include "hw_mc_gk20a.h"
41#include "hw_flush_gk20a.h" 42#include "hw_flush_gk20a.h"
42#include "hw_ltc_gk20a.h" 43#include "hw_ltc_gk20a.h"
@@ -44,10 +45,20 @@
44#include "kind_gk20a.h" 45#include "kind_gk20a.h"
45#include "semaphore_gk20a.h" 46#include "semaphore_gk20a.h"
46 47
48/*
49 * Flip this to force all gk20a_mem* accesses via PRAMIN from the start of the
50 * boot, even for buffers that would work via cpu_va. In runtime, the flag is
51 * in debugfs, called "force_pramin".
52 */
53#define GK20A_FORCE_PRAMIN_DEFAULT false
54
47int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) 55int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem)
48{ 56{
49 void *cpu_va; 57 void *cpu_va;
50 58
59 if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin)
60 return 0;
61
51 if (WARN_ON(mem->cpu_va)) { 62 if (WARN_ON(mem->cpu_va)) {
52 gk20a_warn(dev_from_gk20a(g), "nested %s", __func__); 63 gk20a_warn(dev_from_gk20a(g), "nested %s", __func__);
53 return -EBUSY; 64 return -EBUSY;
@@ -66,20 +77,66 @@ int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem)
66 77
67void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem) 78void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem)
68{ 79{
80 if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin)
81 return;
82
69 vunmap(mem->cpu_va); 83 vunmap(mem->cpu_va);
70 mem->cpu_va = NULL; 84 mem->cpu_va = NULL;
71} 85}
72 86
87/* WARNING: returns pramin_base_lock taken, complement with pramin_exit() */
88static u32 gk20a_pramin_enter(struct gk20a *g, struct mem_desc *mem, u32 w)
89{
90 u64 bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
91 u64 addr = bufbase + w * sizeof(u32);
92 u32 hi = (u32)((addr & ~(u64)0xfffff)
93 >> bus_bar0_window_target_bar0_window_base_shift_v());
94 u32 lo = (addr & 0xfffff);
95
96 gk20a_dbg(gpu_dbg_mem, "0x%08x:%08x begin for %p", hi, lo, mem);
97
98 WARN_ON(!bufbase);
99 spin_lock(&g->mm.pramin_base_lock);
100 if (g->mm.pramin_base != hi) {
101 gk20a_writel(g, bus_bar0_window_r(),
102 (g->mm.vidmem_is_vidmem
103 && mem->aperture == APERTURE_SYSMEM ?
104 bus_bar0_window_target_sys_mem_noncoherent_f() :
105 bus_bar0_window_target_vid_mem_f()) |
106 bus_bar0_window_base_f(hi));
107 gk20a_readl(g, bus_bar0_window_r());
108 g->mm.pramin_base = hi;
109 }
110
111 return lo;
112}
113
114static void gk20a_pramin_exit(struct gk20a *g, struct mem_desc *mem)
115{
116 gk20a_dbg(gpu_dbg_mem, "end for %p", mem);
117 spin_unlock(&g->mm.pramin_base_lock);
118}
119
73u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w) 120u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w)
74{ 121{
75 u32 *ptr = mem->cpu_va; 122 u32 data = 0;
76 u32 data; 123
124 if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
125 u32 *ptr = mem->cpu_va;
77 126
78 WARN_ON(!ptr); 127 WARN_ON(!ptr);
79 data = ptr[w]; 128 data = ptr[w];
80#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM 129#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
81 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); 130 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data);
82#endif 131#endif
132 } else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
133 u32 addr = gk20a_pramin_enter(g, mem, w);
134 data = gk20a_readl(g, pram_data032_r(addr / sizeof(u32)));
135 gk20a_pramin_exit(g, mem);
136 } else {
137 WARN_ON("Accessing unallocated mem_desc");
138 }
139
83 return data; 140 return data;
84} 141}
85 142
@@ -106,13 +163,23 @@ void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem,
106 163
107void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data) 164void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data)
108{ 165{
109 u32 *ptr = mem->cpu_va; 166 if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
167 u32 *ptr = mem->cpu_va;
110 168
111 WARN_ON(!ptr); 169 WARN_ON(!ptr);
112#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM 170#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
113 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); 171 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data);
114#endif 172#endif
115 ptr[w] = data; 173 ptr[w] = data;
174 } else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
175 u32 addr = gk20a_pramin_enter(g, mem, w);
176 gk20a_writel(g, pram_data032_r(addr / sizeof(u32)), data);
177 /* read back to synchronize accesses*/
178 gk20a_readl(g, pram_data032_r(addr / sizeof(u32)));
179 gk20a_pramin_exit(g, mem);
180 } else {
181 WARN_ON("Accessing unallocated mem_desc");
182 }
116} 183}
117 184
118void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data) 185void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data)
@@ -535,6 +602,13 @@ static int gk20a_alloc_sysmem_flush(struct gk20a *g)
535 return gk20a_gmmu_alloc(g, SZ_4K, &g->mm.sysmem_flush); 602 return gk20a_gmmu_alloc(g, SZ_4K, &g->mm.sysmem_flush);
536} 603}
537 604
605static void gk20a_init_pramin(struct mm_gk20a *mm)
606{
607 mm->pramin_base = 0;
608 spin_lock_init(&mm->pramin_base_lock);
609 mm->force_pramin = GK20A_FORCE_PRAMIN_DEFAULT;
610}
611
538int gk20a_init_mm_setup_sw(struct gk20a *g) 612int gk20a_init_mm_setup_sw(struct gk20a *g)
539{ 613{
540 struct mm_gk20a *mm = &g->mm; 614 struct mm_gk20a *mm = &g->mm;
@@ -558,6 +632,8 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
558 (int)(mm->channel.user_size >> 20), 632 (int)(mm->channel.user_size >> 20),
559 (int)(mm->channel.kernel_size >> 20)); 633 (int)(mm->channel.kernel_size >> 20));
560 634
635 gk20a_init_pramin(mm);
636
561 err = gk20a_alloc_sysmem_flush(g); 637 err = gk20a_alloc_sysmem_flush(g);
562 if (err) 638 if (err)
563 return err; 639 return err;
@@ -586,6 +662,7 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
586 /* set vm_alloc_share op here as gk20a_as_alloc_share needs it */ 662 /* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
587 g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share; 663 g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
588 mm->remove_support = gk20a_remove_mm_support; 664 mm->remove_support = gk20a_remove_mm_support;
665
589 mm->sw_ready = true; 666 mm->sw_ready = true;
590 667
591 gk20a_dbg_fn("done"); 668 gk20a_dbg_fn("done");
@@ -690,6 +767,7 @@ static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order,
690 entry->mem.cpu_va = page_address(pages); 767 entry->mem.cpu_va = page_address(pages);
691 memset(entry->mem.cpu_va, 0, len); 768 memset(entry->mem.cpu_va, 0, len);
692 entry->mem.size = len; 769 entry->mem.size = len;
770 entry->mem.aperture = APERTURE_SYSMEM;
693 FLUSH_CPU_DCACHE(entry->mem.cpu_va, sg_phys(entry->mem.sgt->sgl), len); 771 FLUSH_CPU_DCACHE(entry->mem.cpu_va, sg_phys(entry->mem.sgt->sgl), len);
694 772
695 return 0; 773 return 0;
@@ -716,6 +794,7 @@ static void free_gmmu_phys_pages(struct vm_gk20a *vm,
716 kfree(entry->mem.sgt); 794 kfree(entry->mem.sgt);
717 entry->mem.sgt = NULL; 795 entry->mem.sgt = NULL;
718 entry->mem.size = 0; 796 entry->mem.size = 0;
797 entry->mem.aperture = APERTURE_INVALID;
719} 798}
720 799
721static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry) 800static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry)
@@ -2164,6 +2243,7 @@ int gk20a_gmmu_alloc_attr(struct gk20a *g, enum dma_attr attr, size_t size, stru
2164 goto fail_free; 2243 goto fail_free;
2165 2244
2166 mem->size = size; 2245 mem->size = size;
2246 mem->aperture = APERTURE_SYSMEM;
2167 2247
2168 gk20a_dbg_fn("done"); 2248 gk20a_dbg_fn("done");
2169 2249
@@ -2210,6 +2290,7 @@ void gk20a_gmmu_free_attr(struct gk20a *g, enum dma_attr attr,
2210 gk20a_free_sgtable(&mem->sgt); 2290 gk20a_free_sgtable(&mem->sgt);
2211 2291
2212 mem->size = 0; 2292 mem->size = 0;
2293 mem->aperture = APERTURE_INVALID;
2213} 2294}
2214 2295
2215void gk20a_gmmu_free(struct gk20a *g, struct mem_desc *mem) 2296void gk20a_gmmu_free(struct gk20a *g, struct mem_desc *mem)
@@ -4015,6 +4096,9 @@ void gk20a_mm_debugfs_init(struct device *dev)
4015 4096
4016 debugfs_create_x64("separate_fixed_allocs", 0664, gpu_root, 4097 debugfs_create_x64("separate_fixed_allocs", 0664, gpu_root,
4017 &g->separate_fixed_allocs); 4098 &g->separate_fixed_allocs);
4099
4100 debugfs_create_bool("force_pramin", 0664, gpu_root,
4101 &g->mm.force_pramin);
4018} 4102}
4019 4103
4020void gk20a_init_mm(struct gpu_ops *gops) 4104void gk20a_init_mm(struct gpu_ops *gops)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index d943b231..c58a4fec 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -40,10 +40,17 @@
40 outer_flush_range(pa, pa + (size_t)(size)); \ 40 outer_flush_range(pa, pa + (size_t)(size)); \
41 } while (0) 41 } while (0)
42 42
43enum gk20a_aperture {
44 APERTURE_INVALID, /* e.g., unallocated */
45 APERTURE_SYSMEM,
46 APERTURE_VIDMEM
47};
48
43struct mem_desc { 49struct mem_desc {
44 void *cpu_va; 50 void *cpu_va;
45 struct page **pages; 51 struct page **pages;
46 struct sg_table *sgt; 52 struct sg_table *sgt;
53 enum gk20a_aperture aperture;
47 size_t size; 54 size_t size;
48 u64 gpu_va; 55 u64 gpu_va;
49}; 56};
@@ -357,6 +364,14 @@ struct mm_gk20a {
357 bool vidmem_is_vidmem; 364 bool vidmem_is_vidmem;
358 365
359 struct mem_desc sysmem_flush; 366 struct mem_desc sysmem_flush;
367
368 u32 pramin_base;
369 spinlock_t pramin_base_lock;
370#if LINUX_VERSION_CODE < KERNEL_VERSION(4,4,0)
371 u32 force_pramin; /* via debugfs */
372#else
373 bool force_pramin; /* via debugfs */
374#endif
360}; 375};
361 376
362int gk20a_mm_init(struct mm_gk20a *mm); 377int gk20a_mm_init(struct mm_gk20a *mm);
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 8bf382fd..08ef7738 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -2443,8 +2443,7 @@ static int gk20a_prepare_ucode(struct gk20a *g)
2443 pmu->ucode_image = (u32 *)((u8 *)pmu->desc + 2443 pmu->ucode_image = (u32 *)((u8 *)pmu->desc +
2444 pmu->desc->descriptor_size); 2444 pmu->desc->descriptor_size);
2445 2445
2446 err = gk20a_gmmu_alloc_map_attr(vm, DMA_ATTR_READ_ONLY, 2446 err = gk20a_gmmu_alloc_map(vm, GK20A_PMU_UCODE_SIZE_MAX, &pmu->ucode);
2447 GK20A_PMU_UCODE_SIZE_MAX, &pmu->ucode);
2448 if (err) 2447 if (err)
2449 goto err_release_fw; 2448 goto err_release_fw;
2450 2449