author      Alex Waterman <alexw@nvidia.com>                    2017-05-11 16:59:22 -0400
committer   mobile promotions <svcmobile_promotions@nvidia.com> 2017-07-06 17:44:15 -0400
commit      c1393d5b68e63c992f4c689cb788139fdf8c2f1a (patch)
tree        00a588d35342d75c05fed7733e91da753ba640fb /drivers/gpu/nvgpu/gp10b/mm_gp10b.c
parent      84f712dee8b582dd7d2a19345c621a2ae3bd6292 (diff)
gpu: nvgpu: gmmu programming rewrite
Update the high level mapping logic. Instead of iterating over the GPU VA, iterate over the scatter-gather table chunks. As a result, each GMMU page table update call is simplified dramatically.

This also modifies the chip level code to no longer require an SGL as an argument. Each call to the chip level code will be guaranteed to be contiguous, so it only has to worry about making a mapping from virt -> phys. This removes the dependency on Linux that the chip code currently has. With this patch the core GMMU code still uses the Linux SGL, but the logic is highly transferable to a different, nvgpu specific, scatter-gather list format in the near future.

The last major update is to push most of the page table attribute arguments into a struct. That struct is passed on through the various mapping levels. This makes the function calls simpler and easier to follow.

JIRA NVGPU-30

Change-Id: Ibb6b11755f99818fe642622ca0bd4cbed054f602
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1484104
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
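The commit message describes the new flow only in prose. The following self-contained C sketch illustrates the two ideas it names: mapping attributes travel in a single struct, and the core loop walks contiguous scatter-gather chunks so the chip level only ever sees a simple virt -> phys pair per PTE. This is an illustrative toy, not nvgpu code: toy_attrs, toy_chunk, program_pte and map_chunks are hypothetical names; the real struct in the patch is struct nvgpu_gmmu_attrs and the real per-level hooks are the update_gmmu_*_locked functions shown in the diff below.

/*
 * Minimal sketch (not nvgpu code) of the structure this patch moves
 * toward: attributes in one struct, iteration over contiguous chunks,
 * and a chip-level hook that only maps virt -> phys.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_attrs {            /* stand-in for struct nvgpu_gmmu_attrs */
	unsigned int pgsz;    /* page size in bytes */
	int cacheable;
	int read_only;
	int sparse;
};

struct toy_chunk {            /* one contiguous scatter-gather entry */
	uint64_t phys;
	uint64_t length;
};

/* Chip-level hook: programs a single PTE for one virt -> phys pair. */
static void program_pte(uint64_t virt, uint64_t phys,
			const struct toy_attrs *attrs)
{
	printf("PTE: GPU 0x%llx -> phys 0x%llx pgsz=%u %s\n",
	       (unsigned long long)virt, (unsigned long long)phys,
	       attrs->pgsz, attrs->read_only ? "RO" : "RW");
}

/* Core loop: iterate over chunks, not over the whole GPU VA range. */
static void map_chunks(uint64_t virt, const struct toy_chunk *chunks,
		       unsigned int nr, const struct toy_attrs *attrs)
{
	unsigned int i;

	for (i = 0; i < nr; i++) {
		uint64_t phys = chunks[i].phys;
		uint64_t done;

		/* Each chunk is contiguous, so the chip code only ever
		 * sees a plain virt -> phys mapping. */
		for (done = 0; done < chunks[i].length; done += attrs->pgsz) {
			program_pte(virt, phys, attrs);
			virt += attrs->pgsz;
			phys += attrs->pgsz;
		}
	}
}

int main(void)
{
	const struct toy_chunk chunks[] = {
		{ 0x100000, 0x8000 },
		{ 0x300000, 0x4000 },
	};
	const struct toy_attrs attrs = { .pgsz = 0x1000, .cacheable = 1 };

	map_chunks(0x40000000, chunks, 2, &attrs);
	return 0;
}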
Diffstat (limited to 'drivers/gpu/nvgpu/gp10b/mm_gp10b.c')
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c  309
1 file changed, 145 insertions, 164 deletions
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index d7391c6d..c3867e9d 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -14,6 +14,7 @@
  */
 
 #include <nvgpu/dma.h>
+#include <nvgpu/gmmu.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/platform_gk20a.h"
@@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
 	return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
 }
 
-static u32 pde3_from_index(u32 i)
-{
-	return i * gmmu_new_pde__size_v() / sizeof(u32);
-}
-
-static u32 pte3_from_index(u32 i)
-{
-	return i * gmmu_new_pte__size_v() / sizeof(u32);
-}
-
-static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *parent,
-			   u32 i, u32 gmmu_pgsz_idx,
-			   struct scatterlist **sgl,
-			   u64 *offset,
-			   u64 *iova,
-			   u32 kind_v, u64 *ctag,
-			   bool cacheable, bool unmapped_pte,
-			   int rw_flag, bool sparse, bool priv,
-			   enum nvgpu_aperture aperture)
-{
-	struct gk20a *g = gk20a_from_vm(vm);
-	u64 pte_addr = 0;
-	struct gk20a_mm_entry *pte = parent->entries + i;
-	u32 pde_v[2] = {0, 0};
-	u32 pde;
-
-	gk20a_dbg_fn("");
-
-	pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v();
-
-	pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem,
-			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
-			gmmu_new_pde_aperture_video_memory_f());
-	pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr));
-	pde_v[0] |= gmmu_new_pde_vol_true_f();
-	pde_v[1] |= pte_addr >> 24;
-	pde = pde3_from_index(i);
-
-	gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]);
-	gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]);
-
-	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
-		  i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
-	gk20a_dbg_fn("done");
-	return 0;
+static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
+				    const struct gk20a_mmu_level *l,
+				    struct nvgpu_gmmu_pd *pd,
+				    u32 pd_idx,
+				    u64 virt_addr,
+				    u64 phys_addr,
+				    struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	u32 pd_offset = pd_offset_from_index(l, pd_idx);
+	u32 pde_v[2] = {0, 0};
+
+	phys_addr >>= gmmu_new_pde_address_shift_v();
+
+	pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
+			gmmu_new_pde_aperture_video_memory_f());
+	pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
+	pde_v[0] |= gmmu_new_pde_vol_true_f();
+	pde_v[1] |= phys_addr >> 24;
+
+	pd_write(g, pd, pd_offset + 0, pde_v[0]);
+	pd_write(g, pd, pd_offset + 1, pde_v[1]);
+
+	pte_dbg(g, attrs,
+		"PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | "
+		"GPU %#-12llx phys %#-12llx "
+		"[0x%08x, 0x%08x]",
+		pd_idx, l->entry_size, pd_offset,
+		virt_addr, phys_addr,
+		pde_v[1], pde_v[0]);
 }
 
-static u32 pde0_from_index(u32 i)
-{
-	return i * gmmu_new_dual_pde__size_v() / sizeof(u32);
-}
-
-static int update_gmmu_pde0_locked(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *pte,
-			   u32 i, u32 gmmu_pgsz_idx,
-			   struct scatterlist **sgl,
-			   u64 *offset,
-			   u64 *iova,
-			   u32 kind_v, u64 *ctag,
-			   bool cacheable, bool unmapped_pte,
-			   int rw_flag, bool sparse, bool priv,
-			   enum nvgpu_aperture aperture)
-{
-	struct gk20a *g = gk20a_from_vm(vm);
-	bool small_valid, big_valid;
-	u32 pte_addr_small = 0, pte_addr_big = 0;
-	struct gk20a_mm_entry *entry = pte->entries + i;
-	u32 pde_v[4] = {0, 0, 0, 0};
-	u32 pde;
-
-	gk20a_dbg_fn("");
-
-	small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small;
-	big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
-
-	if (small_valid) {
-		pte_addr_small = gk20a_pde_addr(g, entry)
-				 >> gmmu_new_dual_pde_address_shift_v();
-	}
-
-	if (big_valid)
-		pte_addr_big = gk20a_pde_addr(g, entry)
-			       >> gmmu_new_dual_pde_address_big_shift_v();
-
-	if (small_valid) {
-		pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small);
-		pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem,
-			gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
-			gmmu_new_dual_pde_aperture_small_video_memory_f());
-		pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
-		pde_v[3] |= pte_addr_small >> 24;
-	}
-
-	if (big_valid) {
-		pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big);
-		pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
-		pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem,
-			gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
-			gmmu_new_dual_pde_aperture_big_video_memory_f());
-		pde_v[1] |= pte_addr_big >> 28;
-	}
-
-	pde = pde0_from_index(i);
-
-	gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]);
-	gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]);
-	gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]);
-	gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]);
-
-	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]",
-		  i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
-	gk20a_dbg_fn("done");
-	return 0;
+static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
+				    const struct gk20a_mmu_level *l,
+				    struct nvgpu_gmmu_pd *pd,
+				    u32 pd_idx,
+				    u64 virt_addr,
+				    u64 phys_addr,
+				    struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	bool small_valid, big_valid;
+	u32 small_addr = 0, big_addr = 0;
+	u32 pd_offset = pd_offset_from_index(l, pd_idx);
+	u32 pde_v[4] = {0, 0, 0, 0};
+
+	small_valid = attrs->pgsz == gmmu_page_size_small;
+	big_valid   = attrs->pgsz == gmmu_page_size_big;
+
+	if (small_valid)
+		small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v();
+
+	if (big_valid)
+		big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v();
+
+	if (small_valid) {
+		pde_v[2] |=
+			gmmu_new_dual_pde_address_small_sys_f(small_addr);
+		pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem,
+			gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
+			gmmu_new_dual_pde_aperture_small_video_memory_f());
+		pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
+		pde_v[3] |= small_addr >> 24;
+	}
+
+	if (big_valid) {
+		pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
+		pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
+		pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+			gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
+			gmmu_new_dual_pde_aperture_big_video_memory_f());
+		pde_v[1] |= big_addr >> 28;
+	}
+
+	pd_write(g, pd, pd_offset + 0, pde_v[0]);
+	pd_write(g, pd, pd_offset + 1, pde_v[1]);
+	pd_write(g, pd, pd_offset + 2, pde_v[2]);
+	pd_write(g, pd, pd_offset + 3, pde_v[3]);
+
+	pte_dbg(g, attrs,
+		"PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
+		"GPU %#-12llx phys %#-12llx "
+		"[0x%08x, 0x%08x, 0x%08x, 0x%08x]",
+		pd_idx, l->entry_size, pd_offset,
+		small_valid ? 'S' : '-',
+		big_valid ? 'B' : '-',
+		virt_addr, phys_addr,
+		pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
 }
 
-static int update_gmmu_pte_locked(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *pte,
-			   u32 i, u32 gmmu_pgsz_idx,
-			   struct scatterlist **sgl,
-			   u64 *offset,
-			   u64 *iova,
-			   u32 kind_v, u64 *ctag,
-			   bool cacheable, bool unmapped_pte,
-			   int rw_flag, bool sparse, bool priv,
-			   enum nvgpu_aperture aperture)
-{
-	struct gk20a *g = vm->mm->g;
-	u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
-	u64 ctag_granularity = g->ops.fb.compression_page_size(g);
-	u32 pte_w[2] = {0, 0}; /* invalid pte */
-	u32 pte_i;
-
-	if (*iova) {
-		u32 pte_valid = unmapped_pte ?
-			gmmu_new_pte_valid_false_f() :
-			gmmu_new_pte_valid_true_f();
-		u32 iova_v = *iova >> gmmu_new_pte_address_shift_v();
-		u32 pte_addr = aperture == APERTURE_SYSMEM ?
-				gmmu_new_pte_address_sys_f(iova_v) :
-				gmmu_new_pte_address_vid_f(iova_v);
-		u32 pte_tgt = __nvgpu_aperture_mask(g, aperture,
-				gmmu_new_pte_aperture_sys_mem_ncoh_f(),
-				gmmu_new_pte_aperture_video_memory_f());
-
-		pte_w[0] = pte_valid | pte_addr | pte_tgt;
-
-		if (priv)
-			pte_w[0] |= gmmu_new_pte_privilege_true_f();
-
-		pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) |
-			gmmu_new_pte_kind_f(kind_v) |
-			gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity));
-
-		if (rw_flag == gk20a_mem_flag_read_only)
-			pte_w[0] |= gmmu_new_pte_read_only_true_f();
-		if (unmapped_pte && !cacheable)
-			pte_w[0] |= gmmu_new_pte_read_only_true_f();
-		else if (!cacheable)
-			pte_w[0] |= gmmu_new_pte_vol_true_f();
-
-		gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d"
-			   " ctag=%d vol=%d"
-			   " [0x%08x, 0x%08x]",
-			   i, *iova,
-			   kind_v, (u32)(*ctag / ctag_granularity), !cacheable,
-			   pte_w[1], pte_w[0]);
-
-		if (*ctag)
-			*ctag += page_size;
-	} else if (sparse) {
-		pte_w[0] = gmmu_new_pte_valid_false_f();
-		pte_w[0] |= gmmu_new_pte_vol_true_f();
-	} else {
-		gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
-	}
-
-	pte_i = pte3_from_index(i);
-
-	gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]);
-	gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]);
-
-	if (*iova) {
-		*iova += page_size;
-		*offset += page_size;
-		if (*sgl && *offset + page_size > (*sgl)->length) {
-			u64 new_iova;
-			*sgl = sg_next(*sgl);
-			if (*sgl) {
-				new_iova = sg_phys(*sgl);
-				gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
-					  new_iova, (*sgl)->length);
-				if (new_iova) {
-					*offset = 0;
-					*iova = new_iova;
-				}
-			}
-		}
-	}
-	return 0;
+static void __update_pte(struct vm_gk20a *vm,
+			 u32 *pte_w,
+			 u64 phys_addr,
+			 struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	u64 ctag_granularity = g->ops.fb.compression_page_size(g);
+	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
+	u32 pte_valid = attrs->valid ?
+		gmmu_new_pte_valid_true_f() :
+		gmmu_new_pte_valid_false_f();
+	u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v();
+	u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ?
+		gmmu_new_pte_address_sys_f(phys_shifted) :
+		gmmu_new_pte_address_vid_f(phys_shifted);
+	u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture,
+		gmmu_new_pte_aperture_sys_mem_ncoh_f(),
+		gmmu_new_pte_aperture_video_memory_f());
+
+	pte_w[0] = pte_valid | pte_addr | pte_tgt;
+
+	if (attrs->priv)
+		pte_w[0] |= gmmu_new_pte_privilege_true_f();
+
+	pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) |
+		gmmu_new_pte_kind_f(attrs->kind_v) |
+		gmmu_new_pte_comptagline_f((u32)(attrs->ctag /
+						 ctag_granularity));
+
+	if (attrs->rw_flag == gk20a_mem_flag_read_only)
+		pte_w[0] |= gmmu_new_pte_read_only_true_f();
+
+	if (!attrs->valid && !attrs->cacheable)
+		pte_w[0] |= gmmu_new_pte_read_only_true_f();
+	else if (!attrs->cacheable)
+		pte_w[0] |= gmmu_new_pte_vol_true_f();
+
+	if (attrs->ctag)
+		attrs->ctag += page_size;
+
+}
+
+static void __update_pte_sparse(u32 *pte_w)
+{
+	pte_w[0] = gmmu_new_pte_valid_false_f();
+	pte_w[0] |= gmmu_new_pte_vol_true_f();
+}
+
+static void update_gmmu_pte_locked(struct vm_gk20a *vm,
+				   const struct gk20a_mmu_level *l,
+				   struct nvgpu_gmmu_pd *pd,
+				   u32 pd_idx,
+				   u64 virt_addr,
+				   u64 phys_addr,
+				   struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = vm->mm->g;
+	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
+	u32 pd_offset = pd_offset_from_index(l, pd_idx);
+	u32 pte_w[2] = {0, 0};
+
+	if (phys_addr)
+		__update_pte(vm, pte_w, phys_addr, attrs);
+	else if (attrs->sparse)
+		__update_pte_sparse(pte_w);
+
+	pte_dbg(g, attrs,
+		"vm=%s "
+		"PTE: i=%-4u size=%-2u offs=%-4u | "
+		"GPU %#-12llx phys %#-12llx "
+		"pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
+		"ctag=0x%08x "
+		"[0x%08x, 0x%08x]",
+		vm->name,
+		pd_idx, l->entry_size, pd_offset,
+		virt_addr, phys_addr,
+		page_size >> 10,
+		nvgpu_gmmu_perm_str(attrs->rw_flag),
+		attrs->kind_v,
+		nvgpu_aperture_str(attrs->aperture),
+		attrs->valid ? 'V' : '-',
+		attrs->cacheable ? 'C' : '-',
+		attrs->sparse ? 'S' : '-',
+		attrs->priv ? 'P' : '-',
+		(u32)attrs->ctag / g->ops.fb.compression_page_size(g),
+		pte_w[1], pte_w[0]);
+
+	pd_write(g, pd, pd_offset + 0, pte_w[0]);
+	pd_write(g, pd, pd_offset + 1, pte_w[1]);
 }
 
 static const struct gk20a_mmu_level gp10b_mm_levels[] = {
@@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
 static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0);
+	u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 