author     Alex Waterman <alexw@nvidia.com>                     2017-06-09 14:42:50 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2017-07-06 17:44:16 -0400
commit     583704620db88e391f6b14acc57af859a70127de
tree       8fc3becf2850b724e87011b0e0250c52d0efb7ee /drivers/gpu/nvgpu/gp10b
parent     c1393d5b68e63c992f4c689cb788139fdf8c2f1a
gpu: nvgpu: Implement PD packing
In some cases page directories require less than a full page of memory.
For example, on Pascal, the final PD level for large pages is only 256
bytes, so 16 PDs can fit in a single page. Allocating an entire page for
each of these 256 B PDs is extremely wasteful. This patch reduces the
DMA memory wasted on small PDs by packing multiple small PDs into a
single page.
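To put numbers on the saving, here is an illustration only (standalone
C, not code from this patch; it assumes a 4 KiB page size):

  #include <stdio.h>

  int main(void)
  {
          unsigned int page_size = 4096; /* assumed page size */
          unsigned int pd_size   = 256;  /* final PD level on Pascal */

          /* 4096 / 256 = 16 PDs fit in one page... */
          printf("PDs per page: %u\n", page_size / pd_size);

          /* ...so a one-page-per-PD scheme wastes 3840 B (93.75%)
           * of DMA memory for every such PD. */
          printf("waste per PD: %u B (%.2f%%)\n",
                 page_size - pd_size,
                 100.0 * (page_size - pd_size) / page_size);
          return 0;
  }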
The packing is implemented as a slab allocator: each page is a slab,
and multiple PD instances can be allocated from each page. Supporting
this required several changes to the nvgpu_gmmu_pd struct: the
nvgpu_mem is now a pointer, and an explicit offset into the nvgpu_mem
was added so that each nvgpu_gmmu_pd knows which portion of the memory
it is using.
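A minimal sketch of the reworked descriptor (field names such as
mem_offs and cached are assumptions for illustration, not necessarily
the patch's exact names):

  /*
   * One PD packed into a (possibly shared) page of DMA memory.
   * pd->mem may back several small PDs at once; mem_offs marks
   * where this PD's slice begins within it.
   */
  struct nvgpu_gmmu_pd {
          struct nvgpu_mem *mem;      /* backing memory, now a pointer */
          u32               mem_offs; /* byte offset of this PD in mem */
          bool              cached;   /* allocated from the pd_cache?  */
  };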
The nvgpu_pde_phys_addr() and pd_write() functions also required
changes, since a PD is no longer always situated at the start of the
nvgpu_mem.
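Both helpers essentially have to add the PD's offset to the base of the
backing memory. A plausible sketch, reusing the assumed mem_offs field
(nvgpu_mem_get_base_addr() and nvgpu_mem_wr32() appear in the diff
below; their exact use here is an approximation):

  /* Physical address of a PD: base of the backing nvgpu_mem plus
   * this PD's offset within the shared page. */
  u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
  {
          return nvgpu_mem_get_base_addr(g, pd->mem, 0) + pd->mem_offs;
  }

  /* Word writes are shifted the same way: word w of this PD lives
   * mem_offs bytes into the shared nvgpu_mem. */
  static void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
                       u32 w, u32 data)
  {
          nvgpu_mem_wr32(g, pd->mem, pd->mem_offs / sizeof(u32) + w, data);
  }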
Initialization and cleanup of the page tables for each VM were slightly
modified to work through the new pd_cache implementation. Some PDs
(i.e. the PDB), despite being smaller than a full page, still require a
full page for alignment purposes (a HW requirement), so a direct
allocation method for PDs is still provided. The direct path is also
used when a PD that could in principle be cached is larger than a page.
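The resulting policy might be summarized like this (a sketch only;
nvgpu_pd_alloc_direct() and nvgpu_pd_cache_alloc() are hypothetical
names for the two allocation paths):

  /* Sketch: choose between the pd_cache slabs and a direct DMA
   * allocation. PDs needing page alignment (e.g. the PDB) and PDs
   * of a page or more bypass the cache. */
  static int pd_allocate(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
                         u32 bytes, bool needs_page_alignment)
  {
          if (needs_page_alignment || bytes >= PAGE_SIZE)
                  return nvgpu_pd_alloc_direct(g, pd, bytes);

          /* Small PD: carve a slice out of a shared slab page. */
          return nvgpu_pd_cache_alloc(g, pd, bytes);
  }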
Lastly, a new debug flag was added for the pd_cache code.
JIRA NVGPU-30
Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/gp10b')
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c  10
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index c3867e9d..2ff199c6 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
 
 	phys_addr >>= gmmu_new_pde_address_shift_v();
 
-	pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+	pde_v[0] |= nvgpu_aperture_mask(g, pd->mem,
 			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
 			gmmu_new_pde_aperture_video_memory_f());
 	pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
@@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
 	if (small_valid) {
 		pde_v[2] |=
 			gmmu_new_dual_pde_address_small_sys_f(small_addr);
-		pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem,
+		pde_v[2] |= nvgpu_aperture_mask(g, pd->mem,
 			gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
 			gmmu_new_dual_pde_aperture_small_video_memory_f());
 		pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
@@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
 	if (big_valid) {
 		pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
 		pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
-		pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+		pde_v[0] |= nvgpu_aperture_mask(g, pd->mem,
 			gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
 			gmmu_new_dual_pde_aperture_big_video_memory_f());
 		pde_v[1] |= big_addr >> 28;
@@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
 static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
+	u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 
 	gk20a_dbg_info("pde pa=0x%llx", pdb_addr);
 
 	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
-		nvgpu_aperture_mask(g, &vm->pdb.mem,
+		nvgpu_aperture_mask(g, vm->pdb.mem,
 			ram_in_page_dir_base_target_sys_mem_ncoh_f(),
 			ram_in_page_dir_base_target_vid_mem_f()) |
 		ram_in_page_dir_base_vol_true_f() |