author     Alex Waterman <alexw@nvidia.com>                     2017-06-09 14:42:50 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2017-07-06 17:44:16 -0400
commit     583704620db88e391f6b14acc57af859a70127de
tree       8fc3becf2850b724e87011b0e0250c52d0efb7ee /drivers/gpu/nvgpu/gp10b
parent     c1393d5b68e63c992f4c689cb788139fdf8c2f1a
gpu: nvgpu: Implement PD packing
In some cases page directories require less than a full page of memory.
For example, on Pascal, the final PD level for large pages is only 256
bytes, so 16 PDs can fit in a single page. Allocating an entire page for
each of these 256 B PDs is extremely wasteful. This patch reduces the
DMA memory wasted on small PDs by packing multiple small PDs into a
single page.
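To put numbers on the saving, here is an illustration only (standalone
C, not code from this patch; it assumes a 4 KiB page size):

  #include <stdio.h>

  int main(void)
  {
          unsigned int page_size = 4096; /* assumed page size */
          unsigned int pd_size   = 256;  /* final PD level on Pascal */

          /* 4096 / 256 = 16 PDs fit in one page... */
          printf("PDs per page: %u\n", page_size / pd_size);

          /* ...so a one-page-per-PD scheme wastes 3840 B (93.75%)
           * of DMA memory for every such PD. */
          printf("waste per PD: %u B (%.2f%%)\n",
                 page_size - pd_size,
                 100.0 * (page_size - pd_size) / page_size);
          return 0;
  }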
The packing is implemented as a slab allocator: each page is a slab,
and multiple PD instances can be allocated from each page. Supporting
this required several changes to the nvgpu_gmmu_pd struct: the
nvgpu_mem is now a pointer, and an explicit offset into the nvgpu_mem
was added so that each nvgpu_gmmu_pd knows which portion of the memory
it is using.
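A minimal sketch of the reworked descriptor (field names such as
mem_offs and cached are assumptions for illustration, not necessarily
the patch's exact names):

  /*
   * One PD packed into a (possibly shared) page of DMA memory.
   * pd->mem may back several small PDs at once; mem_offs marks
   * where this PD's slice begins within it.
   */
  struct nvgpu_gmmu_pd {
          struct nvgpu_mem *mem;      /* backing memory, now a pointer */
          u32               mem_offs; /* byte offset of this PD in mem */
          bool              cached;   /* allocated from the pd_cache?  */
  };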
The nvgpu_pde_phys_addr() and pd_write() functions also required
changes, since a PD is no longer always situated at the start of the
nvgpu_mem.
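Both helpers essentially have to add the PD's offset to the base of the
backing memory. A plausible sketch, reusing the assumed mem_offs field
(nvgpu_mem_get_base_addr() and nvgpu_mem_wr32() appear in the diff
below; their exact use here is an approximation):

  /* Physical address of a PD: base of the backing nvgpu_mem plus
   * this PD's offset within the shared page. */
  u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
  {
          return nvgpu_mem_get_base_addr(g, pd->mem, 0) + pd->mem_offs;
  }

  /* Word writes are shifted the same way: word w of this PD lives
   * mem_offs bytes into the shared nvgpu_mem. */
  static void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
                       u32 w, u32 data)
  {
          nvgpu_mem_wr32(g, pd->mem, pd->mem_offs / sizeof(u32) + w, data);
  }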
Initialization and cleanup of the page tables for each VM were slightly
modified to work through the new pd_cache implementation. Some PDs
(i.e. the PDB), despite being smaller than a full page, still require a
full page for alignment purposes (a HW requirement), so a direct
allocation method for PDs is still provided. The direct path is also
used when a PD that could in principle be cached is larger than a page.
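The resulting policy might be summarized like this (a sketch only;
nvgpu_pd_alloc_direct() and nvgpu_pd_cache_alloc() are hypothetical
names for the two allocation paths):

  /* Sketch: choose between the pd_cache slabs and a direct DMA
   * allocation. PDs needing page alignment (e.g. the PDB) and PDs
   * of a page or more bypass the cache. */
  static int pd_allocate(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
                         u32 bytes, bool needs_page_alignment)
  {
          if (needs_page_alignment || bytes >= PAGE_SIZE)
                  return nvgpu_pd_alloc_direct(g, pd, bytes);

          /* Small PD: carve a slice out of a shared slab page. */
          return nvgpu_pd_cache_alloc(g, pd, bytes);
  }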
Lastly, a new debug flag was added for the pd_cache code.
JIRA NVGPU-30
Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/gp10b')
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c  10
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index c3867e9d..2ff199c6 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
 
 	phys_addr >>= gmmu_new_pde_address_shift_v();
 
-	pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+	pde_v[0] |= nvgpu_aperture_mask(g, pd->mem,
 			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
 			gmmu_new_pde_aperture_video_memory_f());
 	pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
@@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
 	if (small_valid) {
 		pde_v[2] |=
 			gmmu_new_dual_pde_address_small_sys_f(small_addr);
-		pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem,
+		pde_v[2] |= nvgpu_aperture_mask(g, pd->mem,
 			gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
 			gmmu_new_dual_pde_aperture_small_video_memory_f());
 		pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
@@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
 	if (big_valid) {
 		pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
 		pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
-		pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+		pde_v[0] |= nvgpu_aperture_mask(g, pd->mem,
 			gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
 			gmmu_new_dual_pde_aperture_big_video_memory_f());
 		pde_v[1] |= big_addr >> 28;
@@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
 static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
+	u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 
 	gk20a_dbg_info("pde pa=0x%llx", pdb_addr);
 
 	nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
-		nvgpu_aperture_mask(g, &vm->pdb.mem,
+		nvgpu_aperture_mask(g, vm->pdb.mem,
 			ram_in_page_dir_base_target_sys_mem_ncoh_f(),
 			ram_in_page_dir_base_target_vid_mem_f()) |
 		ram_in_page_dir_base_vol_true_f() |