author    Konsta Holtta <kholtta@nvidia.com>  2016-09-19 02:24:13 -0400
committer Deepak Nibade <dnibade@nvidia.com>  2016-12-27 04:56:49 -0500
commit    4afc6a1659ec058fd44953ccff7a1030275bcc92 (patch)
tree      a6953c8895f1ab917f3a5eca29f4568a6702749c /drivers/gpu/nvgpu/gp10b/mm_gp10b.c
parent    49c3fb25822565a9078961cdef1222aaa8c7e89a (diff)
gpu: nvgpu: compact pte buffers
The lowest page table level may hold very few entries for mappings of
large pages, but a new page is allocated for each list of entries at the
lowest level, wasting memory and performance. Compact these so that the
new "allocation" of ptes is appended at the end of the previous
allocation, if there is space.

Bug 1736604

Change-Id: I4c7c4cad9019de202325750aee6034076e7e61c2
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1222810
(cherry picked from commit 97303ecc946c17150496486a2f52bd481311dbf7)
Reviewed-on: http://git-master/r/1234995
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
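The change works by letting a lowest-level PTE block share the backing page of the previous block when that page still has room, instead of always allocating a fresh page. The diff below only switches this file's PDE/PTE accesses from raw gk20a_mem_wr32()/entry_addr() to the offset-aware gk20a_pde_addr()/gk20a_pde_wr32() helpers; the allocator-side compaction itself lives in the common gk20a MM code and is not shown here. A minimal sketch of the append-if-there-is-room idea, with hypothetical names (backing_page, pte_block, pte_block_alloc, pte_block_wr32) standing in for the real structures:

/*
 * Sketch only -- illustrates the "append the new pte allocation to the end
 * of the previous one if there is space" idea from the commit message.
 * All names here are hypothetical; the real helpers (gk20a_pde_addr(),
 * gk20a_pde_wr32()) are defined in the common gk20a MM code, not in this diff.
 */
#include <stdint.h>

struct backing_page {
	uint32_t *words;	/* one page worth of PTE memory */
	uint32_t size_words;	/* capacity in 32-bit words */
	uint32_t used_words;	/* words already handed out */
};

struct pte_block {
	struct backing_page *page;	/* backing page, possibly shared */
	uint32_t woffset;		/* word offset of this block in page */
};

/*
 * Carve a new PTE block out of an existing page if it still has room.
 * Returns -1 when the page is full, in which case the caller would fall
 * back to allocating a fresh page (as the old code always did).
 */
static int pte_block_alloc(struct backing_page *page, uint32_t num_words,
			   struct pte_block *out)
{
	if (page->used_words + num_words > page->size_words)
		return -1;

	out->page = page;
	out->woffset = page->used_words;
	page->used_words += num_words;
	return 0;
}

/*
 * Writes add the block's offset, mirroring how gk20a_pde_wr32() is assumed
 * to offset into a shared allocation instead of always writing at word 0.
 */
static void pte_block_wr32(struct pte_block *b, uint32_t w, uint32_t data)
{
	b->page->words[b->woffset + w] = data;
}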
Diffstat (limited to 'drivers/gpu/nvgpu/gp10b/mm_gp10b.c')
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c  39
1 file changed, 14 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index 03bab121..1e073ab2 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -151,18 +151,6 @@ static u32 pte3_from_index(u32 i)
 	return i * gmmu_new_pte__size_v() / sizeof(u32);
 }
 
-static u64 entry_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
-{
-	u64 addr;
-
-	if (g->mm.has_physical_mode)
-		addr = sg_phys(entry->mem.sgt->sgl);
-	else
-		addr = gk20a_mem_get_base_addr(g, &entry->mem, 0);
-
-	return addr;
-}
-
 static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
 		struct gk20a_mm_entry *parent,
 		u32 i, u32 gmmu_pgsz_idx,
@@ -176,15 +164,13 @@ static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	u64 pte_addr = 0;
-	u64 pde_addr = 0;
 	struct gk20a_mm_entry *pte = parent->entries + i;
 	u32 pde_v[2] = {0, 0};
 	u32 pde;
 
 	gk20a_dbg_fn("");
 
-	pte_addr = entry_addr(g, pte) >> gmmu_new_pde_address_shift_v();
-	pde_addr = entry_addr(g, parent);
+	pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v();
 
 	pde_v[0] |= gk20a_aperture_mask(g, &pte->mem,
 			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
@@ -194,8 +180,8 @@ static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
 	pde_v[1] |= pte_addr >> 24;
 	pde = pde3_from_index(i);
 
-	gk20a_mem_wr32(g, &parent->mem, pde + 0, pde_v[0]);
-	gk20a_mem_wr32(g, &parent->mem, pde + 1, pde_v[1]);
+	gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]);
+	gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]);
 
 	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
 		  i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
@@ -232,12 +218,12 @@ static int update_gmmu_pde0_locked(struct vm_gk20a *vm,
 	big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
 
 	if (small_valid) {
-		pte_addr_small = entry_addr(g, entry)
+		pte_addr_small = gk20a_pde_addr(g, entry)
 				>> gmmu_new_dual_pde_address_shift_v();
 	}
 
 	if (big_valid)
-		pte_addr_big = entry_addr(g, entry)
+		pte_addr_big = gk20a_pde_addr(g, entry)
 			>> gmmu_new_dual_pde_address_big_shift_v();
 
 	if (small_valid) {
@@ -260,10 +246,10 @@ static int update_gmmu_pde0_locked(struct vm_gk20a *vm,
 
 	pde = pde0_from_index(i);
 
-	gk20a_mem_wr32(g, &pte->mem, pde + 0, pde_v[0]);
-	gk20a_mem_wr32(g, &pte->mem, pde + 1, pde_v[1]);
-	gk20a_mem_wr32(g, &pte->mem, pde + 2, pde_v[2]);
-	gk20a_mem_wr32(g, &pte->mem, pde + 3, pde_v[3]);
+	gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]);
+	gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]);
+	gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]);
+	gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]);
 
 	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]",
 		  i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
@@ -286,6 +272,7 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
 	u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
 	u64 ctag_granularity = g->ops.fb.compression_page_size(g);
 	u32 pte_w[2] = {0, 0}; /* invalid pte */
+	u32 pte_i;
 
 	if (*iova) {
 		u32 pte_valid = unmapped_pte ?
@@ -331,8 +318,10 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
 		gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
 	}
 
-	gk20a_mem_wr32(g, &pte->mem, pte3_from_index(i) + 0, pte_w[0]);
-	gk20a_mem_wr32(g, &pte->mem, pte3_from_index(i) + 1, pte_w[1]);
+	pte_i = pte3_from_index(i);
+
+	gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]);
+	gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]);
 
 	if (*iova) {
 		*iova += page_size;